GitList

libclamav/jsparse/js-norm.c

fd08e02e	/* * Javascript normalizer. *
c442ca9c	* Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2008-2013 Sourcefire, Inc.
fd08e02e	* * Authors: Török Edvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */
8be1d5a4	#ifdef HAVE_CONFIG_H #include "clamav-config.h" #endif
fd08e02e	#include <stdio.h>
61eb9432	#ifdef HAVE_UNISTD_H
fd08e02e	#include <unistd.h>
61eb9432	#endif
fd08e02e	#include <fcntl.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <assert.h>
61eb9432
60d8d2c3	#include "clamav.h"
b3722aeb	#include "jsparse/lexglobal.h"
fd08e02e	#include "hashtab.h" #include "others.h"
bd02147f	#include "str.h"
fd08e02e	#include "js-norm.h" #include "jsparse/generated/operators.h" #include "jsparse/generated/keywords.h"
4a6ade44	#include "jsparse/textbuf.h"
fd08e02e	/* ----------- tokenizer ---------------- */ enum tokenizer_state { Initial, MultilineComment, SinglelineComment, Number, DoubleQString, SingleQString,
72733fba	Identifier, Dummy
fd08e02e	}; typedef struct scanner { struct text_buffer buf; const char yytext; size_t yylen; const char in; size_t insize; size_t pos;
72733fba	size_t lastpos; enum tokenizer_state state; enum tokenizer_state last_state;
fd08e02e	} yyscan_t; typedef int YY_BUFFER_STATE; static int yylex( YYSTYPE lvalp, yyscan_t ); static YY_BUFFER_STATE yy_scan_bytes( const char , size_t, yyscan_t scanner ); static const char yyget_text ( yyscan_t scanner ); static int yyget_leng ( yyscan_t scanner ); static int yylex_init ( yyscan_t * ptr_yy_globals ) ; static int yylex_destroy ( yyscan_t yyscanner ) ; /* ----------- tokenizer end ---------------- */ enum fsm_state { Base, InsideVar, InsideInitializer, WaitFunctionName, WaitParameterList, InsideFunctionDecl }; struct scope {
cc447ac8	struct cli_hashtable id_map;
fd08e02e	struct scope parent;/ hierarchy / struct scope nxt;/* all scopes kept in a list so we can easily free all of them / enum fsm_state fsm_state; int last_token; unsigned int brackets; unsigned int blocks; }; struct tokens { yystype data; size_t cnt; size_t capacity; }; /* state for the current JS file being parsed */ struct parser_state {
094104a7	unsigned long var_uniq; unsigned long syntax_errors;
fd08e02e	struct scope global; struct scope current; struct scope *list; yyscan_t scanner; struct tokens tokens;
fe389c84	unsigned int rec;
fd08e02e	}; static struct scope* scope_new(struct parser_state state) { struct scope parent = state->current; struct scope s = cli_calloc(1, sizeof(s)); if(!s) return NULL;
cc447ac8	if(cli_hashtab_init(&s->id_map, 10) < 0) {
fd08e02e	free(s); return NULL; } s->parent = parent; s->fsm_state = Base; s->nxt = state->list; state->list = s; state->current = s; return s; } static struct scope* scope_done(struct scope s) { struct scope parent = s->parent; /* TODO: have a hashtab_destroy */
cc447ac8	cli_hashtab_clear(&s->id_map);
fd08e02e	free(s->id_map.htable); free(s); return parent; } /* transitions: * Base --(VAR)--> InsideVar * InsideVar --(Identifier)-->InsideInitializer * InsideVar --(anything_else) --> POP (to Base) * InsideInitializer --(COMMA)--> POP (to InsideVar) * InsideInitializer \| InsideVar --(SEMICOLON) --> POP (to Base) * InsideInitializer --(BRACKET_OPEN) --> WaitBrClose * InsideInitializer --(PAR_OPEN) --> WaitParClose * WaitBrClose --(BRACKET_OPEN) --> increase depth * WaitBrClose --(BRACKET_CLOSE) --> POP * WaitParClose --(PAR_CLOSE) --> POP * WaitParClose --(PAR_OPEN) --> increase depth / / Base --(VAR)--> PUSH, to InsideVar * InsideVar --(Identifier)--> InsideInitializer * InsideVar --(ELSE)--> POP, inc. syntax_errors * InsideInitializer --(COMMA)--> POP (to InsideVar) * --(BRACKET_OPEN)--> inc bracket_counter * --(PAR_OPEN)--> inc par_counter * --(BRACKET_CLOSE) --> dec bracket_counter * --(PAR_CLOSE)--> dec par_counter * --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 \|\| par_counter != 0) * --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0 && par_counter == 0) * POP only allowed if bracket_counter == 0 && par_counter == 0 * * InsideInitializer acts differently, make it only a flag * .................... * * Pushing, Poping is done when entering / exiting function scopes, * tracking { and function ( is done by the function scope tracker too. * * we only need to track brackets. / / * var x = document; * x.writeln(...); * * ^we must not normalize member method names / / * Variables are declared at function scope, and their initial value is * undefined. At the point where the initializer is, and from there on the value * is defined. * * { doesn't introduce a new variable scope, they are in function's scope too * * function foo() { * alert(x); -> x exists, undefined * var x=5; * alert(x); -> x exists, =5 * } * * vs. * * function bar() { * alert(x);//error, x not declared * x=5; * } * * vs. * * but we can declare variables without var, only valid if we use them after * assigning. * * function foobar() { * x=5; * alert(x);//x is defined, value is 5 * } * * other examples: * function foo2() { * alert(x); -> x exists, undefined * { * var x=5; -> x equals to 5 * } * alert(x); -> x is 5 * } * * function foo3() { * var x=4; -> x exists, equals to 4 * alert(x); -> x exists, equals to 4 * { * var x=5; -> x equals to 5 * } * alert(x); -> x is 5 * } * * function bar3() { * //same as foo3 * var x=4; * alert(x); * { * x=5; * } * alert(x); * } * / static const char scope_declare(struct scope s, const char token, const size_t len, struct parser_state *state) {
cc447ac8	const struct cli_element el = cli_hashtab_insert(&s->id_map, token, len, state->var_uniq++); / cli_hashtab_insert either finds an already existing entry, or allocates a
fd08e02e	* new one, we return the allocated string / return el ? el->key : NULL; } static const char scope_use(struct scope s, const char token, const size_t len) {
cc447ac8	const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
fd08e02e	if(el) { /* identifier already found in current scope, * return here to avoid overwriting uniq id / return el->key; } / identifier not yet in current scope's hashtab, add with ID -1. * Later if we find a declaration it will automatically assign a uniq ID * to it. If not, we'll know that we have to push ID == -1 tokens to an * outer scope.*/
cc447ac8	el = cli_hashtab_insert(&s->id_map, token, len, -1);
fd08e02e	return el ? el->key : NULL; } static long scope_lookup(struct scope s, const char token, const size_t len) { while(s) {
cc447ac8	const struct cli_element *el = cli_hashtab_find(&s->id_map, token, len);
fd08e02e	if(el && el->data != -1) { return el->data; } /* not found in current scope, try in outer scope / s = s->parent; } return -1; } static int tokens_ensure_capacity(struct tokens tokens, size_t cap) { if(tokens->capacity < cap) {
b0033c8f	yystype *data;
377a2330	cap += 1024;
b0033c8f	/* Keep old data if OOM / data = cli_realloc(tokens->data, cap sizeof(*tokens->data)); if(!data)
fd08e02e	return CL_EMEM;
b0033c8f	tokens->data = data;
377a2330	tokens->capacity = cap;
fd08e02e	} return CL_SUCCESS; } static int add_token(struct parser_state state, const yystype token) {
b0033c8f	if(tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1))
fd08e02e	return -1; state->tokens.data[state->tokens.cnt++] = token; return 0; } struct buf { size_t pos; int outfd; char buf[65536]; }; static inline int buf_outc(char c, struct buf buf) { if(buf->pos >= sizeof(buf->buf)) { if(write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf))
871177cd	return CL_EWRITE;
fd08e02e	buf->pos = 0; } buf->buf[buf->pos++] = c; return CL_SUCCESS; } static inline int buf_outs(const char s, struct buf buf) { const size_t buf_len = sizeof(buf->buf);
6ba8b950	size_t i; i = buf->pos; while(s) { while(i < buf_len && s) {
2686c7ba	if(isspace(*s & 0xff))
1279faf6	buf->buf[i++] = ' '; else buf->buf[i++] = tolower((unsigned char)(*s)); ++s;
6ba8b950	} if(i == buf_len) { if(write(buf->outfd, buf->buf, buf_len) < 0)
871177cd	return CL_EWRITE;
6ba8b950	i = 0; }
fd08e02e	}
6ba8b950	buf->pos = i;
fd08e02e	return CL_SUCCESS; } static inline void output_space(char last, char current, struct buf out) { if(isalnum(last) && isalnum(current)) buf_outc(' ', out); } / return class of last character / static char output_token(const yystype token, struct scope scope, struct buf out, char lastchar) { char sbuf[128]; const char s = TOKEN_GET(token, cstring); / TODO: use a local buffer, instead of FILE* */ switch(token->type) { case TOK_StringLiteral: output_space(lastchar,'"', out); buf_outc('"', out); if(s) { buf_outs(s, out); } buf_outc('"', out); return '\"'; case TOK_NumericInt: output_space(lastchar,'0', out); snprintf(sbuf, sizeof(sbuf), "%ld", TOKEN_GET(token, ival)); buf_outs(sbuf, out); return '0'; case TOK_NumericFloat: output_space(lastchar,'0', out);
377a2330	snprintf(sbuf, sizeof(sbuf), "%g", TOKEN_GET(token, dval));
fd08e02e	buf_outs(sbuf, out); return '0'; case TOK_IDENTIFIER_NAME: output_space(lastchar,'a', out); if(s) { long id = scope_lookup(scope, s, strlen(s)); if(id == -1) { /* identifier not normalized / buf_outs(s, out); } else { snprintf(sbuf, sizeof(sbuf), "n%03ld",id); buf_outs(sbuf, out); } } return 'a'; case TOK_FUNCTION: output_space(lastchar,'a', out); buf_outs("function",out); return 'a'; default: if(s) { const size_t len = strlen(s); output_space(lastchar,s[0], out); buf_outs(s, out); return len ? s[len-1] : '\0'; } return '\0'; } } / * We can't delete the scope as soon as we see a }, because * we still need the hashmap from it. * * If we would normalize all the identifiers, and output when a scope is closed, * then it would be impossible to normalize calls to other functions. * * So we need to keep all scopes in memory, to do this instead of scope_done, we * simply just set current = current->parent when a scope is closed. * We keep a list of all scopes created in parser_state-> When we parsed * everything, we output everything, and then delete all scopes. * * We also need to know where to switch scopes on the second pass, so for * TOK_FUNCTION types we will use another pointer, that points to the scope * (added to yystype's union). * * We lookup the identifier in the scope (using scope_lookup, it looks in parent * scopes too), if ID is found then output (n%3d, Id), * otherwise output the identifier as is. * * To make it easier to match sigs, we do a xfrm : * 'function ID1 (..'. => 'n%3d = function (...' / / * we'll add all identifier to the scope's map * those that are not decl. will have initial ID -1 * if we later see a decl for it in same scope, it'll automatically get a * correct ID. * * When parsing of local scope is done, we take any ID -1 identifiers, * and push them up one level (careful not to overwrite existing IDs). * * it would be nice if the tokens would contain a link to the entry in the * hashtab, a link that automatically gets updated when the element is moved * (pushed up). This would prevent subsequent lookups in the map, * when we want to output the tokens. * There is no easy way to do that, so we just do another lookup * / / * This actually works, redefining foo: * function foo() { * var foo=5; alert(foo); * } * So we can't treat function names just as any other identifier? * We can, because you can no longer call foo, if you redefined it as a var. * So if we rename both foo-s with same name, it will have same behaviour. * * This means that a new scope should begin after function, and not after * function ... (. / static void scope_free_all(struct scope p) { struct scope *nxt; do { nxt = p->nxt; scope_done(p); p = nxt; } while(p); }
72fb25ea	size_t cli_strtokenize(char buffer, const char delim, const size_t token_count, const char *tokens);
fd08e02e	static int match_parameters(const yystype tokens, const char * param_names, size_t count) { size_t i,j=0; if(tokens[0].type != TOK_PAR_OPEN) return -1; i=1; while(count--) { const char token_val = TOKEN_GET(&tokens[i], cstring); if(tokens[i].type != TOK_IDENTIFIER_NAME \|\| !token_val \|\| strcmp(token_val, param_names[j++])) return -1; ++i; if((count && tokens[i].type != TOK_COMMA) \|\| (!count && tokens[i].type != TOK_PAR_CLOSE)) return -1; ++i; } return 0; } static const char de_packer_3[] = {"p","a","c","k","e","r"}; static const char de_packer_2[] = {"p","a","c","k","e","d"}; static inline char textbuffer_done(yyscan_t scanner) {
4b2ade1d	char *str = cli_realloc(scanner->buf.data, scanner->buf.pos); if(!str) { str = scanner->buf.data; } scanner->yytext = str; scanner->yylen = scanner->buf.pos - 1; memset(&scanner->buf, 0, sizeof(scanner->buf)); return str;
fd08e02e	} #define MODULE "JS-Norm: " static void free_token(yystype token) { if(token->vtype == vtype_string) { free(token->val.string); token->val.string = NULL; } } static int replace_token_range(struct tokens dst, size_t start, size_t end, const struct tokens *with) { const size_t len = with ? with->cnt : 0; size_t i;
8af6e5a2	cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n", (unsigned long)start, (unsigned long)end, (unsigned long)len);
fd08e02e	if(start >= dst->cnt \|\| end > dst->cnt) return -1; for(i=start;i<end;i++) { free_token(&dst->data[i]); }
4b2ade1d	if(tokens_ensure_capacity(dst, dst->cnt - (end-start) + len))
fd08e02e	return CL_EMEM; memmove(&dst->data[start+len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0])); if(with && len > 0) { memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0])); } dst->cnt = dst->cnt - (end-start) + len; return CL_SUCCESS; } static int append_tokens(struct tokens dst, const struct tokens src) { if(!dst \|\| !src) return CL_ENULLARG;
4b2ade1d	if(tokens_ensure_capacity(dst, dst->cnt + src->cnt))
fd08e02e	return CL_EMEM;
8af6e5a2	cli_dbgmsg(MODULE "Appending %lu tokens\n", (unsigned long)(src->cnt));
fd08e02e	memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0])); dst->cnt += src->cnt; return CL_SUCCESS; } static void decode_de(yystype params[], struct text_buffer txtbuf) { const char p = TOKEN_GET(params[0], cstring); const long a = TOKEN_GET(params[1], ival); /const char c = params[2];/ char k = TOKEN_GET(params[3], string); /const char r = params[5];/ unsigned val=0; unsigned nsplit = 0; const char* o; const char *tokens; memset(txtbuf, 0, sizeof(txtbuf)); if(!p \|\| !k ) return; for(o = k; o; o++) if(o == '\|') nsplit++; nsplit++; tokens = malloc(sizeof(char)nsplit); if(!tokens) { return; } cli_strtokenize(k,'\|',nsplit, tokens); do { while(p && !isalnum(p)) { if(p=='\\' && (p[1] == '\'' \|\| p[1] == '\"')) p++; else textbuffer_putc(txtbuf, p++); } if(!p) break; val = 0; o = p; while(p && isalnum(p)) { unsigned x; unsigned char v = p++; /* TODO: use a table here / if(v >= 'a') x = 10+v-'a'; else if(v >= 'A') x = 36+v-'A'; else x = v-'0'; val = vala+x; } if(val >= nsplit \|\| !tokens[val] \|\| !tokens[val][0]) while(o!=p) textbuffer_putc(txtbuf, o++); else textbuffer_append(txtbuf, tokens[val]); } while (p); free(tokens); textbuffer_append(txtbuf, "\0"); } struct decode_result { struct text_buffer txtbuf; size_t pos_begin; size_t pos_end; unsigned append:1; /* 0: tokens are replaced with new token(s), 1: old tokens are deleted, new ones appended at the end / }; static void handle_de(yystype tokens, size_t start, const size_t cnt, const char name, struct decode_result res) { /* find function decl. end / size_t i, nesting = 1, j; yystype parameters [6]; const size_t parameters_cnt = 6; for(i=start;i < cnt; i++) { if(tokens[i].type == TOK_FUNCTION) { if(TOKEN_GET(&tokens[i], scope)) nesting++; else nesting--; if(!nesting) break; } } if(nesting) return;
377a2330	memset(parameters, 0, sizeof(parameters));
fd08e02e	if(name) { /* find call to function / for(;i+2 < cnt; i++) { const char token_val = TOKEN_GET(&tokens[i], cstring); if(tokens[i].type == TOK_IDENTIFIER_NAME && token_val && !strcmp(name, token_val) && tokens[i+1].type == TOK_PAR_OPEN) { i += 2; for(j = 0;j < parameters_cnt && i < cnt;j++) { parameters[j] = &tokens[i++]; if(j != parameters_cnt-1) while (tokens[i].type != TOK_COMMA && i < cnt) i++; else while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++; i++; } if(j == parameters_cnt) decode_de(parameters, &res->txtbuf); } } } else { while(i<cnt && tokens[i].type != TOK_PAR_OPEN) i++; ++i; if(i >= cnt) return; /* TODO: move this v to another func */ for(j = 0;j < parameters_cnt && i < cnt;j++) { parameters[j] = &tokens[i++]; if(j != parameters_cnt-1) while (tokens[i].type != TOK_COMMA && i < cnt) i++; else while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++; i++; } if(j == parameters_cnt) decode_de(parameters, &res->txtbuf); }
377a2330	if(parameters[0] && parameters[parameters_cnt-1]) { res->pos_begin = parameters[0] - tokens; res->pos_end = parameters[parameters_cnt-1] - tokens + 1; if(tokens[res->pos_end].type == TOK_BRACKET_OPEN && tokens[res->pos_end+1].type == TOK_BRACKET_CLOSE && tokens[res->pos_end+2].type == TOK_PAR_CLOSE) res->pos_end += 3; /* {}) / else res->pos_end++; / ) */ }
fd08e02e	}
08402afa	static int handle_unescape(struct tokens *tokens, size_t start)
fd08e02e	{ if(tokens->data[start].type == TOK_StringLiteral) { char R; struct tokens new_tokens; yystype tok; R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring)); tok.type = TOK_StringLiteral; TOKEN_SET(&tok, string, R); new_tokens.capacity = new_tokens.cnt = 1; new_tokens.data = &tok; if(replace_token_range(tokens, start-2, start+2, &new_tokens) < 0) return CL_EMEM; } return CL_SUCCESS; } / scriptasylum dot com's JS encoder */
08402afa	static void handle_df(const yystype tokens, size_t start, struct decode_result res)
fd08e02e	{ char str, s1; size_t len, s1_len, i; unsigned char clast; char *R; if(tokens[start].type != TOK_StringLiteral) return; str = TOKEN_GET(&tokens[start], string); if(!str) return; len = strlen(str);
2e67cefc	if(!len) return;
fd08e02e	clast = str[len-1] - '0'; str[len-1] = '\0'; s1 = cli_unescape(str); s1_len = strlen(s1); for(i=0;i<s1_len;i++) { s1[i] -= clast; } R = cli_unescape(s1); free(s1); res->pos_begin = start-2; res->pos_end = start+2; res->txtbuf.data = R; res->txtbuf.pos = strlen(R); res->append = 1; } static void handle_eval(struct tokens tokens, size_t start, struct decode_result res) { res->txtbuf.data = TOKEN_GET(&tokens->data[start], string); if(res->txtbuf.data && tokens->data[start+1].type == TOK_PAR_CLOSE) { TOKEN_SET(&tokens->data[start], string, NULL); res->txtbuf.pos = strlen(res->txtbuf.data); res->pos_begin = start-2; res->pos_end = start+2; } } static void run_folders(struct tokens tokens) { size_t i; for(i = 0; i < tokens->cnt; i++) { const char cstring = TOKEN_GET(&tokens->data[i], cstring); if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("unescape", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
08402afa	handle_unescape(tokens, i+2);
fd08e02e	} } } static inline int state_update_scope(struct parser_state state, const yystype token) { if(token->type == TOK_FUNCTION) { struct scope scope = TOKEN_GET(token, scope); if(scope) { state->current = scope; } else { / dummy token marking function end / if(state->current->parent) state->current = state->current->parent; / don't output this token, it is just a dummy marker / return 0; } } return 1; } static void run_decoders(struct parser_state state) { size_t i; const char* name; struct tokens tokens = &state->tokens; for(i = 0; i < tokens->cnt; i++) { const char cstring = TOKEN_GET(&tokens->data[i], cstring); struct decode_result res; res.pos_begin = res.pos_end = 0; res.append = 0; if(tokens->data[i].type == TOK_FUNCTION && i+13 < tokens->cnt) { name = NULL; ++i; if(tokens->data[i].type == TOK_IDENTIFIER_NAME) {
377a2330	cstring = TOKEN_GET(&tokens->data[i], cstring);
fd08e02e	name = cstring; ++i; } if(match_parameters(&tokens->data[i], de_packer_3, sizeof(de_packer_3)/sizeof(de_packer_3[0])) != -1 \|\| match_parameters(&tokens->data[i], de_packer_2, sizeof(de_packer_2)/sizeof(de_packer_2[0])) != -1) { /* find function decl. end / handle_de(tokens->data, i, tokens->cnt, name, &res); } } else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("dF", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) { / TODO: also match signature of dF function (possibly * declared using unescape */
08402afa	handle_df(tokens->data, i+2, &res);
fd08e02e	} else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("eval", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) { handle_eval(tokens, i+2, &res); } if(res.pos_end > res.pos_begin) { struct tokens parent_tokens; if(res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON) res.pos_end++; parent_tokens = state->tokens;/* save current tokens / / initialize embedded context */ memset(&state->tokens, 0, sizeof(state->tokens));
d39b5281	if(++state->rec > 16)
1405207a	cli_dbgmsg(MODULE "recursion limit reached\n");
d39b5281	else { cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos); --state->rec; }
fd08e02e	free(res.txtbuf.data); /* state->tokens still refers to the embedded/nested context * here / if(!res.append) { replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens); } else { / delete tokens / replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL); append_tokens(&parent_tokens, &state->tokens); } / end of embedded context, restore tokens state / free(state->tokens.data); state->tokens = parent_tokens; } state_update_scope(state, &state->tokens.data[i]); } } void cli_js_parse_done(struct parser_state state) {
a66b62f8	struct tokens * tokens = &state->tokens; size_t par_balance = 0, i; char end = '\0'; YYSTYPE val;
8be1d5a4	cli_dbgmsg(MODULE "in cli_js_parse_done()\n");
a66b62f8	/* close unfinished token */ switch (state->scanner->state) { case DoubleQString: end = '"'; break; case SingleQString: end = '\''; break;
b9b47784	default: /* make gcc happy */ break;
a66b62f8	} if (end != '\0') cli_js_process_buffer(state, &end, 1);
22cb38ed	/* close remaining parenthesis */
a66b62f8	for (i=0;i<tokens->cnt;i++) { if (tokens->data[i].type == TOK_PAR_OPEN) par_balance++; else if (tokens->data[i].type == TOK_PAR_CLOSE && par_balance > 0) par_balance--; } if (par_balance > 0) { memset(&val, 0, sizeof(val)); val.type = TOK_PAR_CLOSE; TOKEN_SET(&val, cstring, ")"); while (par_balance-- > 0) { add_token(state, &val); } }
22cb38ed	/* we had to close unfinished strings, parenthesis,
a66b62f8	* so that the folders/decoders can run properly */
fd08e02e	run_folders(&state->tokens); run_decoders(state); yylex_destroy(state->scanner);
377a2330	state->scanner = NULL;
fd08e02e	}
8be1d5a4	void cli_js_output(struct parser_state state, const char tempdir)
fd08e02e	{ unsigned i; struct buf buf; char lastchar = '\0';
8be1d5a4	char filename[1024];
58481352	snprintf(filename, 1024, "%s"PATHSEP"javascript", tempdir);
8be1d5a4
fd08e02e	buf.pos = 0;
8be1d5a4	buf.outfd = open(filename, O_CREAT \| O_WRONLY, 0600); if(buf.outfd < 0) { cli_errmsg(MODULE "cannot open output file for writing: %s\n", filename); return; } /* append to file / if(lseek(buf.outfd, 0, SEEK_END) != 0) { / separate multiple scripts with \n */ buf_outc('\n', &buf); }
1279faf6	buf_outs("<script>", &buf);
fd08e02e	state->current = state->global; for(i = 0; i < state->tokens.cnt; i++) { if(state_update_scope(state, &state->tokens.data[i])) lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar); }
1279faf6	/* add /script if not already there */ if(buf.pos < 9 \|\| memcmp(buf.buf + buf.pos - 9, "</script>", 9)) buf_outs("</script>", &buf);
fd08e02e	if(write(buf.outfd, buf.buf, buf.pos) < 0) {
1405207a	cli_dbgmsg(MODULE "I/O error\n");
fd08e02e	}
8be1d5a4	close(buf.outfd); cli_dbgmsg(MODULE "dumped/appended normalized script to: %s\n",filename);
fd08e02e	} void cli_js_destroy(struct parser_state *state) { size_t i;
377a2330	if(!state) return;
fd08e02e	scope_free_all(state->list); for(i=0;i<state->tokens.cnt;i++) { free_token(&state->tokens.data[i]); } free(state->tokens.data);
8be1d5a4	/* detect use after free */
377a2330	if(state->scanner) yylex_destroy(state->scanner);
8be1d5a4	memset(state, 0x55, sizeof(*state)); free(state); cli_dbgmsg(MODULE "cli_js_destroy() done\n");
fd08e02e	} /* buffer is html-normlike "chunk", if original file is bigger than buffer, * we rewind to a space, so we'll know that tokens won't be broken in half at * the end of a buffer. All tokens except string-literals of course. * So we can assume that after the buffer there is either a space, EOF, or a * chunk of text not containing whitespace at all (for which we care only if its * a stringliteral)/ void cli_js_process_buffer(struct parser_state state, const char buf, size_t n) { struct scope current = state->current; YYSTYPE val; int yv; YY_BUFFER_STATE yyb; if(!state->global) { /* this state has either not been initialized, * or cli_js_parse_done() was already called on it */
1405207a	cli_warnmsg(MODULE "invalid state\n");
fd08e02e	return; } yyb = yy_scan_bytes(buf, n, state->scanner); memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; /* on EOF yylex will return 0 / while( (yv=yylex(&val, state->scanner)) != 0) { const char text; size_t leng; val.type = yv; switch(yv) { case TOK_VAR: current->fsm_state = InsideVar; break; case TOK_IDENTIFIER_NAME: text = yyget_text(state->scanner); leng = yyget_leng(state->scanner); if(current->last_token == TOK_DOT) { /* this is a member name, don't normalize / TOKEN_SET(&val, string, cli_strdup(text)); val.type = TOK_UNNORM_IDENTIFIER; } else { switch(current->fsm_state) { case WaitParameterList: state->syntax_errors++; / fall through / case Base: case InsideInitializer: TOKEN_SET(&val, cstring, scope_use(current, text, leng)); break; case InsideVar: case InsideFunctionDecl: TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state)); current->fsm_state = InsideInitializer; current->brackets = 0; break; case WaitFunctionName: TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state)); current->fsm_state = WaitParameterList; break; } } break; case TOK_PAR_OPEN: switch(current->fsm_state) { case WaitFunctionName: / fallthrough / case WaitParameterList: current->fsm_state = InsideFunctionDecl; break; default: / noop / break; } break; case TOK_PAR_CLOSE: switch(current->fsm_state) { case WaitFunctionName: state->syntax_errors++; break; case WaitParameterList: current->fsm_state = Base; break; default: / noop / break; } break; case TOK_CURLY_BRACE_OPEN: switch(current->fsm_state) { case WaitFunctionName: / fallthrough / case WaitParameterList: case InsideFunctionDecl: / in a syntactically correct * file, we would already be in * the Base state when we see a { / current->fsm_state = Base; / fall-through / case InsideVar: case InsideInitializer: state->syntax_errors++; / fall-through / case Base: default: current->blocks++; break; } break; case TOK_CURLY_BRACE_CLOSE: if(current->blocks > 0) current->blocks--; else state->syntax_errors++; if(!current->blocks) { if(current->parent) { / add dummy FUNCTION token to * mark function end / TOKEN_SET(&val, cstring, "}"); add_token(state, &val); TOKEN_SET(&val, scope, NULL); val.type = TOK_FUNCTION; state->current = current = current->parent; } else{ / extra } / state->syntax_errors++; } } break; case TOK_BRACKET_OPEN: current->brackets++; break; case TOK_BRACKET_CLOSE: if(current->brackets > 0) current->brackets--; else state->syntax_errors++; break; case TOK_COMMA: if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) { / initializer ended only if we * encountered a comma, and [] are * balanced. * This avoids switching state on: * var x = [4,y,u];/ current->fsm_state = InsideVar; } break; case TOK_SEMICOLON: if (current->brackets == 0 && current->blocks == 0) { / avoid switching state on unbalanced []: * var x = [test;testi]; */ current->fsm_state = Base; } break; case TOK_FUNCTION: current = scope_new(state); current->fsm_state = WaitFunctionName; TOKEN_SET(&val, scope, state->current); break; case TOK_StringLiteral:
8af6e5a2	if(state->tokens.cnt > 1 && state->tokens.data[state->tokens.cnt-1].type == TOK_PLUS) {
fd08e02e	/* see if can fold / yystype prev_string = &state->tokens.data[state->tokens.cnt-2]; if(prev_string->type == TOK_StringLiteral) { char str = TOKEN_GET(prev_string, string); size_t str_len = strlen(str); text = yyget_text(state->scanner); leng = yyget_leng(state->scanner); / delete TOK_PLUS */ free_token(&state->tokens.data[--state->tokens.cnt]); str = cli_realloc(str, str_len + leng + 1);
b0033c8f	if (!str) break;
377a2330	strncpy(str+str_len, text, leng);
fd08e02e	str[str_len + leng] = '\0'; TOKEN_SET(prev_string, string, str); free(val.val.string); memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; continue; } } break; } if(val.vtype == vtype_undefined) { text = yyget_text(state->scanner); TOKEN_SET(&val, string, cli_strdup(text)); abort(); } add_token(state, &val); current->last_token = yv; memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; } }
8be1d5a4	struct parser_state *cli_js_init(void)
fd08e02e	{
8be1d5a4	struct parser_state state = cli_calloc(1, sizeof(state));
fd08e02e	if(!state)
8be1d5a4	return NULL;
fd08e02e	if(!scope_new(state)) {
8be1d5a4	free(state); return NULL;
fd08e02e	} state->global = state->current; if(yylex_init(&state->scanner)) { scope_done(state->global);
8be1d5a4	free(state); return NULL;
fd08e02e	}
8be1d5a4	cli_dbgmsg(MODULE "cli_js_init() done\n"); return state;
fd08e02e	} /-------------- tokenizer ---------------------/ enum char_class { Whitespace, Slash, Operator, DQuote, SQuote, Digit, IdStart, BracketOpen = TOK_BRACKET_OPEN, BracketClose = TOK_BRACKET_CLOSE, Comma = TOK_COMMA, CurlyOpen = TOK_CURLY_BRACE_OPEN, CurlyClose = TOK_CURLY_BRACE_CLOSE, ParOpen = TOK_PAR_OPEN, ParClose = TOK_PAR_CLOSE, Dot = TOK_DOT, SemiColon = TOK_SEMICOLON, Nop }; #define SL Slash #define DG Digit #define DQ DQuote #define SQ SQuote #define ID IdStart #define OP Operator #define WS Whitespace #define BO BracketOpen #define BC BracketClose #define CM Comma #define CO CurlyOpen #define CC CurlyClose #define PO ParOpen #define PC ParClose #define DT Dot #define SC SemiColon #define NA Nop static const enum char_class ctype[256] = { NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL, DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA }; static const enum char_class id_ctype[256] = { NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, }; #define CASE_SPECIAL_CHAR(C, S) case C: TOKEN_SET(lvalp, cstring, (S)); return cClass; #define BUF_KEEP_SIZE 32768 static void textbuf_clean(struct text_buffer *buf) { if(buf->capacity > BUF_KEEP_SIZE) {
4b2ade1d	char *data= cli_realloc(buf->data, BUF_KEEP_SIZE);
b0033c8f	if (data) buf->data = data;
fd08e02e	buf->capacity = BUF_KEEP_SIZE; } buf->pos = 0; } static inline int parseString(YYSTYPE lvalp, yyscan_t scanner, const char q, enum tokenizer_state tostate) { size_t len; / look for " terminating the string / const char start = &scanner->in[scanner->pos], *end = start; do { const size_t siz = &scanner->in[scanner->insize] - end; end = memchr(end, q, siz);
69df34a0	if(end && end > start && end[-1] == '\\') {
fd08e02e	++end; continue; } break; } while (1);
377a2330	if(end && end >= start)
eb290151	len = end - start; else len = scanner->insize - scanner->pos;
37e64729	cli_textbuffer_append_normalize(&scanner->buf, start, len);
fd08e02e	if(end) {
b0033c8f	char *str;
fd08e02e	/* skip over end quote */ scanner->pos += len + 1; textbuffer_putc(&scanner->buf, '\0');
b0033c8f	str = textbuffer_done(scanner); if (str) { TOKEN_SET(lvalp, string, str); } else { TOKEN_SET(lvalp, cstring, ""); }
fd08e02e	scanner->state = Initial; assert(lvalp->val.string); return TOK_StringLiteral; } else { scanner->pos += len; /* unfinished string / scanner->state = tostate; return 0; } } static inline int parseDQString(YYSTYPE lvalp, yyscan_t scanner) { return parseString(lvalp, scanner, '"', DoubleQString); } static inline int parseSQString(YYSTYPE lvalp, yyscan_t scanner) { return parseString(lvalp, scanner, '\'', SingleQString); } static inline int parseNumber(YYSTYPE lvalp, yyscan_t scanner) { const unsigned char in = (const unsigned char)scanner->in; int is_float = 0; while(scanner->pos < scanner->insize) { unsigned char c = in[scanner->pos++]; if(isdigit(c)) { textbuffer_putc(&scanner->buf, c); continue; } if(c =='.' && !is_float) { is_float = 1; textbuffer_putc(&scanner->buf, '.'); continue; } if((c=='e' \|\| c=='E') && is_float) { textbuffer_putc(&scanner->buf, c); if(scanner->pos < scanner->insize) { c = in[scanner->pos++]; if(c == '+' \|\| c == '-' \|\| isdigit(c)) { textbuffer_putc(&scanner->buf, c); continue; } } } scanner->pos--; textbuffer_putc(&scanner->buf, '\0'); scanner->state = Initial;
4b2ade1d	if (!scanner->buf.data) return 0;
fd08e02e	if(is_float) { TOKEN_SET(lvalp, dval, atof(scanner->buf.data)); return TOK_NumericFloat; } else { TOKEN_SET(lvalp, ival, atoi(scanner->buf.data)); return TOK_NumericInt; } } scanner->state = Number; return 0; } static inline int parseId(YYSTYPE lvalp, yyscan_t scanner) { const struct keyword kw; const unsigned char in = (const unsigned char)scanner->in; scanner->state = Initial; while(scanner->pos < scanner->insize) { unsigned char c = in[scanner->pos++]; enum char_class cClass = id_ctype[c]; switch(cClass) { case IdStart: textbuffer_putc(&scanner->buf, c); break; case Operator: /* the table contains OP only for \ */ assert(c == '\\'); if(scanner->pos < scanner->insize && in[scanner->pos++] == 'u') { textbuffer_putc(&scanner->buf, c); break; }
72733fba	if(scanner->pos == scanner->insize) { scanner->pos++; }
fd08e02e	/* else fallthrough / default: / character is no longer part of identifier */
72733fba	scanner->state = Initial;
fd08e02e	textbuffer_putc(&scanner->buf, '\0'); scanner->pos--; kw = in_word_set(scanner->buf.data, scanner->buf.pos-1); if(kw) { /* we got a keyword / TOKEN_SET(lvalp, cstring, kw->name); return kw->val; } / it is not a keyword, just an identifier / TOKEN_SET(lvalp, cstring, NULL); return TOK_IDENTIFIER_NAME; } } scanner->state = Identifier; return 0; } static int parseOperator(YYSTYPE lvalp, yyscan_t scanner) { size_t len = MIN(5, scanner->insize - scanner->pos); while(len) { const struct operator *kw = in_op_set(&scanner->in[scanner->pos], len); if(kw) { TOKEN_SET(lvalp, cstring, kw->name); scanner->pos += len; return kw->val; } len--; }
377a2330	/* never reached */ assert(0);
fd08e02e	scanner->pos++; TOKEN_SET(lvalp, cstring, NULL); return TOK_ERROR; } static int yylex_init(yyscan_t scanner) { scanner = cli_calloc(1, sizeof(*scanner)); return scanner ? 0 : -1; } static int yylex_destroy(yyscan_t scanner) { free(scanner->buf.data); free(scanner); return 0; } static int yy_scan_bytes(const char *p, size_t len, yyscan_t scanner) { scanner->in = p; scanner->insize = len; scanner->pos = 0;
72733fba	scanner->lastpos = -1; scanner->last_state = Dummy;
fd08e02e	return 0; } static const char *yyget_text(yyscan_t scanner) {
4b2ade1d	return scanner->yytext ? scanner->yytext : scanner->buf.data;
fd08e02e	} static int yyget_leng(yyscan_t scanner) { /* we have a \0 too */
4b2ade1d	return scanner->yylen ? scanner->yylen: (scanner->buf.pos > 0 ? scanner->buf.pos - 1 : 0);
fd08e02e	} static int yylex(YYSTYPE lvalp, yyscan_t scanner) { const size_t len = scanner->insize; const unsigned char in = (const unsigned char*)scanner->in; unsigned char lookahead; enum char_class cClass; scanner->yytext = NULL; scanner->yylen = 0;
72733fba	if(scanner->pos == scanner->lastpos) { if(scanner->last_state == scanner->state) { cli_dbgmsg(MODULE "infloop detected, skipping character\n"); scanner->pos++; } /* its not necesarely an infloop if it changed * state, and it shouldn't infloop between states */ } scanner->lastpos = scanner->pos; scanner->last_state = scanner->state;
fd08e02e	while(scanner->pos < scanner->insize) { switch(scanner->state) { case Initial: textbuf_clean(&scanner->buf); cClass = ctype[in[scanner->pos++]]; switch(cClass) { case Whitespace: /* eat whitespace / continue; case Slash: if(scanner->pos < len) { lookahead = in[scanner->pos]; switch(lookahead) { case '': scanner->state = MultilineComment; scanner->pos++; continue; case '/': scanner->state = SinglelineComment; scanner->pos++; continue; } } --scanner->pos; return parseOperator(lvalp, scanner); case Operator: --scanner->pos; return parseOperator(lvalp, scanner); case DQuote: return parseDQString(lvalp, scanner); case SQuote: return parseSQString(lvalp, scanner); case Digit: --scanner->pos; return parseNumber(lvalp, scanner); case IdStart: --scanner->pos; return parseId(lvalp,scanner); CASE_SPECIAL_CHAR(BracketOpen, "["); CASE_SPECIAL_CHAR(BracketClose, "]"); CASE_SPECIAL_CHAR(Comma, ","); CASE_SPECIAL_CHAR(CurlyOpen, "{"); CASE_SPECIAL_CHAR(CurlyClose, "}"); CASE_SPECIAL_CHAR(ParOpen, "("); CASE_SPECIAL_CHAR(ParClose, ")"); CASE_SPECIAL_CHAR(Dot, "."); CASE_SPECIAL_CHAR(SemiColon, ";"); case Nop: continue; } break; case DoubleQString: return parseString(lvalp, scanner, '"', DoubleQString); case SingleQString:
377a2330	return parseString(lvalp, scanner, '\'', SingleQString);
fd08e02e	case Identifier: return parseId(lvalp, scanner); case MultilineComment: while(scanner->pos+1 < scanner->insize) {
377a2330	if(in[scanner->pos] == '*' && in[scanner->pos+1] == '/') { scanner->state = Initial; scanner->pos++;
fd08e02e	break;
377a2330	}
fd08e02e	scanner->pos++; }
377a2330	scanner->pos++;
fd08e02e	break; case Number: return parseNumber(lvalp, scanner); case SinglelineComment: while(scanner->pos < scanner->insize) {
086ad393	/* htmlnorm converts \n to space, so * stop on space too */ if(in[scanner->pos] == '\n' \|\| in[scanner->pos] == ' ')
fd08e02e	break; scanner->pos++; } scanner->state = Initial; break;
b9b47784	default: assert(0 && "Not reached");
fd08e02e	} } return 0; }