GitList

libclamav/jsparse/js-norm.c

/*
 *  Javascript normalizer.
 *
 *  Copyright (C) 2013-2021 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *  Copyright (C) 2008-2013 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */
8be1d5a4	#ifdef HAVE_CONFIG_H #include "clamav-config.h" #endif
fd08e02e	#include <stdio.h>
61eb9432	#ifdef HAVE_UNISTD_H
fd08e02e	#include <unistd.h>
61eb9432	#endif
fd08e02e	#include <fcntl.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <assert.h>
61eb9432
60d8d2c3	#include "clamav.h"
b3722aeb	#include "jsparse/lexglobal.h"
fd08e02e	#include "hashtab.h" #include "others.h"
bd02147f	#include "str.h"
fd08e02e	#include "js-norm.h" #include "jsparse/generated/operators.h" #include "jsparse/generated/keywords.h"
4a6ade44	#include "jsparse/textbuf.h"
fd08e02e	/* ----------- tokenizer ---------------- */ enum tokenizer_state {
72fd33c8	Initial, MultilineComment, SinglelineComment, Number, DoubleQString, SingleQString, Identifier, Dummy
fd08e02e	}; typedef struct scanner {
72fd33c8	struct text_buffer buf; const char yytext; size_t yylen; const char in; size_t insize; size_t pos; size_t lastpos; enum tokenizer_state state; enum tokenizer_state last_state; } * yyscan_t;
fd08e02e	typedef int YY_BUFFER_STATE;
72fd33c8	static int yylex(YYSTYPE lvalp, yyscan_t); static YY_BUFFER_STATE yy_scan_bytes(const char , size_t, yyscan_t scanner); static const char yyget_text(yyscan_t scanner); static int yyget_leng(yyscan_t scanner); static int yylex_init(yyscan_t ptr_yy_globals); static int yylex_destroy(yyscan_t yyscanner);
fd08e02e	/* ----------- tokenizer end ---------------- */ enum fsm_state {
72fd33c8	Base, InsideVar, InsideInitializer, WaitFunctionName, WaitParameterList, InsideFunctionDecl
fd08e02e	}; struct scope {
72fd33c8	struct cli_hashtable id_map; struct scope parent; / hierarchy / struct scope nxt; /* all scopes kept in a list so we can easily free all of them */ enum fsm_state fsm_state; int last_token; unsigned int brackets; unsigned int blocks;
fd08e02e	}; struct tokens {
72fd33c8	yystype *data; size_t cnt; size_t capacity;
fd08e02e	}; /* state for the current JS file being parsed */ struct parser_state {
72fd33c8	unsigned long var_uniq; unsigned long syntax_errors; struct scope global; struct scope current; struct scope *list; yyscan_t scanner; struct tokens tokens; unsigned int rec;
fd08e02e	};
72fd33c8	static struct scope scope_new(struct parser_state state)
fd08e02e	{
72fd33c8	struct scope parent = state->current; struct scope s = cli_calloc(1, sizeof(*s)); if (!s) return NULL; if (cli_hashtab_init(&s->id_map, 10) < 0) { free(s); return NULL; } s->parent = parent; s->fsm_state = Base; s->nxt = state->list; state->list = s; state->current = s; return s;
fd08e02e	}
72fd33c8	static struct scope scope_done(struct scope s)
fd08e02e	{
72fd33c8	struct scope parent = s->parent; / TODO: have a hashtab_destroy */ cli_hashtab_clear(&s->id_map); free(s->id_map.htable); free(s); return parent;
fd08e02e	} /* transitions: * Base --(VAR)--> InsideVar * InsideVar --(Identifier)-->InsideInitializer * InsideVar --(anything_else) --> POP (to Base) * InsideInitializer --(COMMA)--> POP (to InsideVar) * InsideInitializer \| InsideVar --(SEMICOLON) --> POP (to Base) * InsideInitializer --(BRACKET_OPEN) --> WaitBrClose * InsideInitializer --(PAR_OPEN) --> WaitParClose * WaitBrClose --(BRACKET_OPEN) --> increase depth * WaitBrClose --(BRACKET_CLOSE) --> POP * WaitParClose --(PAR_CLOSE) --> POP * WaitParClose --(PAR_OPEN) --> increase depth / / Base --(VAR)--> PUSH, to InsideVar * InsideVar --(Identifier)--> InsideInitializer * InsideVar --(ELSE)--> POP, inc. syntax_errors * InsideInitializer --(COMMA)--> POP (to InsideVar) * --(BRACKET_OPEN)--> inc bracket_counter * --(PAR_OPEN)--> inc par_counter * --(BRACKET_CLOSE) --> dec bracket_counter * --(PAR_CLOSE)--> dec par_counter * --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 \|\| par_counter != 0) * --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0 && par_counter == 0)
0c555069	* POP only allowed if bracket_counter == 0 && par_counter == 0
fd08e02e	* * InsideInitializer acts differently, make it only a flag * .................... * * Pushing, Poping is done when entering / exiting function scopes, * tracking { and function ( is done by the function scope tracker too. * * we only need to track brackets. / / * var x = document; * x.writeln(...); * * ^we must not normalize member method names / / * Variables are declared at function scope, and their initial value is * undefined. At the point where the initializer is, and from there on the value * is defined. * * { doesn't introduce a new variable scope, they are in function's scope too * * function foo() { * alert(x); -> x exists, undefined
0c555069	* var x=5;
fd08e02e	* alert(x); -> x exists, =5 * }
0c555069	*
fd08e02e	* vs. * * function bar() { * alert(x);//error, x not declared * x=5; * } * * vs. * * but we can declare variables without var, only valid if we use them after * assigning. * * function foobar() { * x=5; * alert(x);//x is defined, value is 5 * } * * other examples: * function foo2() { * alert(x); -> x exists, undefined * { * var x=5; -> x equals to 5 * } * alert(x); -> x is 5 * } * * function foo3() { * var x=4; -> x exists, equals to 4 * alert(x); -> x exists, equals to 4 * { * var x=5; -> x equals to 5 * } * alert(x); -> x is 5 * } * * function bar3() { * //same as foo3 * var x=4; * alert(x);
0c555069	* {
fd08e02e	* x=5; * } * alert(x); * } * */
72fd33c8	static const char scope_declare(struct scope s, const char token, const size_t len, struct parser_state state)
fd08e02e	{
72fd33c8	const struct cli_element el = cli_hashtab_insert(&s->id_map, token, len, state->var_uniq++); / cli_hashtab_insert either finds an already existing entry, or allocates a
fd08e02e	* new one, we return the allocated string */
72fd33c8	return el ? el->key : NULL;
fd08e02e	}
72fd33c8	static const char scope_use(struct scope s, const char *token, const size_t len)
fd08e02e	{
72fd33c8	const struct cli_element el = cli_hashtab_find(&s->id_map, token, len); if (el) { / identifier already found in current scope,
fd08e02e	* return here to avoid overwriting uniq id */
72fd33c8	return el->key; } /* identifier not yet in current scope's hashtab, add with ID -1.
fd08e02e	* Later if we find a declaration it will automatically assign a uniq ID * to it. If not, we'll know that we have to push ID == -1 tokens to an * outer scope.*/
72fd33c8	el = cli_hashtab_insert(&s->id_map, token, len, -1); return el ? el->key : NULL;
fd08e02e	} static long scope_lookup(struct scope s, const char token, const size_t len) {
72fd33c8	while (s) { const struct cli_element el = cli_hashtab_find(&s->id_map, token, len); if (el && el->data != -1) { return el->data; } / not found in current scope, try in outer scope */ s = s->parent; } return -1;
fd08e02e	}
0c555069	static cl_error_t tokens_ensure_capacity(struct tokens *tokens, size_t cap)
fd08e02e	{
72fd33c8	if (tokens->capacity < cap) { yystype data; cap += 1024; / Keep old data if OOM / data = cli_realloc(tokens->data, cap sizeof(*tokens->data)); if (!data) return CL_EMEM; tokens->data = data; tokens->capacity = cap; } return CL_SUCCESS;
fd08e02e	} static int add_token(struct parser_state state, const yystype token) {
72fd33c8	if (tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1)) return -1; state->tokens.data[state->tokens.cnt++] = *token; return 0;
fd08e02e	} struct buf {
72fd33c8	size_t pos; int outfd; char buf[65536];
fd08e02e	};
0c555069	static inline cl_error_t buf_outc(char c, struct buf *buf)
fd08e02e	{
72fd33c8	if (buf->pos >= sizeof(buf->buf)) { if (write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf)) return CL_EWRITE; buf->pos = 0; } buf->buf[buf->pos++] = c; return CL_SUCCESS;
fd08e02e	}
0c555069	static inline cl_error_t buf_outs(const char s, struct buf buf)
fd08e02e	{
72fd33c8	const size_t buf_len = sizeof(buf->buf); size_t i; i = buf->pos; while (s) { while (i < buf_len && s) { if (isspace(s & 0xff)) buf->buf[i++] = ' '; else buf->buf[i++] = tolower((unsigned char)(s)); ++s; } if (i == buf_len) { if (write(buf->outfd, buf->buf, buf_len) < 0) return CL_EWRITE; i = 0; } } buf->pos = i; return CL_SUCCESS;
fd08e02e	} static inline void output_space(char last, char current, struct buf *out) {
72fd33c8	if (isalnum(last) && isalnum(current)) buf_outc(' ', out);
fd08e02e	} /* return class of last character / static char output_token(const yystype token, struct scope scope, struct buf out, char lastchar) {
72fd33c8	char sbuf[128]; const char s = TOKEN_GET(token, cstring); / TODO: use a local buffer, instead of FILE* / switch (token->type) { case TOK_StringLiteral: output_space(lastchar, '"', out); buf_outc('"', out); if (s) { buf_outs(s, out); } buf_outc('"', out); return '\"'; case TOK_NumericInt: output_space(lastchar, '0', out); snprintf(sbuf, sizeof(sbuf), "%ld", TOKEN_GET(token, ival)); buf_outs(sbuf, out); return '0'; case TOK_NumericFloat: output_space(lastchar, '0', out); snprintf(sbuf, sizeof(sbuf), "%g", TOKEN_GET(token, dval)); buf_outs(sbuf, out); return '0'; case TOK_IDENTIFIER_NAME: output_space(lastchar, 'a', out); if (s) { long id = scope_lookup(scope, s, strlen(s)); if (id == -1) { / identifier not normalized */ buf_outs(s, out); } else { snprintf(sbuf, sizeof(sbuf), "n%03ld", id); buf_outs(sbuf, out); } } return 'a'; case TOK_FUNCTION: output_space(lastchar, 'a', out); buf_outs("function", out); return 'a'; default: if (s) { const size_t len = strlen(s); output_space(lastchar, s[0], out); buf_outs(s, out); return len ? s[len - 1] : '\0'; } return '\0'; }
fd08e02e	} /* * We can't delete the scope as soon as we see a }, because * we still need the hashmap from it. * * If we would normalize all the identifiers, and output when a scope is closed, * then it would be impossible to normalize calls to other functions. * * So we need to keep all scopes in memory, to do this instead of scope_done, we * simply just set current = current->parent when a scope is closed. * We keep a list of all scopes created in parser_state-> When we parsed * everything, we output everything, and then delete all scopes. * * We also need to know where to switch scopes on the second pass, so for * TOK_FUNCTION types we will use another pointer, that points to the scope * (added to yystype's union). * * We lookup the identifier in the scope (using scope_lookup, it looks in parent * scopes too), if ID is found then output (n%3d, Id), * otherwise output the identifier as is. *
0c555069	* To make it easier to match sigs, we do a xfrm :
fd08e02e	* 'function ID1 (..'. => 'n%3d = function (...' / / * we'll add all identifier to the scope's map * those that are not decl. will have initial ID -1 * if we later see a decl for it in same scope, it'll automatically get a * correct ID. * * When parsing of local scope is done, we take any ID -1 identifiers, * and push them up one level (careful not to overwrite existing IDs). * * it would be nice if the tokens would contain a link to the entry in the * hashtab, a link that automatically gets updated when the element is moved * (pushed up). This would prevent subsequent lookups in the map, * when we want to output the tokens. * There is no easy way to do that, so we just do another lookup * / / * This actually works, redefining foo: * function foo() { * var foo=5; alert(foo); * } * So we can't treat function names just as any other identifier? * We can, because you can no longer call foo, if you redefined it as a var. * So if we rename both foo-s with same name, it will have same behaviour. * * This means that a new scope should begin after function, and not after * function ... (. / static void scope_free_all(struct scope p) {
72fd33c8	struct scope *nxt; do { nxt = p->nxt; scope_done(p); p = nxt; } while (p);
fd08e02e	}
/* Declared here for decode_de(), which calls it as
 * cli_strtokenize(k, '|', nsplit, tokens) with `tokens` an array of
 * `token_count` string pointers to be filled in. */
size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
72fd33c8	static int match_parameters(const yystype tokens, const char *param_names, size_t count)
fd08e02e	{
72fd33c8	size_t i, j = 0; if (tokens[0].type != TOK_PAR_OPEN) return -1; i = 1; while (count--) { const char *token_val = TOKEN_GET(&tokens[i], cstring); if (tokens[i].type != TOK_IDENTIFIER_NAME \|\| !token_val \|\| strcmp(token_val, param_names[j++])) return -1; ++i; if ((count && tokens[i].type != TOK_COMMA) \|\| (!count && tokens[i].type != TOK_PAR_CLOSE)) return -1; ++i; } return 0;
fd08e02e	}
/* Parameter names of the two packer function signatures that run_decoders()
 * looks for: function(p,a,c,k,e,r) and function(p,a,c,k,e,d). */
static const char *de_packer_3[] = {"p", "a", "c", "k", "e", "r"};
static const char *de_packer_2[] = {"p", "a", "c", "k", "e", "d"};
fd08e02e	static inline char *textbuffer_done(yyscan_t scanner) {
72fd33c8	char *str = cli_realloc(scanner->buf.data, scanner->buf.pos); if (!str) { str = scanner->buf.data; } scanner->yytext = str; scanner->yylen = scanner->buf.pos - 1; memset(&scanner->buf, 0, sizeof(scanner->buf)); return str;
fd08e02e	} #define MODULE "JS-Norm: " static void free_token(yystype *token) {
72fd33c8	if (token->vtype == vtype_string) {
0c555069	if (NULL != token->val.string) { free(token->val.string); token->val.string = NULL; }
72fd33c8	}
fd08e02e	}
0c555069	static cl_error_t replace_token_range(struct tokens dst, size_t start, size_t end, const struct tokens with)
fd08e02e	{
72fd33c8	const size_t len = with ? with->cnt : 0; size_t i; cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n", (unsigned long)start, (unsigned long)end, (unsigned long)len); if (start >= dst->cnt \|\| end > dst->cnt)
0c555069	return CL_EARG;
72fd33c8	for (i = start; i < end; i++) { free_token(&dst->data[i]); } if (tokens_ensure_capacity(dst, dst->cnt - (end - start) + len)) return CL_EMEM; memmove(&dst->data[start + len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0])); if (with && len > 0) { memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0])); } dst->cnt = dst->cnt - (end - start) + len; return CL_SUCCESS;
fd08e02e	}
0c555069	static cl_error_t append_tokens(struct tokens dst, const struct tokens src)
fd08e02e	{
72fd33c8	if (!dst \|\| !src) return CL_ENULLARG; if (tokens_ensure_capacity(dst, dst->cnt + src->cnt)) return CL_EMEM; cli_dbgmsg(MODULE "Appending %lu tokens\n", (unsigned long)(src->cnt)); memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0])); dst->cnt += src->cnt; return CL_SUCCESS;
fd08e02e	} static void decode_de(yystype params[], struct text_buffer txtbuf) {
72fd33c8	const char p = TOKEN_GET(params[0], cstring); const long a = TOKEN_GET(params[1], ival); /const char c = params[2];/ char k = TOKEN_GET(params[3], string); /const char r = params[5];/ unsigned val = 0; unsigned nsplit = 0; const char o; const char tokens; memset(txtbuf, 0, sizeof(txtbuf)); if (!p \|\| !k) return; for (o = k; o; o++) if (o == '\|') nsplit++; nsplit++; tokens = malloc(sizeof(char ) nsplit); if (!tokens) { return; } cli_strtokenize(k, '\|', nsplit, tokens); do { while (p && !isalnum(p)) { if (p == '\\' && (p[1] == '\'' \|\| p[1] == '\"')) p++; else textbuffer_putc(txtbuf, p++); } if (!p) break; val = 0; o = p; while (p && isalnum(p)) { unsigned x; unsigned char v = p++; /* TODO: use a table here / if (v >= 'a') x = 10 + v - 'a'; else if (v >= 'A') x = 36 + v - 'A'; else x = v - '0'; val = val a + x; } if (val >= nsplit \|\| !tokens[val] \|\| !tokens[val][0]) while (o != p) textbuffer_putc(txtbuf, o++); else textbuffer_append(txtbuf, tokens[val]); } while (p); free(tokens); textbuffer_append(txtbuf, "\0");
fd08e02e	} struct decode_result {
72fd33c8	struct text_buffer txtbuf; size_t pos_begin; size_t pos_end; unsigned append : 1; /* 0: tokens are replaced with new token(s),
fd08e02e	1: old tokens are deleted, new ones appended at the end / }; static void handle_de(yystype tokens, size_t start, const size_t cnt, const char name, struct decode_result res) {
72fd33c8	/* find function decl. end / size_t i, nesting = 1, j; yystype parameters[6]; const size_t parameters_cnt = 6; for (i = start; i < cnt; i++) { if (tokens[i].type == TOK_FUNCTION) { if (TOKEN_GET(&tokens[i], scope)) nesting++; else nesting--; if (!nesting) break; } } if (nesting) return; memset(parameters, 0, sizeof(parameters)); if (name) { /* find call to function / for (; i + 2 < cnt; i++) { const char token_val = TOKEN_GET(&tokens[i], cstring); if (tokens[i].type == TOK_IDENTIFIER_NAME && token_val && !strcmp(name, token_val) && tokens[i + 1].type == TOK_PAR_OPEN) { i += 2; for (j = 0; j < parameters_cnt && i < cnt; j++) { parameters[j] = &tokens[i++]; if (j != parameters_cnt - 1) while (tokens[i].type != TOK_COMMA && i < cnt) i++; else while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++; i++; } if (j == parameters_cnt) decode_de(parameters, &res->txtbuf); } } } else { while (i < cnt && tokens[i].type != TOK_PAR_OPEN) i++; ++i; if (i >= cnt) return; /* TODO: move this v to another func / for (j = 0; j < parameters_cnt && i < cnt; j++) { parameters[j] = &tokens[i++]; if (j != parameters_cnt - 1) while (tokens[i].type != TOK_COMMA && i < cnt) i++; else while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++; i++; } if (j == parameters_cnt) decode_de(parameters, &res->txtbuf); } if (parameters[0] && parameters[parameters_cnt - 1]) { res->pos_begin = parameters[0] - tokens; res->pos_end = parameters[parameters_cnt - 1] - tokens + 1; if (tokens[res->pos_end].type == TOK_BRACKET_OPEN && tokens[res->pos_end + 1].type == TOK_BRACKET_CLOSE && tokens[res->pos_end + 2].type == TOK_PAR_CLOSE) res->pos_end += 3; / {}) / else res->pos_end++; / ) */ }
fd08e02e	}
0c555069	static cl_error_t handle_unescape(struct tokens *tokens, size_t start)
fd08e02e	{
0c555069	cl_error_t retval;
72fd33c8	if (tokens->data[start].type == TOK_StringLiteral) { char *R; struct tokens new_tokens; yystype tok; R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring)); tok.type = TOK_StringLiteral; TOKEN_SET(&tok, string, R); new_tokens.capacity = new_tokens.cnt = 1; new_tokens.data = &tok;
0c555069	if (CL_SUCCESS != (retval = replace_token_range(tokens, start - 2, start + 2, &new_tokens))) { if (retval == CL_EARG) { size_t i; cli_dbgmsg(MODULE "replace_token_range failed.\n"); for (i = 0; i < new_tokens.cnt; i++) { free_token(&(new_tokens.data[i])); } }
72fd33c8	return CL_EMEM;
0c555069	}
72fd33c8	} return CL_SUCCESS;
fd08e02e	} /* scriptasylum dot com's JS encoder */
08402afa	static void handle_df(const yystype tokens, size_t start, struct decode_result res)
fd08e02e	{
72fd33c8	char str, s1; size_t len, s1_len, i; unsigned char clast; char *R; if (tokens[start].type != TOK_StringLiteral) return; str = TOKEN_GET(&tokens[start], string); if (!str) return; len = strlen(str); if (!len) return; clast = str[len - 1] - '0'; str[len - 1] = '\0'; s1 = cli_unescape(str); s1_len = strlen(s1); for (i = 0; i < s1_len; i++) { s1[i] -= clast; } R = cli_unescape(s1); free(s1); res->pos_begin = start - 2; res->pos_end = start + 2; res->txtbuf.data = R; res->txtbuf.pos = strlen(R); res->append = 1;
fd08e02e	} static void handle_eval(struct tokens tokens, size_t start, struct decode_result res) {
72fd33c8	res->txtbuf.data = TOKEN_GET(&tokens->data[start], string); if (res->txtbuf.data && tokens->data[start + 1].type == TOK_PAR_CLOSE) { TOKEN_SET(&tokens->data[start], string, NULL); res->txtbuf.pos = strlen(res->txtbuf.data); res->pos_begin = start - 2; res->pos_end = start + 2; }
fd08e02e	} static void run_folders(struct tokens *tokens) {
72fd33c8	size_t i;
fd08e02e
72fd33c8	for (i = 0; i < tokens->cnt; i++) { const char *cstring = TOKEN_GET(&tokens->data[i], cstring); if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("unescape", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) {
fd08e02e
72fd33c8	handle_unescape(tokens, i + 2); } }
fd08e02e	} static inline int state_update_scope(struct parser_state state, const yystype token) {
72fd33c8	if (token->type == TOK_FUNCTION) { struct scope scope = TOKEN_GET(token, scope); if (scope) { state->current = scope; } else { / dummy token marking function end / if (state->current->parent) state->current = state->current->parent; / don't output this token, it is just a dummy marker */ return 0; } } return 1;
fd08e02e	} static void run_decoders(struct parser_state *state) {
72fd33c8	size_t i; const char name; struct tokens tokens = &state->tokens; for (i = 0; i < tokens->cnt; i++) { const char cstring = TOKEN_GET(&tokens->data[i], cstring); struct decode_result res; res.pos_begin = res.pos_end = 0; res.append = 0; if (tokens->data[i].type == TOK_FUNCTION && i + 13 < tokens->cnt) { name = NULL; ++i; if (tokens->data[i].type == TOK_IDENTIFIER_NAME) { cstring = TOKEN_GET(&tokens->data[i], cstring); name = cstring; ++i; } if (match_parameters(&tokens->data[i], de_packer_3, sizeof(de_packer_3) / sizeof(de_packer_3[0])) != -1 \|\| match_parameters(&tokens->data[i], de_packer_2, sizeof(de_packer_2) / sizeof(de_packer_2[0])) != -1) { / find function decl. end / handle_de(tokens->data, i, tokens->cnt, name, &res); } } else if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("dF", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) { / TODO: also match signature of dF function (possibly
fd08e02e	* declared using unescape */
72fd33c8	handle_df(tokens->data, i + 2, &res); } else if (i + 2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME && cstring && !strcmp("eval", cstring) && tokens->data[i + 1].type == TOK_PAR_OPEN) { handle_eval(tokens, i + 2, &res); } if (res.pos_end > res.pos_begin) { struct tokens parent_tokens; if (res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON) res.pos_end++; parent_tokens = state->tokens; /* save current tokens / / initialize embedded context */ memset(&state->tokens, 0, sizeof(state->tokens)); if (++state->rec > 16) cli_dbgmsg(MODULE "recursion limit reached\n"); else { cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos); --state->rec; } free(res.txtbuf.data);
0c555069	/* state->tokens still refers to the embedded/nested context here */
72fd33c8	if (!res.append) {
0c555069	if (CL_EARG == replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens)) { size_t j; cli_dbgmsg(MODULE "replace_token_range failed.\n"); for (j = 0; j < state->tokens.cnt; j++) { free_token(&(state->tokens.data[j])); } }
72fd33c8	} else { /* delete tokens / replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL); append_tokens(&parent_tokens, &state->tokens); } / end of embedded context, restore tokens state */ free(state->tokens.data); state->tokens = parent_tokens; } state_update_scope(state, &state->tokens.data[i]); }
fd08e02e	}
72fd33c8	void cli_js_parse_done(struct parser_state *state)
fd08e02e	{
72fd33c8	struct tokens tokens = &state->tokens; size_t par_balance = 0, i; char end = '\0'; YYSTYPE val; cli_dbgmsg(MODULE "in cli_js_parse_done()\n"); / close unfinished token / switch (state->scanner->state) { case DoubleQString: end = '"'; break; case SingleQString: end = '\''; break; default: / make gcc happy / break; } if (end != '\0') cli_js_process_buffer(state, &end, 1); / close remaining parenthesis / for (i = 0; i < tokens->cnt; i++) { if (tokens->data[i].type == TOK_PAR_OPEN) par_balance++; else if (tokens->data[i].type == TOK_PAR_CLOSE && par_balance > 0) par_balance--; } if (par_balance > 0) { memset(&val, 0, sizeof(val)); val.type = TOK_PAR_CLOSE; TOKEN_SET(&val, cstring, ")"); while (par_balance-- > 0) { add_token(state, &val); } } / we had to close unfinished strings, parenthesis,
a66b62f8	* so that the folders/decoders can run properly */
72fd33c8	run_folders(&state->tokens); run_decoders(state);
fd08e02e
72fd33c8	yylex_destroy(state->scanner); state->scanner = NULL;
fd08e02e	}
8be1d5a4	void cli_js_output(struct parser_state state, const char tempdir)
fd08e02e	{
72fd33c8	unsigned i; struct buf buf; char lastchar = '\0'; char filename[1024]; snprintf(filename, 1024, "%s" PATHSEP "javascript", tempdir); buf.pos = 0; buf.outfd = open(filename, O_CREAT \| O_WRONLY, 0600); if (buf.outfd < 0) { cli_errmsg(MODULE "cannot open output file for writing: %s\n", filename); return; } /* append to file / if (lseek(buf.outfd, 0, SEEK_END) != 0) { / separate multiple scripts with \n / buf_outc('\n', &buf); } buf_outs("<script>", &buf); state->current = state->global; for (i = 0; i < state->tokens.cnt; i++) { if (state_update_scope(state, &state->tokens.data[i])) lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar); } / add /script if not already there */ if (buf.pos < 9 \|\| memcmp(buf.buf + buf.pos - 9, "</script>", 9)) buf_outs("</script>", &buf); if (write(buf.outfd, buf.buf, buf.pos) < 0) { cli_dbgmsg(MODULE "I/O error\n"); } close(buf.outfd); cli_dbgmsg(MODULE "dumped/appended normalized script to: %s\n", filename);
fd08e02e	} void cli_js_destroy(struct parser_state *state) {
72fd33c8	size_t i; if (!state) return; scope_free_all(state->list); for (i = 0; i < state->tokens.cnt; i++) { free_token(&state->tokens.data[i]); } free(state->tokens.data); /* detect use after free / if (state->scanner) yylex_destroy(state->scanner); memset(state, 0x55, sizeof(state)); free(state); cli_dbgmsg(MODULE "cli_js_destroy() done\n");
fd08e02e	} /* buffer is html-normlike "chunk", if original file is bigger than buffer, * we rewind to a space, so we'll know that tokens won't be broken in half at * the end of a buffer. All tokens except string-literals of course. * So we can assume that after the buffer there is either a space, EOF, or a * chunk of text not containing whitespace at all (for which we care only if its * a stringliteral)/ void cli_js_process_buffer(struct parser_state state, const char *buf, size_t n) {
72fd33c8	struct scope *current = state->current; YYSTYPE val; int yv; YY_BUFFER_STATE yyb;
fd08e02e
72fd33c8	if (!state->global) { /* this state has either not been initialized,
fd08e02e	* or cli_js_parse_done() was already called on it */
72fd33c8	cli_warnmsg(MODULE "invalid state\n"); return; } yyb = yy_scan_bytes(buf, n, state->scanner); memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; /* on EOF yylex will return 0 / while ((yv = yylex(&val, state->scanner)) != 0) { const char text; size_t leng; val.type = yv; switch (yv) { case TOK_VAR: current->fsm_state = InsideVar; break; case TOK_IDENTIFIER_NAME: text = yyget_text(state->scanner); leng = yyget_leng(state->scanner); if (current->last_token == TOK_DOT) { /* this is a member name, don't normalize
fd08e02e	*/
72fd33c8	TOKEN_SET(&val, string, cli_strdup(text)); val.type = TOK_UNNORM_IDENTIFIER; } else { switch (current->fsm_state) { case WaitParameterList: state->syntax_errors++; /* fall through / case Base: case InsideInitializer: TOKEN_SET(&val, cstring, scope_use(current, text, leng)); break; case InsideVar: case InsideFunctionDecl: TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state)); current->fsm_state = InsideInitializer; current->brackets = 0; break; case WaitFunctionName: TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state)); current->fsm_state = WaitParameterList; break; } } break; case TOK_PAR_OPEN: switch (current->fsm_state) { case WaitFunctionName: / fallthrough / case WaitParameterList: current->fsm_state = InsideFunctionDecl; break; default: / noop / break; } break; case TOK_PAR_CLOSE: switch (current->fsm_state) { case WaitFunctionName: state->syntax_errors++; break; case WaitParameterList: current->fsm_state = Base; break; default: / noop / break; } break; case TOK_CURLY_BRACE_OPEN: switch (current->fsm_state) { case WaitFunctionName: / fallthrough / case WaitParameterList: case InsideFunctionDecl: / in a syntactically correct
fd08e02e	* file, we would already be in * the Base state when we see a { */
72fd33c8	current->fsm_state = Base; /* fall-through / case InsideVar: case InsideInitializer: state->syntax_errors++; / fall-through / case Base: default: current->blocks++; break; } break; case TOK_CURLY_BRACE_CLOSE: if (current->blocks > 0) current->blocks--; else state->syntax_errors++; if (!current->blocks) { if (current->parent) { / add dummy FUNCTION token to
fd08e02e	* mark function end */
72fd33c8	TOKEN_SET(&val, cstring, "}"); add_token(state, &val); TOKEN_SET(&val, scope, NULL); val.type = TOK_FUNCTION; state->current = current = current->parent; } else { /* extra } / state->syntax_errors++; } } break; case TOK_BRACKET_OPEN: current->brackets++; break; case TOK_BRACKET_CLOSE: if (current->brackets > 0) current->brackets--; else state->syntax_errors++; break; case TOK_COMMA: if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) { / initializer ended only if we
fd08e02e	* encountered a comma, and [] are * balanced. * This avoids switching state on: * var x = [4,y,u];*/
72fd33c8	current->fsm_state = InsideVar; } break; case TOK_SEMICOLON: if (current->brackets == 0 && current->blocks == 0) { /* avoid switching state on unbalanced []:
fd08e02e	* var x = [test;testi]; */
72fd33c8	current->fsm_state = Base; } break; case TOK_FUNCTION: current = scope_new(state); current->fsm_state = WaitFunctionName; TOKEN_SET(&val, scope, state->current); break; case TOK_StringLiteral: if (state->tokens.cnt > 1 && state->tokens.data[state->tokens.cnt - 1].type == TOK_PLUS) { /* see if can fold / yystype prev_string = &state->tokens.data[state->tokens.cnt - 2]; if (prev_string->type == TOK_StringLiteral) { char str = TOKEN_GET(prev_string, string); size_t str_len = strlen(str); text = yyget_text(state->scanner); leng = yyget_leng(state->scanner); / delete TOK_PLUS */ free_token(&state->tokens.data[--state->tokens.cnt]); str = cli_realloc(str, str_len + leng + 1); if (!str) break; strncpy(str + str_len, text, leng); str[str_len + leng] = '\0'; TOKEN_SET(prev_string, string, str); free(val.val.string); memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; continue; } } break; } if (val.vtype == vtype_undefined) { text = yyget_text(state->scanner); TOKEN_SET(&val, string, cli_strdup(text)); abort(); } add_token(state, &val); current->last_token = yv; memset(&val, 0, sizeof(val)); val.vtype = vtype_undefined; }
fd08e02e	}
8be1d5a4	struct parser_state *cli_js_init(void)
fd08e02e	{
72fd33c8	struct parser_state state = cli_calloc(1, sizeof(state)); if (!state) return NULL; if (!scope_new(state)) { free(state); return NULL; } state->global = state->current; if (yylex_init(&state->scanner)) { scope_done(state->global); free(state); return NULL; } cli_dbgmsg(MODULE "cli_js_init() done\n"); return state;
fd08e02e	} /-------------- tokenizer ---------------------/ enum char_class {
72fd33c8	Whitespace, Slash, Operator, DQuote, SQuote, Digit, IdStart, BracketOpen = TOK_BRACKET_OPEN, BracketClose = TOK_BRACKET_CLOSE, Comma = TOK_COMMA, CurlyOpen = TOK_CURLY_BRACE_OPEN, CurlyClose = TOK_CURLY_BRACE_CLOSE, ParOpen = TOK_PAR_OPEN, ParClose = TOK_PAR_CLOSE, Dot = TOK_DOT, SemiColon = TOK_SEMICOLON, Nop
fd08e02e	}; #define SL Slash #define DG Digit #define DQ DQuote #define SQ SQuote #define ID IdStart #define OP Operator #define WS Whitespace #define BO BracketOpen #define BC BracketClose #define CM Comma #define CO CurlyOpen #define CC CurlyClose #define PO ParOpen #define PC ParClose #define DT Dot #define SC SemiColon #define NA Nop static const enum char_class ctype[256] = {
72fd33c8	NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL, DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA};
fd08e02e	static const enum char_class id_ctype[256] = {
72fd33c8	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID, NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
fd08e02e	};
/*
 * Expands to a complete `case` for a one-character punctuation token:
 * stores the literal text S as the token's constant string and returns the
 * character class as the token id.
 *
 * The expansion refers to `lvalp` and `cClass`, so both must be in scope
 * where the macro is used.
 */
#define CASE_SPECIAL_CHAR(C, S)         \
    case C:                             \
        TOKEN_SET(lvalp, cstring, (S)); \
        return cClass;
fd08e02e	#define BUF_KEEP_SIZE 32768 static void textbuf_clean(struct text_buffer *buf) {
72fd33c8	if (buf->capacity > BUF_KEEP_SIZE) { char *data = cli_realloc(buf->data, BUF_KEEP_SIZE); if (data) buf->data = data; buf->capacity = BUF_KEEP_SIZE; } buf->pos = 0;
fd08e02e	} static inline int parseString(YYSTYPE *lvalp, yyscan_t scanner, const char q,
72fd33c8	enum tokenizer_state tostate)
fd08e02e	{
72fd33c8	size_t len; /* look for " terminating the string / const char start = &scanner->in[scanner->pos], end = start; do { const size_t siz = &scanner->in[scanner->insize] - end; end = memchr(end, q, siz); if (end && end > start && end[-1] == '\\') { ++end; continue; } break; } while (1); if (end && end >= start) len = end - start; else len = scanner->insize - scanner->pos; cli_textbuffer_append_normalize(&scanner->buf, start, len); if (end) { char str; /* skip over end quote / scanner->pos += len + 1; textbuffer_putc(&scanner->buf, '\0'); str = textbuffer_done(scanner); if (str) { TOKEN_SET(lvalp, string, str); } else { TOKEN_SET(lvalp, cstring, ""); } scanner->state = Initial; assert(lvalp->val.string); return TOK_StringLiteral; } else { scanner->pos += len; / unfinished string */ scanner->state = tostate; return 0; }
fd08e02e	} static inline int parseDQString(YYSTYPE *lvalp, yyscan_t scanner) {
72fd33c8	return parseString(lvalp, scanner, '"', DoubleQString);
fd08e02e	} static inline int parseSQString(YYSTYPE *lvalp, yyscan_t scanner) {
72fd33c8	return parseString(lvalp, scanner, '\'', SingleQString);
fd08e02e	} static inline int parseNumber(YYSTYPE *lvalp, yyscan_t scanner) {
72fd33c8	const unsigned char in = (const unsigned char )scanner->in; int is_float = 0; while (scanner->pos < scanner->insize) { unsigned char c = in[scanner->pos++]; if (isdigit(c)) { textbuffer_putc(&scanner->buf, c); continue; } if (c == '.' && !is_float) { is_float = 1; textbuffer_putc(&scanner->buf, '.'); continue; } if ((c == 'e' \|\| c == 'E') && is_float) { textbuffer_putc(&scanner->buf, c); if (scanner->pos < scanner->insize) { c = in[scanner->pos++]; if (c == '+' \|\| c == '-' \|\| isdigit(c)) { textbuffer_putc(&scanner->buf, c); continue; } } } scanner->pos--; textbuffer_putc(&scanner->buf, '\0'); scanner->state = Initial; if (!scanner->buf.data) return 0; if (is_float) { TOKEN_SET(lvalp, dval, atof(scanner->buf.data)); return TOK_NumericFloat; } else { TOKEN_SET(lvalp, ival, atoi(scanner->buf.data)); return TOK_NumericInt; } } scanner->state = Number; return 0;
fd08e02e	} static inline int parseId(YYSTYPE *lvalp, yyscan_t scanner) {
72fd33c8	const struct keyword kw; const unsigned char in = (const unsigned char )scanner->in; scanner->state = Initial; while (scanner->pos < scanner->insize) { unsigned char c = in[scanner->pos++]; enum char_class cClass = id_ctype[c]; switch (cClass) { case IdStart: textbuffer_putc(&scanner->buf, c); break; case Operator: / the table contains OP only for \ / assert(c == '\\'); if (scanner->pos < scanner->insize && in[scanner->pos++] == 'u') { textbuffer_putc(&scanner->buf, c); break; } if (scanner->pos == scanner->insize) { scanner->pos++; } / else fallthrough / default: / character is no longer part of identifier / scanner->state = Initial; textbuffer_putc(&scanner->buf, '\0'); scanner->pos--; kw = in_word_set(scanner->buf.data, scanner->buf.pos - 1); if (kw) { / we got a keyword / TOKEN_SET(lvalp, cstring, kw->name); return kw->val; } / it is not a keyword, just an identifier */ TOKEN_SET(lvalp, cstring, NULL); return TOK_IDENTIFIER_NAME; } } scanner->state = Identifier; return 0;
fd08e02e	} static int parseOperator(YYSTYPE *lvalp, yyscan_t scanner) {
72fd33c8	size_t len = MIN(5, scanner->insize - scanner->pos); while (len) { const struct operatorkw = in_op_set(&scanner->in[scanner->pos], len); if (kw) { TOKEN_SET(lvalp, cstring, kw->name); scanner->pos += len; return kw->val; } len--; } / never reached */ assert(0); scanner->pos++; TOKEN_SET(lvalp, cstring, NULL); return TOK_ERROR;
fd08e02e	} static int yylex_init(yyscan_t *scanner) {
72fd33c8	scanner = cli_calloc(1, sizeof(scanner)); return scanner ? 0 : -1;
fd08e02e	} static int yylex_destroy(yyscan_t scanner) {
72fd33c8	free(scanner->buf.data); free(scanner); return 0;
fd08e02e	} static int yy_scan_bytes(const char *p, size_t len, yyscan_t scanner) {
72fd33c8	scanner->in = p; scanner->insize = len; scanner->pos = 0; scanner->lastpos = -1; scanner->last_state = Dummy; return 0;
fd08e02e	} static const char *yyget_text(yyscan_t scanner) {
72fd33c8	return scanner->yytext ? scanner->yytext : scanner->buf.data;
fd08e02e	} static int yyget_leng(yyscan_t scanner) {
72fd33c8	/* we have a \0 too */ return scanner->yylen ? scanner->yylen : (scanner->buf.pos > 0 ? scanner->buf.pos - 1 : 0);
fd08e02e	}
72fd33c8	static int yylex(YYSTYPE *lvalp, yyscan_t scanner)
fd08e02e	{
72fd33c8	const size_t len = scanner->insize; const unsigned char in = (const unsigned char )scanner->in; unsigned char lookahead; enum char_class cClass; scanner->yytext = NULL; scanner->yylen = 0; if (scanner->pos == scanner->lastpos) { if (scanner->last_state == scanner->state) { cli_dbgmsg(MODULE "infloop detected, skipping character\n"); scanner->pos++; } /* its not necesarely an infloop if it changed
72733fba	* state, and it shouldn't infloop between states */
72fd33c8	} scanner->lastpos = scanner->pos; scanner->last_state = scanner->state; while (scanner->pos < scanner->insize) { switch (scanner->state) { case Initial: textbuf_clean(&scanner->buf); cClass = ctype[in[scanner->pos++]]; switch (cClass) { case Whitespace: /* eat whitespace / continue; case Slash: if (scanner->pos < len) { lookahead = in[scanner->pos]; switch (lookahead) { case '': scanner->state = MultilineComment; scanner->pos++; continue; case '/': scanner->state = SinglelineComment; scanner->pos++; continue; } } --scanner->pos; return parseOperator(lvalp, scanner); case Operator: --scanner->pos; return parseOperator(lvalp, scanner); case DQuote: return parseDQString(lvalp, scanner); case SQuote: return parseSQString(lvalp, scanner); case Digit: --scanner->pos; return parseNumber(lvalp, scanner); case IdStart: --scanner->pos; return parseId(lvalp, scanner); CASE_SPECIAL_CHAR(BracketOpen, "["); CASE_SPECIAL_CHAR(BracketClose, "]"); CASE_SPECIAL_CHAR(Comma, ","); CASE_SPECIAL_CHAR(CurlyOpen, "{"); CASE_SPECIAL_CHAR(CurlyClose, "}"); CASE_SPECIAL_CHAR(ParOpen, "("); CASE_SPECIAL_CHAR(ParClose, ")"); CASE_SPECIAL_CHAR(Dot, "."); CASE_SPECIAL_CHAR(SemiColon, ";"); case Nop: continue; } break; case DoubleQString: return parseString(lvalp, scanner, '"', DoubleQString); case SingleQString: return parseString(lvalp, scanner, '\'', SingleQString); case Identifier: return parseId(lvalp, scanner); case MultilineComment: while (scanner->pos + 1 < scanner->insize) { if (in[scanner->pos] == '' && in[scanner->pos + 1] == '/') { scanner->state = Initial; scanner->pos++; break; } scanner->pos++; } scanner->pos++; break; case Number: return parseNumber(lvalp, scanner); case SinglelineComment: while (scanner->pos < scanner->insize) { / htmlnorm converts \n to space, so
086ad393	* stop on space too */
72fd33c8	if (in[scanner->pos] == '\n' \|\| in[scanner->pos] == ' ') break; scanner->pos++; } scanner->state = Initial; break; default: assert(0 && "Not reached"); } } return 0;
fd08e02e	}