/* * Parse a regular expression, and extract a static suffix. * * Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2007-2013 Sourcefire, Inc. * * Authors: Török Edvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #if HAVE_CONFIG_H #include "clamav-config.h" #endif #include #include #include #include #include "clamav.h" #include "others.h" #include "jsparse/textbuf.h" #include "regex_suffix.h" #define MODULE "regex_suffix: " enum node_type { root = 0, concat, alternate, /* | */ optional, /* ?, * */ leaf, /* a character */ leaf_class /* character class */ /* (x)+ is transformed into (x)*(x) */ }; struct node { enum node_type type; /* must be first field */ struct node *parent; union { struct { struct node *left; struct node *right; } children; uint8_t *leaf_class_bitmap; uint8_t leaf_char; } u; }; /* --- Prototypes --*/ static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex); /* -----------------*/ static uint8_t dot_bitmap[32] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; static struct node *make_node(enum node_type type, struct node *left, struct node *right) { struct node *n; if (type == concat) { if (left == NULL) return right; if (right == NULL) return left; } n = cli_malloc(sizeof(*n)); if (!n) { cli_errmsg("make_node: Unable to allocate memory for new node\n"); return NULL; } n->type = type; n->parent = NULL; n->u.children.left = left; n->u.children.right = right; if (left) left->parent = n; if (right) right->parent = n; return n; } static struct node *dup_node(struct node *p) { struct node *node_left, *node_right; struct node *d; if (!p) return NULL; d = cli_malloc(sizeof(*d)); if (!d) { cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n"); return NULL; } d->type = p->type; d->parent = NULL; switch (p->type) { case leaf: d->u.leaf_char = p->u.leaf_char; break; case leaf_class: d->u.leaf_class_bitmap = cli_malloc(32); if (!d->u.leaf_class_bitmap) { cli_errmsg("make_node: Unable to allocate memory for leaf class\n"); free(d); return NULL; } memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32); break; default: node_left = dup_node(p->u.children.left); node_right = dup_node(p->u.children.right); d->u.children.left = node_left; d->u.children.right = node_right; if (node_left) node_left->parent = d; if (node_right) node_right->parent = d; break; } return d; } static struct node *make_charclass(uint8_t *bitmap) { struct node *v = cli_malloc(sizeof(*v)); if (!v) { cli_errmsg("make_charclass: Unable to allocate memory for character class\n"); return NULL; } v->type = leaf_class; v->parent = NULL; v->u.leaf_class_bitmap = bitmap; return v; } static struct node *make_leaf(char c) { struct node *v = cli_malloc(sizeof(*v)); if (!v) return NULL; v->type = leaf; v->parent = NULL; v->u.leaf_char = c; return v; } static void destroy_tree(struct node *n) { if (!n) return; switch (n->type) { case concat: case alternate: case optional: destroy_tree(n->u.children.left); destroy_tree(n->u.children.right); break; case leaf_class: if (n->u.leaf_class_bitmap != dot_bitmap) free(n->u.leaf_class_bitmap); break; case root: case leaf: break; } free(n); } static uint8_t *parse_char_class(const char *pat, size_t *pos) { unsigned char range_start = 0; int hasprev = 0; uint8_t *bitmap = cli_malloc(32); if (!bitmap) { cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n"); return NULL; } if (pat[*pos] == '^') { memset(bitmap, 0xFF, 32); /*match chars not in brackets*/ ++*pos; } else memset(bitmap, 0x00, 32); do { /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ if (pat[*pos] == '-' && hasprev) { /* it is a range*/ unsigned char range_end; unsigned int c; assert(range_start); ++*pos; if (pat[*pos] == '[') if (pat[*pos + 1] == '.') { /* collating sequence not handled */ free(bitmap); /* we are parsing the regex for a * filter, be conservative and * tell the filter that anything could * match here */ while (pat[*pos] != ']') ++*pos; ++*pos; while (pat[*pos] != ']') ++*pos; return dot_bitmap; } else range_end = pat[*pos]; else range_end = pat[*pos]; for (c = range_start + 1; c <= range_end; c++) bitmap[c >> 3] ^= 1 << (c & 0x7); hasprev = 0; } else if (pat[*pos] == '[' && pat[*pos] == ':') { /* char class */ free(bitmap); while (pat[*pos] != ']') ++*pos; ++*pos; while (pat[*pos] != ']') ++*pos; return dot_bitmap; } else { bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7); range_start = pat[*pos]; ++*pos; hasprev = 1; } } while (pat[*pos] != ']'); return bitmap; } static struct node *parse_regex(const char *p, size_t *last) { struct node *v = NULL; struct node *right; struct node *tmp; while (p[*last] != '$' && p[*last] != '\0') { switch (p[*last]) { case '|': ++*last; right = parse_regex(p, last); v = make_node(alternate, v, right); if (!v) return NULL; break; case '*': case '?': v = make_node(optional, v, NULL); if (!v) return NULL; ++*last; break; case '+': /* (x)* */ tmp = make_node(optional, v, NULL); if (!tmp) return NULL; /* (x) */ right = dup_node(v); if (!right) { free(tmp); return NULL; } /* (x)*(x) => (x)+ */ v = make_node(concat, tmp, right); if (!v) return NULL; ++*last; break; case '(': ++*last; right = parse_regex(p, last); if (!right) return NULL; ++*last; v = make_node(concat, v, right); break; case ')': return v; case '.': right = make_charclass(dot_bitmap); if (!right) return NULL; v = make_node(concat, v, right); if (!v) return NULL; ++*last; break; case '[': ++*last; right = make_charclass(parse_char_class(p, last)); if (!right) return NULL; v = make_node(concat, v, right); if (!v) return NULL; ++*last; break; case '\\': /* next char is escaped, advance pointer * and let fall-through handle it */ ++*last; default: right = make_leaf(p[*last]); v = make_node(concat, v, right); if (!v) return NULL; ++*last; break; } } return v; } #define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7))) static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex) { size_t i, cnt; while (n) { struct node *q = n; switch (n->type) { case root: textbuffer_putc(buf, '\0'); if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) return CL_EMEM; return CL_SUCCESS; case leaf: textbuffer_putc(buf, n->u.leaf_char); n = n->parent; break; case leaf_class: cnt = 0; for (i = 0; i < 255; i++) if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) cnt++; if (cnt > 16) { textbuffer_putc(buf, '\0'); if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) return CL_EMEM; return CL_SUCCESS; } /* handle small classes by expanding */ for (i = 0; i < 255; i++) { if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) { size_t pos; pos = buf->pos; textbuffer_putc(buf, (char)i); if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS) return CL_EMEM; buf->pos = pos; } } return 0; case concat: if (prev != n->u.children.left) { if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS) return CL_EMEM; /* we're done here, descend will call * ascend if needed */ return CL_SUCCESS; } else { n = n->parent; } break; case alternate: n = n->parent; break; case optional: textbuffer_putc(buf, '\0'); if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) return CL_EMEM; return CL_SUCCESS; } prev = q; } return CL_SUCCESS; } static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex) { size_t pos; while (n && n->type == concat) { n = n->u.children.right; } if (!n) return CL_SUCCESS; /* find out end of the regular expression, * if it ends with a static pattern */ switch (n->type) { case alternate: /* save pos as restart point */ pos = buf->pos; if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS) return CL_EMEM; buf->pos = pos; if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS) return CL_EMEM; buf->pos = pos; break; case optional: textbuffer_putc(buf, '\0'); if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) return CL_EMEM; return CL_SUCCESS; case leaf: case leaf_class: if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS) return CL_EMEM; return CL_SUCCESS; default: break; } return CL_SUCCESS; } cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata) { struct regex_list regex; struct text_buffer buf; struct node root_node; struct node *n; size_t last = 0; int rc; assert(pattern); regex.preg = preg; rc = cli_regcomp(regex.preg, pattern, REG_EXTENDED); if (rc) { size_t buflen = cli_regerror(rc, regex.preg, NULL, 0); char *errbuf = cli_malloc(buflen); if (errbuf) { cli_regerror(rc, regex.preg, errbuf, buflen); cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf); free(errbuf); } else { cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern); } return rc; } regex.nxt = NULL; regex.pattern = cli_strdup(pattern); n = parse_regex(pattern, &last); if (!n) return REG_ESPACE; memset(&buf, 0, sizeof(buf)); memset(&root_node, 0, sizeof(buf)); n->parent = &root_node; rc = build_suffixtree_descend(n, &buf, cb, cbdata, ®ex); free(regex.pattern); free(buf.data); destroy_tree(n); return rc; }