5ee56e41 |
/*
* Parse a regular expression, and extract a static suffix.
* |
e1cbc270 |
* Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc. |
5ee56e41 |
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
|
60d8d2c3 |
#include "clamav.h" |
5ee56e41 |
#include "others.h"
#include "jsparse/textbuf.h"
#include "regex_suffix.h"
#define MODULE "regex_suffix: "
enum node_type { |
288057e9 |
root = 0,
concat,
alternate, /* | */
optional, /* ?, * */
leaf, /* a character */
leaf_class /* character class */
/* (x)+ is transformed into (x)*(x) */ |
5ee56e41 |
};
struct node { |
288057e9 |
enum node_type type; /* must be first field */
struct node *parent;
union {
struct {
struct node *left;
struct node *right;
} children;
uint8_t *leaf_class_bitmap;
uint8_t leaf_char;
} u; |
5ee56e41 |
};
/* --- Prototypes --*/ |
102cd430 |
static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex); |
5ee56e41 |
/* -----------------*/
|
dfc0c031 |
static uint8_t dot_bitmap[32] = { |
288057e9 |
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; |
5ee56e41 |
|
288057e9 |
static struct node *make_node(enum node_type type, struct node *left, struct node *right) |
5ee56e41 |
{ |
288057e9 |
struct node *n;
if (type == concat) {
if (left == NULL)
return right;
if (right == NULL)
return left;
}
n = cli_malloc(sizeof(*n));
if (!n) { |
241e7eb1 |
cli_errmsg("make_node: Unable to allocate memory for new node\n"); |
288057e9 |
return NULL; |
241e7eb1 |
} |
288057e9 |
n->type = type;
n->parent = NULL;
n->u.children.left = left;
n->u.children.right = right;
if (left)
left->parent = n;
if (right)
right->parent = n;
return n; |
5ee56e41 |
}
static struct node *dup_node(struct node *p)
{ |
288057e9 |
struct node *node_left, *node_right;
struct node *d; |
5ee56e41 |
|
288057e9 |
if (!p)
return NULL;
d = cli_malloc(sizeof(*d));
if (!d) { |
241e7eb1 |
cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n"); |
288057e9 |
return NULL; |
241e7eb1 |
} |
288057e9 |
d->type = p->type;
d->parent = NULL;
switch (p->type) {
case leaf:
d->u.leaf_char = p->u.leaf_char;
break;
case leaf_class:
d->u.leaf_class_bitmap = cli_malloc(32);
if (!d->u.leaf_class_bitmap) { |
241e7eb1 |
cli_errmsg("make_node: Unable to allocate memory for leaf class\n"); |
288057e9 |
free(d);
return NULL;
}
memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
break;
default:
node_left = dup_node(p->u.children.left);
node_right = dup_node(p->u.children.right);
d->u.children.left = node_left;
d->u.children.right = node_right;
if (node_left)
node_left->parent = d;
if (node_right)
node_right->parent = d;
break;
}
return d; |
5ee56e41 |
}
static struct node *make_charclass(uint8_t *bitmap)
{ |
288057e9 |
struct node *v = cli_malloc(sizeof(*v));
if (!v) { |
241e7eb1 |
cli_errmsg("make_charclass: Unable to allocate memory for character class\n"); |
288057e9 |
return NULL; |
241e7eb1 |
} |
288057e9 |
v->type = leaf_class;
v->parent = NULL;
v->u.leaf_class_bitmap = bitmap;
return v; |
5ee56e41 |
}
static struct node *make_leaf(char c)
{ |
288057e9 |
struct node *v = cli_malloc(sizeof(*v));
if (!v)
return NULL;
v->type = leaf;
v->parent = NULL;
v->u.leaf_char = c;
return v; |
5ee56e41 |
}
static void destroy_tree(struct node *n)
{ |
288057e9 |
if (!n)
return;
switch (n->type) {
case concat:
case alternate:
case optional:
destroy_tree(n->u.children.left);
destroy_tree(n->u.children.right);
break;
case leaf_class:
if (n->u.leaf_class_bitmap != dot_bitmap)
free(n->u.leaf_class_bitmap);
break;
case root:
case leaf:
break;
}
free(n); |
5ee56e41 |
}
|
288057e9 |
static uint8_t *parse_char_class(const char *pat, size_t *pos) |
5ee56e41 |
{ |
288057e9 |
unsigned char range_start = 0;
int hasprev = 0;
uint8_t *bitmap = cli_malloc(32);
if (!bitmap) { |
241e7eb1 |
cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n"); |
288057e9 |
return NULL; |
241e7eb1 |
} |
288057e9 |
if (pat[*pos] == '^') {
memset(bitmap, 0xFF, 32); /*match chars not in brackets*/
++*pos;
} else
memset(bitmap, 0x00, 32);
do {
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
if (pat[*pos] == '-' && hasprev) {
/* it is a range*/
unsigned char range_end;
unsigned int c;
assert(range_start);
++*pos;
if (pat[*pos] == '[')
if (pat[*pos + 1] == '.') {
/* collating sequence not handled */
free(bitmap);
/* we are parsing the regex for a |
5ee56e41 |
* filter, be conservative and
* tell the filter that anything could
* match here */ |
288057e9 |
while (pat[*pos] != ']') ++*pos;
++*pos;
while (pat[*pos] != ']') ++*pos;
return dot_bitmap;
} else
range_end = pat[*pos];
else
range_end = pat[*pos];
for (c = range_start + 1; c <= range_end; c++)
bitmap[c >> 3] ^= 1 << (c & 0x7);
hasprev = 0;
} else if (pat[*pos] == '[' && pat[*pos] == ':') {
/* char class */
free(bitmap);
while (pat[*pos] != ']') ++*pos;
++*pos;
while (pat[*pos] != ']') ++*pos;
return dot_bitmap;
} else {
bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7);
range_start = pat[*pos];
++*pos;
hasprev = 1;
}
} while (pat[*pos] != ']');
return bitmap; |
5ee56e41 |
}
|
288057e9 |
static struct node *parse_regex(const char *p, size_t *last) |
5ee56e41 |
{ |
288057e9 |
struct node *v = NULL;
struct node *right;
struct node *tmp; |
5ee56e41 |
|
288057e9 |
while (p[*last] != '$' && p[*last] != '\0') {
switch (p[*last]) {
case '|':
++*last;
right = parse_regex(p, last);
v = make_node(alternate, v, right);
if (!v)
return NULL;
break;
case '*':
case '?':
v = make_node(optional, v, NULL);
if (!v)
return NULL;
++*last;
break;
case '+':
/* (x)* */
tmp = make_node(optional, v, NULL);
if (!tmp)
return NULL;
/* (x) */
right = dup_node(v); |
50876732 |
if (!right) {
free(tmp); |
288057e9 |
return NULL; |
50876732 |
} |
288057e9 |
/* (x)*(x) => (x)+ */
v = make_node(concat, tmp, right);
if (!v)
return NULL;
++*last;
break;
case '(':
++*last;
right = parse_regex(p, last);
if (!right)
return NULL;
++*last;
v = make_node(concat, v, right);
break;
case ')':
return v;
case '.':
right = make_charclass(dot_bitmap);
if (!right)
return NULL;
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
case '[':
++*last;
right = make_charclass(parse_char_class(p, last));
if (!right)
return NULL;
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
case '\\':
/* next char is escaped, advance pointer |
5ee56e41 |
* and let fall-through handle it */ |
288057e9 |
++*last;
default:
right = make_leaf(p[*last]);
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
}
}
return v; |
5ee56e41 |
}
|
288057e9 |
#define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7))) |
5ee56e41 |
|
102cd430 |
static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex) |
5ee56e41 |
{ |
288057e9 |
size_t i, cnt;
while (n) {
struct node *q = n;
switch (n->type) {
case root:
textbuffer_putc(buf, '\0'); |
52e85060 |
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM; |
102cd430 |
return CL_SUCCESS; |
288057e9 |
case leaf:
textbuffer_putc(buf, n->u.leaf_char);
n = n->parent;
break;
case leaf_class:
cnt = 0;
for (i = 0; i < 255; i++)
if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
cnt++;
if (cnt > 16) {
textbuffer_putc(buf, '\0'); |
52e85060 |
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM; |
102cd430 |
return CL_SUCCESS; |
288057e9 |
}
/* handle small classes by expanding */
for (i = 0; i < 255; i++) {
if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
size_t pos;
pos = buf->pos;
textbuffer_putc(buf, (char)i); |
52e85060 |
if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM;
buf->pos = pos;
}
}
return 0;
case concat:
if (prev != n->u.children.left) { |
52e85060 |
if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM;
/* we're done here, descend will call |
5ee56e41 |
* ascend if needed */ |
102cd430 |
return CL_SUCCESS; |
288057e9 |
} else {
n = n->parent;
}
break;
case alternate:
n = n->parent;
break;
case optional:
textbuffer_putc(buf, '\0'); |
52e85060 |
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM; |
102cd430 |
return CL_SUCCESS; |
288057e9 |
}
prev = q;
} |
102cd430 |
return CL_SUCCESS; |
5ee56e41 |
}
|
102cd430 |
static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex) |
5ee56e41 |
{ |
288057e9 |
size_t pos;
while (n && n->type == concat) {
n = n->u.children.right;
}
if (!n) |
102cd430 |
return CL_SUCCESS; |
288057e9 |
/* find out end of the regular expression, |
5ee56e41 |
* if it ends with a static pattern */ |
288057e9 |
switch (n->type) {
case alternate:
/* save pos as restart point */
pos = buf->pos; |
52e85060 |
if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM;
buf->pos = pos; |
52e85060 |
if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM;
buf->pos = pos;
break;
case optional:
textbuffer_putc(buf, '\0'); |
52e85060 |
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM; |
102cd430 |
return CL_SUCCESS; |
288057e9 |
case leaf:
case leaf_class: |
52e85060 |
if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS) |
288057e9 |
return CL_EMEM; |
102cd430 |
return CL_SUCCESS; |
288057e9 |
default:
break;
} |
102cd430 |
return CL_SUCCESS; |
5ee56e41 |
}
|
102cd430 |
cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata) |
5ee56e41 |
{ |
288057e9 |
struct regex_list regex;
struct text_buffer buf;
struct node root_node;
struct node *n;
size_t last = 0;
int rc; |
5ee56e41 |
|
288057e9 |
assert(pattern); |
5ee56e41 |
|
288057e9 |
regex.preg = preg;
rc = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
if (rc) {
size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
char *errbuf = cli_malloc(buflen);
if (errbuf) {
cli_regerror(rc, regex.preg, errbuf, buflen);
cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
free(errbuf);
} else {
cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
}
return rc;
}
regex.nxt = NULL;
regex.pattern = cli_strdup(pattern); |
5ee56e41 |
|
288057e9 |
n = parse_regex(pattern, &last);
if (!n)
return REG_ESPACE;
memset(&buf, 0, sizeof(buf));
memset(&root_node, 0, sizeof(buf));
n->parent = &root_node; |
5ee56e41 |
|
288057e9 |
rc = build_suffixtree_descend(n, &buf, cb, cbdata, ®ex);
free(regex.pattern);
free(buf.data);
destroy_tree(n);
return rc; |
5ee56e41 |
} |