libclamav/regex_suffix.c
5ee56e41
 /*
  *  Parse a regular expression, and extract a static suffix.
  *
e1cbc270
  *  Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  *  Copyright (C) 2007-2013 Sourcefire, Inc.
5ee56e41
  *
  *  Authors: Török Edvin
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  */
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 
60d8d2c3
 #include "clamav.h"
5ee56e41
 #include "others.h"
 #include "jsparse/textbuf.h"
 #include "regex_suffix.h"
 #define MODULE "regex_suffix: "
 
 enum node_type {
288057e9
     root = 0,
     concat,
     alternate, /* | */
     optional,  /* ?, * */
     leaf,      /* a character */
     leaf_class /* character class */
                /* (x)+ is transformed into (x)*(x) */
5ee56e41
 };
 
 struct node {
288057e9
     enum node_type type; /* must be first field */
     struct node *parent;
     union {
         struct {
             struct node *left;
             struct node *right;
         } children;
         uint8_t *leaf_class_bitmap;
         uint8_t leaf_char;
     } u;
5ee56e41
 };
 
 /* --- Prototypes --*/
102cd430
 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex);
5ee56e41
 /* -----------------*/
 
dfc0c031
 static uint8_t dot_bitmap[32] = {
288057e9
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
5ee56e41
 
288057e9
 static struct node *make_node(enum node_type type, struct node *left, struct node *right)
5ee56e41
 {
288057e9
     struct node *n;
     if (type == concat) {
         if (left == NULL)
             return right;
         if (right == NULL)
             return left;
     }
     n = cli_malloc(sizeof(*n));
     if (!n) {
241e7eb1
         cli_errmsg("make_node: Unable to allocate memory for new node\n");
288057e9
         return NULL;
241e7eb1
     }
288057e9
     n->type             = type;
     n->parent           = NULL;
     n->u.children.left  = left;
     n->u.children.right = right;
     if (left)
         left->parent = n;
     if (right)
         right->parent = n;
     return n;
5ee56e41
 }
 
 static struct node *dup_node(struct node *p)
 {
288057e9
     struct node *node_left, *node_right;
     struct node *d;
5ee56e41
 
288057e9
     if (!p)
         return NULL;
     d = cli_malloc(sizeof(*d));
     if (!d) {
241e7eb1
         cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n");
288057e9
         return NULL;
241e7eb1
     }
288057e9
     d->type   = p->type;
     d->parent = NULL;
     switch (p->type) {
         case leaf:
             d->u.leaf_char = p->u.leaf_char;
             break;
         case leaf_class:
             d->u.leaf_class_bitmap = cli_malloc(32);
             if (!d->u.leaf_class_bitmap) {
241e7eb1
                 cli_errmsg("make_node: Unable to allocate memory for leaf class\n");
288057e9
                 free(d);
                 return NULL;
             }
             memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
             break;
         default:
             node_left           = dup_node(p->u.children.left);
             node_right          = dup_node(p->u.children.right);
             d->u.children.left  = node_left;
             d->u.children.right = node_right;
             if (node_left)
                 node_left->parent = d;
             if (node_right)
                 node_right->parent = d;
             break;
     }
     return d;
5ee56e41
 }
 
 static struct node *make_charclass(uint8_t *bitmap)
 {
288057e9
     struct node *v = cli_malloc(sizeof(*v));
     if (!v) {
241e7eb1
         cli_errmsg("make_charclass: Unable to allocate memory for character class\n");
288057e9
         return NULL;
241e7eb1
     }
288057e9
     v->type                = leaf_class;
     v->parent              = NULL;
     v->u.leaf_class_bitmap = bitmap;
     return v;
5ee56e41
 }
 
 static struct node *make_leaf(char c)
 {
288057e9
     struct node *v = cli_malloc(sizeof(*v));
     if (!v)
         return NULL;
     v->type        = leaf;
     v->parent      = NULL;
     v->u.leaf_char = c;
     return v;
5ee56e41
 }
 
 static void destroy_tree(struct node *n)
 {
288057e9
     if (!n)
         return;
     switch (n->type) {
         case concat:
         case alternate:
         case optional:
             destroy_tree(n->u.children.left);
             destroy_tree(n->u.children.right);
             break;
         case leaf_class:
             if (n->u.leaf_class_bitmap != dot_bitmap)
                 free(n->u.leaf_class_bitmap);
             break;
         case root:
         case leaf:
             break;
     }
     free(n);
5ee56e41
 }
 
288057e9
 static uint8_t *parse_char_class(const char *pat, size_t *pos)
5ee56e41
 {
288057e9
     unsigned char range_start = 0;
     int hasprev               = 0;
     uint8_t *bitmap           = cli_malloc(32);
     if (!bitmap) {
241e7eb1
         cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n");
288057e9
         return NULL;
241e7eb1
     }
288057e9
     if (pat[*pos] == '^') {
         memset(bitmap, 0xFF, 32); /*match chars not in brackets*/
         ++*pos;
     } else
         memset(bitmap, 0x00, 32);
     do {
         /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
         if (pat[*pos] == '-' && hasprev) {
             /* it is a range*/
             unsigned char range_end;
             unsigned int c;
             assert(range_start);
             ++*pos;
             if (pat[*pos] == '[')
                 if (pat[*pos + 1] == '.') {
                     /* collating sequence not handled */
                     free(bitmap);
                     /* we are parsing the regex for a
5ee56e41
 					 * filter, be conservative and
 					 * tell the filter that anything could
 					 * match here */
288057e9
                     while (pat[*pos] != ']') ++*pos;
                     ++*pos;
                     while (pat[*pos] != ']') ++*pos;
                     return dot_bitmap;
                 } else
                     range_end = pat[*pos];
             else
                 range_end = pat[*pos];
             for (c = range_start + 1; c <= range_end; c++)
                 bitmap[c >> 3] ^= 1 << (c & 0x7);
             hasprev = 0;
         } else if (pat[*pos] == '[' && pat[*pos] == ':') {
             /* char class */
             free(bitmap);
             while (pat[*pos] != ']') ++*pos;
             ++*pos;
             while (pat[*pos] != ']') ++*pos;
             return dot_bitmap;
         } else {
             bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7);
             range_start = pat[*pos];
             ++*pos;
             hasprev = 1;
         }
     } while (pat[*pos] != ']');
     return bitmap;
5ee56e41
 }
 
288057e9
 static struct node *parse_regex(const char *p, size_t *last)
5ee56e41
 {
288057e9
     struct node *v = NULL;
     struct node *right;
     struct node *tmp;
5ee56e41
 
288057e9
     while (p[*last] != '$' && p[*last] != '\0') {
         switch (p[*last]) {
             case '|':
                 ++*last;
                 right = parse_regex(p, last);
                 v     = make_node(alternate, v, right);
                 if (!v)
                     return NULL;
                 break;
             case '*':
             case '?':
                 v = make_node(optional, v, NULL);
                 if (!v)
                     return NULL;
                 ++*last;
                 break;
             case '+':
                 /* (x)* */
                 tmp = make_node(optional, v, NULL);
                 if (!tmp)
                     return NULL;
                 /* (x) */
                 right = dup_node(v);
50876732
                 if (!right) {
                     free(tmp);
288057e9
                     return NULL;
50876732
                 }
288057e9
                 /* (x)*(x) => (x)+ */
                 v = make_node(concat, tmp, right);
                 if (!v)
                     return NULL;
                 ++*last;
                 break;
             case '(':
                 ++*last;
                 right = parse_regex(p, last);
                 if (!right)
                     return NULL;
                 ++*last;
                 v = make_node(concat, v, right);
                 break;
             case ')':
                 return v;
             case '.':
                 right = make_charclass(dot_bitmap);
                 if (!right)
                     return NULL;
                 v = make_node(concat, v, right);
                 if (!v)
                     return NULL;
                 ++*last;
                 break;
             case '[':
                 ++*last;
                 right = make_charclass(parse_char_class(p, last));
                 if (!right)
                     return NULL;
                 v = make_node(concat, v, right);
                 if (!v)
                     return NULL;
                 ++*last;
                 break;
             case '\\':
                 /* next char is escaped, advance pointer
5ee56e41
 				 * and let fall-through handle it */
288057e9
                 ++*last;
             default:
                 right = make_leaf(p[*last]);
                 v     = make_node(concat, v, right);
                 if (!v)
                     return NULL;
                 ++*last;
                 break;
         }
     }
     return v;
5ee56e41
 }
 
288057e9
 #define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7)))
5ee56e41
 
102cd430
 static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex)
5ee56e41
 {
288057e9
     size_t i, cnt;
     while (n) {
         struct node *q = n;
         switch (n->type) {
             case root:
                 textbuffer_putc(buf, '\0');
52e85060
                 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
288057e9
                     return CL_EMEM;
102cd430
                 return CL_SUCCESS;
288057e9
             case leaf:
                 textbuffer_putc(buf, n->u.leaf_char);
                 n = n->parent;
                 break;
             case leaf_class:
                 cnt = 0;
                 for (i = 0; i < 255; i++)
                     if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
                         cnt++;
                 if (cnt > 16) {
                     textbuffer_putc(buf, '\0');
52e85060
                     if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
288057e9
                         return CL_EMEM;
102cd430
                     return CL_SUCCESS;
288057e9
                 }
                 /* handle small classes by expanding */
                 for (i = 0; i < 255; i++) {
                     if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
                         size_t pos;
                         pos = buf->pos;
                         textbuffer_putc(buf, (char)i);
52e85060
                         if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS)
288057e9
                             return CL_EMEM;
                         buf->pos = pos;
                     }
                 }
                 return 0;
             case concat:
                 if (prev != n->u.children.left) {
52e85060
                     if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
288057e9
                         return CL_EMEM;
                     /* we're done here, descend will call
5ee56e41
 					 * ascend if needed */
102cd430
                     return CL_SUCCESS;
288057e9
                 } else {
                     n = n->parent;
                 }
                 break;
             case alternate:
                 n = n->parent;
                 break;
             case optional:
                 textbuffer_putc(buf, '\0');
52e85060
                 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
288057e9
                     return CL_EMEM;
102cd430
                 return CL_SUCCESS;
288057e9
         }
         prev = q;
     }
102cd430
     return CL_SUCCESS;
5ee56e41
 }
 
102cd430
 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex)
5ee56e41
 {
288057e9
     size_t pos;
     while (n && n->type == concat) {
         n = n->u.children.right;
     }
     if (!n)
102cd430
         return CL_SUCCESS;
288057e9
     /* find out end of the regular expression,
5ee56e41
 	 * if it ends with a static pattern */
288057e9
     switch (n->type) {
         case alternate:
             /* save pos as restart point */
             pos = buf->pos;
52e85060
             if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
288057e9
                 return CL_EMEM;
             buf->pos = pos;
52e85060
             if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS)
288057e9
                 return CL_EMEM;
             buf->pos = pos;
             break;
         case optional:
             textbuffer_putc(buf, '\0');
52e85060
             if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
288057e9
                 return CL_EMEM;
102cd430
             return CL_SUCCESS;
288057e9
         case leaf:
         case leaf_class:
52e85060
             if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS)
288057e9
                 return CL_EMEM;
102cd430
             return CL_SUCCESS;
288057e9
         default:
             break;
     }
102cd430
     return CL_SUCCESS;
5ee56e41
 }
 
102cd430
 cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata)
5ee56e41
 {
288057e9
     struct regex_list regex;
     struct text_buffer buf;
     struct node root_node;
     struct node *n;
     size_t last = 0;
     int rc;
5ee56e41
 
288057e9
     assert(pattern);
5ee56e41
 
288057e9
     regex.preg = preg;
     rc         = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
     if (rc) {
         size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
         char *errbuf  = cli_malloc(buflen);
         if (errbuf) {
             cli_regerror(rc, regex.preg, errbuf, buflen);
             cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
             free(errbuf);
         } else {
             cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
         }
         return rc;
     }
     regex.nxt     = NULL;
     regex.pattern = cli_strdup(pattern);
5ee56e41
 
288057e9
     n = parse_regex(pattern, &last);
     if (!n)
         return REG_ESPACE;
     memset(&buf, 0, sizeof(buf));
     memset(&root_node, 0, sizeof(buf));
     n->parent = &root_node;
5ee56e41
 
288057e9
     rc = build_suffixtree_descend(n, &buf, cb, cbdata, &regex);
     free(regex.pattern);
     free(buf.data);
     destroy_tree(n);
     return rc;
5ee56e41
 }