libclamav/regex_list.c
bd912dd8
 /*
  *  Match a string against a list of patterns/regexes.
  *
43ecd9a1
bd912dd8
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  *
  */
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
 #ifndef CL_DEBUG
 #define NDEBUG
 #endif
 
 #ifdef CL_THREAD_SAFE
 #ifndef _REENTRANT
 #define _REENTRANT
 #endif
 #endif
 
b9a533ec
 
 /* TODO: when implementation of new version is complete, enable it in CL_EXPERIMENTAL */
 #ifdef CL_EXPERIMENTAL
 //#define USE_NEW_VERSION
 #endif
 
 #ifndef USE_NEW_VERSION
 /*this is the old version of regex_list.c
  *reason for redesign: there is only one node type that has to handle all the cases: binary search among children, alternatives list, match.
  * This design is very error-prone.*/
 
bd912dd8
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 
 #include <limits.h>
 #include <sys/types.h>
 
e061e477
 #ifdef	HAVE_REGEX_H
bd912dd8
 #include <regex.h>
e061e477
 #endif
bd912dd8
 
 
 #include "clamav.h"
 #include "others.h"
 #include "regex_list.h"
 #include "matcher-ac.h"
43ecd9a1
 #include "str.h"
bd912dd8
 
 
 /*Tree*/
 enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
ec481027
 typedef unsigned char* char_bitmap_p;
bd912dd8
 /*
  *
  * OP_CHAR: 1 character, c = character
  * complex stuff:
  * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
  * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
  * OP_DOT: single . matching any character except \n
  * OP_LEAF: this is a leaf node, reinterpret structure
  */
 struct tree_node {
ec481027
 	struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
bd912dd8
 	unsigned char c;
ec481027
 	enum token_op_t op;
bd912dd8
 	char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
 	char listend;/* no more siblings, next pointer is pointer to parent*/
 	union {
 		struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
 		char_bitmap_p* bitmap;
 		struct leaf_info*  leaf;
 	} u;
 };
 
 struct leaf_info {
 	char* info;/* what does it mean that we reached the leaf...*/
 	regex_t* preg;/* this is NULL if leaf node, and non-regex*/
 };
 
 /* Character classes */
ec481027
 static const char* std_class[] = {
 	"[:alnum:]",
 	"[:digit:]",
 	"[:punct:]",
 	"[:alpha:]",
 	"[:graph:]",
 	"[:space:]",
 	"[:blank:]",
 	"[:lower:]", 
 	"[:upper:]",
 	"[:cntrl:]",
 	"[:print:]",
 	"[:xdigit:]"
 	/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
bd912dd8
 };
 
3da4dd4c
 
bd912dd8
 #define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
3da4dd4c
 
ec481027
 /* generated by contrib/phishing/generate_tables.c */
 static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
          0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 
          0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 
          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 
          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
          0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 };
3da4dd4c
 
ec481027
 static const unsigned short int char_class[256] = {
         0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, 
         0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 
         0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
         0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
         0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 
         0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, 
         0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 
         0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
         0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
 };
bd912dd8
 
ec481027
 static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
3da4dd4c
 
bd912dd8
 /* Prototypes */
6e3332cf
 static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
bd912dd8
 static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
 static void destroy_tree(struct regex_matcher* matcher);
ec481027
 static struct tree_node* tree_root_alloc(void);
 static int build_regex_list(struct regex_matcher* matcher);
 static void stack_destroy(struct node_stack* stack);
bd912dd8
 
ec481027
 #ifndef NDEBUG
 void dump_tree(struct tree_node* root);
 #endif
bd912dd8
 
 #define MATCH_SUCCESS 0 
 #define MATCH_FAILED  -1
 
 /*
  * Call this function when an unrecoverable error has occured, (instead of exit).
  */
 static void fatal_error(struct regex_matcher* matcher)
 {
 	regex_list_done(matcher);
 	matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
 }
 
 
 /*
  * @matcher - matcher structure to use
  * @real_url - href target
  * @display_url - <a> tag contents
  * @hostOnly - if you want to match only the host part
ec481027
  * @is_whitelist - is this a lookup in whitelist?
bd912dd8
  *
  * @return - CL_SUCCESS - url doesn't match
  *         - CL_VIRUS - url matches list
  *
  * Do not send NULL pointers to this function!!
  *
  */
ec481027
 int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info,int is_whitelist)
bd912dd8
 {
ec481027
 	massert(matcher);
 	massert(real_url);
 	massert(display_url);
 	massert(info);
bd912dd8
 	if(!matcher->list_inited)
 		return 0;
ec481027
 	massert(matcher->list_built);
bd912dd8
 	{
 		size_t real_len    = strlen(real_url);
 		size_t display_len = strlen(display_url);
e9913115
 		size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
bd912dd8
 		char*  buffer = cli_malloc(buffer_len+1);
e3b67c5e
 		size_t i;
4e9ab8ed
 		int rc = 0;
 		struct cli_ac_data mdata;
bd912dd8
 
 		if(!buffer)
 			return CL_EMEM;
 
 		strncpy(buffer,real_url,real_len);
a891c2d9
 		buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
ec481027
 		if(!hostOnly || is_whitelist) {
62b2ecc7
 			strncpy(buffer+real_len+1,display_url,display_len);
e9913115
 			if(is_whitelist) 
 				buffer[buffer_len - 1] = '/';
62b2ecc7
 			buffer[buffer_len]=0;
f15a8c49
 		}
62b2ecc7
 		cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
bd912dd8
 
4e9ab8ed
 		if(hostOnly) {
 			if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
 				return rc;
 			rc = 0;
 
e3b67c5e
 			for(i = 0; i < matcher->root_hosts_cnt; i++) {
c5fdb4c8
 				/* needs to match terminating \0 too */
 				rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len+1,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL);
 				cli_ac_freedata(&mdata);
 				if(rc)
e3b67c5e
 					break;
 			}
4e9ab8ed
 		} else
e3b67c5e
 			rc = 0;
4e9ab8ed
     
6e3332cf
 		if(!rc) 
 			rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
bd912dd8
 		free(buffer);
 		if(!rc)
f74bc827
 			cli_dbgmsg("Lookup result: not in regex list\n");
 		else
 			cli_dbgmsg("Lookup result: in regex list\n");
bd912dd8
 		return rc;
 	}
 }
 
 /* node stack */
 #define NODE_STACK_INITIAL 1024
 #define NODE_STACK_GROW    4096
 /* Initialize @stack */
 static int stack_init(struct node_stack* stack)
 {
ec481027
 	massert(stack);
bd912dd8
 
 	stack->cnt = 0;
 	stack->capacity = NODE_STACK_INITIAL;
 	stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
 	if(!stack->data)
 		return CL_EMEM;
 	else
 		return CL_SUCCESS;
 }
 
 /* Reset @stack pointer, but don't realloc */
 static void stack_reset(struct node_stack* stack)
 {
ec481027
 	massert(stack);
bd912dd8
 
 	stack->cnt = 0;
 }
 
 /* Push @node on @stack, growing it if necessarry */
43ecd9a1
 static int stack_push(struct node_stack* stack,struct tree_node* node)
bd912dd8
 {
ec481027
 	massert(stack);
 	massert(stack->data);
bd912dd8
 
 	if(stack->cnt == stack->capacity) {
 		stack->capacity += NODE_STACK_GROW;
84fd5a61
 		stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data));
bd912dd8
 		if(!stack->data)
 			return CL_EMEM;
 	}
 	stack->data[stack->cnt++] = node;
 	return CL_SUCCESS;
 }
 
 /* Pops node from @stack, doesn't realloc */
43ecd9a1
 static struct tree_node* stack_pop(struct node_stack* stack)
bd912dd8
 {
ec481027
 	massert(stack);
 	massert(stack->data);
 	massert(stack->cnt);/*don't pop from empty stack */
bd912dd8
 
 	return stack->cnt ? stack->data[--stack->cnt] : NULL;
 }
 
 /* Initialization & loading */
 
 /* Initializes @matcher, allocating necesarry substructures */
 int init_regex_list(struct regex_matcher* matcher)
 {
e3b67c5e
 	int rc;
ec481027
 
 	massert(matcher);
bd912dd8
 	matcher->list_inited = 0;
e3b67c5e
  	matcher->root_hosts_cnt = 0;
  	matcher->root_hosts = NULL;
  	matcher->root_hosts_cnt = 0;
bd912dd8
 
 	matcher->root_regex = tree_root_alloc();
 	if(!matcher->root_regex) {
 		return CL_EMEM;
 	}
 
6e3332cf
 	matcher->root_regex_hostonly = tree_root_alloc();
 	if(!matcher->root_regex_hostonly) {
 		free(matcher->root_regex);
 		return CL_EMEM;
 	}
 
e3b67c5e
 	if(( rc = stack_init(&matcher->node_stack) )) {
6e3332cf
 		free(matcher->root_regex_hostonly);
e3b67c5e
 		free(matcher->root_regex);
 		return rc;
 	}
 	if(( rc = stack_init(&matcher->node_stack_alt) )) {
6e3332cf
 		free(matcher->root_regex_hostonly);
e3b67c5e
 		free(matcher->root_regex);
 		stack_destroy(&matcher->node_stack);
 		return rc;
 	}
bd912dd8
 
 	matcher->list_inited=1;
e3b67c5e
 	matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
bd912dd8
 	matcher->list_loaded=0;
 
 	return CL_SUCCESS;
 }
 
 /* inserts @pattern into @root, using ac-matcher 
  * although the name might be confusing, @pattern is not a regex!*/
 static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
 {
ec481027
        int ret;
bd912dd8
        struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
ec481027
        size_t len,i;
bd912dd8
 
        if(!new)
 	       return CL_EMEM;
ec481027
        massert(root);
        massert(pattern);
bd912dd8
 
c5fdb4c8
        len = strlen(pattern)+1;
        /* need to match \0 too, so we are sure
 	* matches only happen at end of string */
bd912dd8
        new->type = 0;
        new->sigid = 0;
        new->parts = 0;
        new->partno = 0;
        new->mindist = 0;
        new->maxdist = 0;
        new->offset = 0;
        new->target = 0;
        new->length = len;
        if(new->length > root->maxpatlen)
                root->maxpatlen = new->length;
 
        new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
        if(!new->pattern) {
 	       free(new);
 	       return CL_EMEM;
        }
f15a8c49
        for(i=0;i<len;i++)
 	       new->pattern[i]=pattern[i];/*new->pattern is short int* */
bd912dd8
 
43ecd9a1
        new->virname = cli_strdup(info);
bd912dd8
        if((ret = cli_ac_addpatt(root,new))) {
 	       free(new->virname);
                free(new->pattern);
                free(new);
                return ret;
        }
        return CL_SUCCESS;
 }
 
9db68157
 static int functionality_level_check(char* line)
50c27591
 {
 	char* ptmin;
 	char* ptmax;
 	size_t j;
 
 	ptmin = strrchr(line,':');
 	if(!ptmin) 
 		return CL_SUCCESS;
 	
 	ptmin++;
 
 	ptmax = strchr(ptmin,'-');
 	if(!ptmax) 
 		return CL_SUCCESS;/* there is no functionality level specified, so we're ok */
 	else {
9db68157
 		size_t min, max;
50c27591
 		ptmax++;
9db68157
 		for(j=0;j+ptmin+1 < ptmax;j++)
50c27591
 			if(!isdigit(ptmin[j])) 
 				return CL_SUCCESS;/* not numbers, not functionality level */
 		for(j=0;j<strlen(ptmax);j++)
 			if(!isdigit(ptmax[j])) 
 				return CL_SUCCESS;/* see above */
 		ptmax[-1]='\0';
 		min = atoi(ptmin);
 		if(strlen(ptmax)==0)
  			max = INT_MAX; 		
 		else
 			max = atoi(ptmax);
 
 		if(min > cl_retflevel()) {
43ecd9a1
 			cli_dbgmsg("regex list line %s not loaded (required f-level: %u)\n",line,(unsigned int)min);
50c27591
 			return CL_EMALFDB; 
 		}
 
 		if(max < cl_retflevel()) 
 			return CL_EMALFDB;
 		ptmin[-1]='\0';
 		return CL_SUCCESS;
 	}		
 }
 
 
bd912dd8
 /* Load patterns/regexes from file */
ec481027
 int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist)
bd912dd8
 {
 	int rc,line=0;
 	char buffer[FILEBUFF];
 
ec481027
 	massert(matcher);
 	massert(fd);
bd912dd8
 
 	if(matcher->list_inited==-1)
ec481027
 		return CL_EMALFDB; /* already failed to load */
e3b67c5e
 /*	if(matcher->list_loaded) {
bd912dd8
 		cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
ec481027
 		return CL_SUCCESS;
e3b67c5e
 	}*/
bd912dd8
 	if(!fd) {
 		cli_errmsg("Unable to load regex list (null file)\n");
ec481027
 		return CL_EIO;
bd912dd8
 	}
 
 	cli_dbgmsg("Loading regex_list\n");
 	if(!matcher->list_inited) {
ec481027
 		rc = init_regex_list(matcher);
bd912dd8
 		if (!matcher->list_inited) {
 			cli_errmsg("Regex list failed to initialize!\n");
 			fatal_error(matcher);
ec481027
 			return rc;
bd912dd8
 		}
 		/*atexit(regex_list_done); TODO: destroy this in manager.c */
 	}
 	/*
 	 * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
 	 * Multiple lines of form, (empty lines are skipped):
  	 * Flags RealURL DisplayedURL
 	 * Where:
6e3332cf
 	 * Flags: 
 	 *
 	 * .pdb files:
 	 * R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing 
bd912dd8
 	 * flags that should be filtered.
 	 * [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
6e3332cf
 	 * 
 	 * .wdb files:
 	 * X - full URL regex 
 	 * Y - host-only regex
 	 * M - host simple pattern
bd912dd8
 	 *
 	 * If a line in the file doesn't conform to this format, loading fails
 	 * 
 	 */
 	while(fgets(buffer,FILEBUFF,fd)) {
 		char* pattern;
 		char* flags;
 		cli_chomp(buffer);
 		if(!*buffer)
 			continue;/* skip empty lines */
50c27591
 
 		if(functionality_level_check(buffer)) 
 			continue;
 
 		line++;
a891c2d9
 		pattern = strchr(buffer,':');
bd912dd8
 		if(!pattern) {
 			cli_errmsg("Malformed regex list line %d\n",line);
 			fatal_error(matcher);
 			return CL_EMALFDB;
 		}
 		pattern[0]='\0';
ec481027
 		flags = buffer+1;
bd912dd8
 		pattern++;
e9913115
 
 		if(is_whitelist) {
 			const size_t pattern_len = strlen(pattern);
 			if(pattern_len < FILEBUFF) {
 				pattern[pattern_len] = '/';
 				pattern[pattern_len+1] = '\0';
 			}
 			else {
 				cli_errmsg("Overlong regex line %d\n",line);
 				fatal_error(matcher);
 				return CL_EMALFDB;
 			}
 		}
 
6e3332cf
 		if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/
 			if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') ))
bd912dd8
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 		}
e3b67c5e
 		else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
f74bc827
 			struct cli_matcher* root;
e3b67c5e
  			if(matcher->list_built) {
  				struct cli_matcher* old_hosts = matcher->root_hosts;
  				matcher->root_hosts_cnt++;
  
5c98f465
  				matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
e3b67c5e
  				if(!matcher->root_hosts) {
  					matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
  					return CL_EMEM;
43ecd9a1
 				} 
f74bc827
 
 				root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
  				memset(root, 0, sizeof(struct cli_matcher));
 
 				cli_dbgmsg("regex_list: Initialising AC pattern matcher\n");
 				if((rc = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
 					/* no need to free previously allocated memory here */
 					cli_errmsg("regex_list: Can't initialise AC pattern matcher\n");
 					return rc;
 				}
e3b67c5e
  				matcher->list_built = 0;
  			}
f74bc827
 			else {
 				root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
 			}
  			if(( rc = add_regex_list_element(root,pattern,flags) ))
bd912dd8
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
 		}
 		else {
ec481027
 			return CL_EMALFDB;
 			/* this is useless, we have host, and regex matches
bd912dd8
 			if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
ec481027
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
bd912dd8
 		}
 	}
 	matcher->list_loaded = 1;
e3b67c5e
 	if(( rc = build_regex_list(matcher) ))
 		return rc;
bd912dd8
 
 #ifndef NDEBUG
 /*			dump_tree(matcher->root_regex);*/
 #endif
 	if(!matcher->list_built) {
 		cli_errmsg("Regex list not loaded: build failed!\n");
 		fatal_error(matcher);
 		return CL_EMALFDB;
 	}
 	regex_list_cleanup(matcher);
 	return CL_SUCCESS;
 }
 
 
 static struct tree_node ** tree_node_get_children(const struct tree_node* node)
 {
 	return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
 }
ec481027
 
bd912dd8
 /* Build the matcher list */
 static int build_regex_list(struct regex_matcher* matcher)
 {
e3b67c5e
 	int rc;
bd912dd8
 	if(!matcher->list_inited || !matcher->list_loaded) {
 		cli_errmsg("Regex list not loaded!\n");
 		return -1;/*TODO: better error code */
 	}
 	cli_dbgmsg("Building regex list\n");
03ee6b1e
 	if(matcher->root_hosts)
 		if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
e3b67c5e
  			return rc;
bd912dd8
 	matcher->list_built=1;
 
 	return CL_SUCCESS;
 }
 
 /* Done with this matcher, free resources */
 void regex_list_done(struct regex_matcher* matcher)
 {
ec481027
 	massert(matcher);
bd912dd8
 
 	regex_list_cleanup(matcher);
 	if(matcher->list_loaded) {
15b08fbb
 		if(matcher->root_hosts) {
e3b67c5e
 			size_t i;
f74bc827
 			for(i=0;i<matcher->root_hosts_cnt;i++) 
e3b67c5e
 				cli_ac_free(&matcher->root_hosts[i]);
15b08fbb
 			free(matcher->root_hosts);
 			matcher->root_hosts=NULL;
 		}
bd912dd8
 
e3b67c5e
 		matcher->root_hosts_cnt=0;
bd912dd8
 		matcher->list_built=0;
 		destroy_tree(matcher);
 		matcher->list_loaded=0;
 	}
 	if(matcher->list_inited) {
 		matcher->list_inited=0;
 	}
 	stack_destroy(&matcher->node_stack);
 	stack_destroy(&matcher->node_stack_alt);
 }
 
 /* Tree matcher algorithm */
 struct token_t
 {
 	union {
ec481027
 		const unsigned char* start;
 		char_bitmap_p  bitmap;
 		unsigned char  c;
bd912dd8
 	} u;
ec481027
 	size_t len;
 	char   type;
bd912dd8
 };
 
 enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
 
 static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
 {
ec481027
 	massert(pat);
 	massert(token);
bd912dd8
 
 	switch(*pat) {
 		case '\\':
 			token->type=TOKEN_CHAR;
ec481027
 			token->u.c = *(++pat);
 			if(islower(token->u.c)) {
bd912dd8
 				/* handle \n, \t, etc. */
2e7c0aa1
 				char fmt[3] = {'\\', '\0', '\0'};
bd912dd8
 				char c;
2e7c0aa1
 
ec481027
 				fmt[1] = token->u.c;
 				if(snprintf(&c,1,fmt)!=1) {
bd912dd8
 					token->type=TOKEN_REGEX;
ec481027
 					token->u.start = pat;
 				}
5cf8cf16
 				else
ec481027
 					token->u.c=c;
bd912dd8
 			}
2e7c0aa1
 			token->len = 1;
bd912dd8
 			break;
 		case '|':
 			token->type=TOKEN_ALT;
 			break;
 		case '*':
 		case '+':
 		case '?':
 		case '{':
 		case '}':
 			token->type=TOKEN_REGEX;
 			break;
 		case '[':
 			{
 			/*TODO: implement*/
 			/*see if it is something simple like a list of characters, a range, or negated ...*/
 			const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
 			unsigned char range_start=0;
 			int hasprev = 0;
 			char_bitmap_p bitmap = cli_malloc(32);
 			if(!bitmap)
 				return NULL;
 			if (*pat=='^') {
 				memset(bitmap,0xFF,32);/*match chars not in brackets*/
 				pat++;
 			}
 			else
 				memset(bitmap,0x00,32);
 			do {
 				/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
 				if (*pat=='-' && hasprev) {
 					/* it is a range*/
 					unsigned char range_end;
 					unsigned int c;
ec481027
 					massert(range_start);
bd912dd8
 					pat++;
 					if (pat[0]=='[')
 						if (pat[1]=='.') {
 							if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
 								range_end = '-';
 							else {
 								/* this is getting complicated, bail out */
 								cli_warnmsg("confused about collating sequences in regex,bailing out");
 								pat=old;
 								token->type=TOKEN_REGEX;
 								break;
 							}
 						}
 						else 
 							range_end = *pat;
 					else
 						range_end = *pat;
 					for(c=range_start+1;c<=range_end;c++)
 						bitmap[c>>3] ^= 1<<(c&0x7);
 					hasprev = 0;
 				}
 				else if (pat[0]=='[' && pat[1]==':') {
 							const unsigned char* end;
 							int len,found=-1;
 							size_t i;
 
 							pat+=2;
 							end=(unsigned char*)strstr((const char*)pat,":]");
 							if(!end) {
 								cli_warnmsg("confused about std char class syntax regex,bailing out");
 								pat=old;
 								token->type=TOKEN_REGEX;
 								break;
 							}
 
 							len = end-pat;
 							for(i=0;i<std_class_cnt;i++)
ec481027
 								if(!strncmp((const char*)pat,std_class[i],len)) {
bd912dd8
 									found=i;
 									break;
 								}
 							if(found!=-1) {
 								for(i=0;i<256;i++)
 									if(char_class[i]&(1<<found))
 										bitmap[i>>3] ^= 1<<(i&0x7);
 							}
 							else {
 								/*unknown class*/
 								cli_warnmsg("confused about regex bracket expression, bailing out");
 								pat=old;
 								token->type=TOKEN_REGEX;
 								break;
 							}
 						}
 				else {
 					bitmap[*pat>>3] ^= 1<<(*pat&0x7);
 					pat++;
 					range_start = *pat;
 					hasprev = 1;
 				}
 			} while(*pat!=']');
 			/*TODO: see if this bitmap already exists, then reuse*/			
 			token->type = TOKEN_BRACKET;
 			token->u.bitmap = bitmap;
 			break;
 			}
 		case ']':
ec481027
 			massert(0 && "Encountered ] without matching [");
bd912dd8
 			/* bad state */
 			break;
 		case '.':
 			token->type=TOKEN_DOT;
 			break;
 		case '(':
 			token->type=TOKEN_PAR_OPEN;
 			break;
 		case ')':
 			token->type=TOKEN_PAR_CLOSE;
 			break;
 		default:
 			token->type=TOKEN_CHAR;
ec481027
 			token->u.c = *pat;
bd912dd8
 			token->len=1;
 			break;
 	}
 	return ++pat;
 }
 
 #define INITIAL_ALT_STACK 10
 #define ALT_STACK_GROW 20
 
 static const unsigned char* find_regex_start(const unsigned char* pat)
 {
 	struct token_t token;
 	/*TODO: find where the regex part begins, for ex:
 	 * abcd+, regex begins at 'd'
 	 * */
 	const unsigned char* last=NULL;
 	const unsigned char* tmp=NULL;
 	const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
 	size_t altpositions_capacity = INITIAL_ALT_STACK;
 	size_t altpositions_cnt = 0;
 	char lasttype = -1;
 	if(!altpositions)
 		return NULL;
ec481027
 	massert(pat);
bd912dd8
 
 	/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
 	 * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
 	 * back up to the last known good position
 	 * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
 	 * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
 	 * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
 	 * */
 	do {	
 		tmp = pat;
 		pat = getNextToken(pat,&token);
 		if(token.type!=TOKEN_REGEX) {
 			last = tmp;
 			lasttype = token.type;
15b08fbb
 			if(token.type==TOKEN_BRACKET && token.u.bitmap)
bd912dd8
 				free(token.u.bitmap);
 			if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
 				/* save this position on stack, succesfully parsed till here*/
 				if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
 					/* encountered another alternate (|) operator, override previous | position stored */
 					altpositions[altpositions_cnt-1]=last;
 				else {
 					altpositions[altpositions_cnt++] = last;
 					if(altpositions_cnt == altpositions_capacity) {
 						altpositions_capacity += ALT_STACK_GROW;
84fd5a61
 						altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
bd912dd8
 						if(!altpositions)
 							return NULL;
 					}
 				}
 			} else if (lasttype==TOKEN_PAR_CLOSE) {
 				/* remove last stored position from stack, succesfully this last group */
 				altpositions_cnt--;
ec481027
 				massert(altpositions_cnt>0);
bd912dd8
 			}
 		}
 		else {
 			if(altpositions_cnt)
 				last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
 			/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
 		}
 	} while(*pat && token.type!=TOKEN_REGEX);
 	free(altpositions);
 	return *pat ? last : last+1;
 }
 
 static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
 {
 	struct tree_node* node = cli_malloc(sizeof(*node));
 	if(node) {
 		node->alternatives=0;
 		node->next=next;
 		node->listend=listend;
 		node->u.children=NULL;
 	}
 	return node;
 }
 
 static struct tree_node* tree_root_alloc(void)
 {
 	struct tree_node* root=tree_node_alloc(NULL,1);
 	if(root) {
 		root->op=OP_ROOT;
 		root->c=0;
 		root->next=NULL;
 		root->listend=1;
 	}
 	return root;
 }
ec481027
 
43ecd9a1
 static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
bd912dd8
 {
 	int right;
 	struct tree_node **children;
ec481027
 	massert(node);
 	massert(left);
bd912dd8
 
 	children = tree_node_get_children(node);
 	right = node->alternatives-1;
 	*left = 0;
 	if(!node->alternatives)
 		return NULL;
ec481027
 	massert(children);
bd912dd8
 	while(*left<=right) {
 		int mid  = *left+(right-*left)/2;
 		if(children[mid]->c == csearch)
 			return children[mid]; 
 		else if(children[mid]->c < csearch)
 			*left=mid+1;
 		else
 			right=mid-1;
 	}
 	return NULL;
 }
 
43ecd9a1
 static struct tree_node* tree_get_next(struct tree_node* node)
bd912dd8
 {
 	struct tree_node** children;
ec481027
 	massert(node);
bd912dd8
 	children = tree_node_get_children(node);
 
 	if(!node->alternatives && children && children[0])
 		return children[0];
 	else if(node->alternatives<=1)
 		return node;
 	else
 		return children[0]->next;
 }
 
43ecd9a1
 static size_t tree_node_get_array_size(const struct tree_node* node)
bd912dd8
 {
ec481027
 	massert(node);
bd912dd8
 	/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
 	return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
 }
 
43ecd9a1
 static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
bd912dd8
 {
 	struct tree_node* new, *alt = tree_get_next(node);
ec481027
 	struct tree_node **children;
bd912dd8
 	node->alternatives++;
84fd5a61
 	node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
bd912dd8
 	if(!node->u.children)
 		return NULL;
 
ec481027
 	children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
 
bd912dd8
 	new = tree_node_alloc(alt , node == alt );
 	if(new) {
 		new->op=OP_CHAR;
 		new->c=c;
 	}
 
 	if(node->alternatives-left-1>0)
ec481027
 			memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
 	children[left] = new;	
bd912dd8
 
 	return new;
 }
 
43ecd9a1
 static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
bd912dd8
 {
 	struct tree_node **children;
ec481027
 	massert(node);
 	massert(new);
bd912dd8
 
 	children = tree_node_get_children(node);
 	if(node->alternatives) {
ec481027
 		massert(children);
bd912dd8
 	       	if(children[0]->next == node) {
 			int i;
 			new->listend = 1;
 			for(i=0;i<node->alternatives;i++) {
 				children[i]->next = new;
 				children[i]->listend = 0;
 			}
 		}
 		else {
 			struct tree_node* p;
 			for(p = children[0]->next ; p->next != node ; p = p->next)
ec481027
 				massert(!p->listend);
bd912dd8
 			new->listend = 1;
 			p->listend = 0;
 			p->next = new;
 		}
 	}
 	else {
5cf8cf16
 		int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
bd912dd8
 		if(node->u.children)
5cf8cf16
 			if(node->u.children[idx]) {
 				node = node->u.children[idx];
 				while(node->next && !node->listend)
 					node = node->next;
 				node->listend = 0;
d5b1e62a
 				if(new->next == node) {
 					new->next = node->next;
 				}
5cf8cf16
 				node->next = new;
 				new->listend=1;
c311ffc3
 				return;
5cf8cf16
 			}
84fd5a61
 		node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
5cf8cf16
 		if(node->u.children) {
 			node->u.children[idx] = new;
 		}
bd912dd8
 	}
 }
 
43ecd9a1
 static unsigned char char_getclass(const unsigned char* bitmap)
bd912dd8
 {
 	size_t i;
ec481027
 	massert(bitmap);
bd912dd8
 
 	for(i=0;i<std_class_cnt;i++)
 		if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
 			return i;
 	return std_class_cnt;
 }
 
 static void stack_destroy(struct node_stack* stack)
 {
ec481027
 	massert(stack);
bd912dd8
 	if(stack->data)
 		free(stack->data);
 	stack->data = NULL;
 	stack->capacity = 0;
 }
 
 /* call this after whitelist load is complete, and the tree is no longer going to be modified */
 void regex_list_cleanup(struct regex_matcher* matcher)
 {
ec481027
 	massert(matcher);
bd912dd8
 
 	stack_destroy(&matcher->node_stack);
 	stack_destroy(&matcher->node_stack_alt);
 	stack_init(&matcher->node_stack);
 	stack_init(&matcher->node_stack_alt);
 }
 
 int is_regex_ok(struct regex_matcher* matcher)
 {
ec481027
 	massert(matcher);
bd912dd8
 	return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
 }
 
 /* returns 0 on success, regexec error code otherwise */						
6e3332cf
 static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
bd912dd8
 {
 	int bol=1;
 	const unsigned char* pat_end = find_regex_start(pat);
 	struct token_t token;
 	struct tree_node* node;
 	
ec481027
 	massert(matcher);
bd912dd8
 
6e3332cf
 	node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
bd912dd8
 
 	stack_reset(&matcher->node_stack);
 	stack_reset(&matcher->node_stack_alt);
 	stack_push(&matcher->node_stack,node);
 
 	for(;node->op!=OP_LEAF;){
 		if(pat<pat_end)
 			pat  = getNextToken(pat,&token);
 		else if(*pat) {
 			token.type = TOKEN_REGEX;
 			token.u.start=pat;
 		}
 		else
 			token.type = TOKEN_DONE;
 
 		switch(token.type) {
 			case TOKEN_CHAR: 
 				{
 					/* search for char in tree */
 					int left;
ec481027
 					struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
bd912dd8
 					if(newnode)
 						node = newnode;
 					else {
 						/* not found, insert it */
ec481027
 						node = tree_node_char_insert(node,token.u.c,left);
bd912dd8
 					}
 					break;
 				}
 
 			case TOKEN_PAR_OPEN:
 				stack_push(&matcher->node_stack_alt,NULL);/* marker */
 				stack_push(&matcher->node_stack,node);
 				break;
 
 			case TOKEN_PAR_CLOSE: {
 						      /*TODO: test this!!!*/
 						      struct tree_node* node_alt = node;
 						      node = tree_node_alloc(NULL,1);
 						      node->op=OP_PARCLOSE;
 						      node->c=0;
 						      node->listend=1;
 						      tree_node_insert_nonbin(node_alt,node);
 						      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
 							      tree_node_insert_nonbin(node_alt,node);
 						      }
 				      		      stack_pop(&matcher->node_stack);					      
 		      				      break;
 					      }
 
 			case TOKEN_ALT:
 				stack_push(&matcher->node_stack_alt,node);
 				node = stack_pop(&matcher->node_stack);
 				stack_push(&matcher->node_stack,node);
 				break;
 
 			case TOKEN_BRACKET:
 				{
 					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
ec481027
 					unsigned char charclass = char_getclass(token.u.bitmap);
bd912dd8
 					if(charclass == std_class_cnt) {/*not a std char class*/
 						new->op = OP_CUSTOMCLASS;
 						new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
15b08fbb
 						if(!new->u.children)
 							return CL_EMEM;
bd912dd8
 						new->u.bitmap[0] = token.u.bitmap;
 						new->u.bitmap[1] = NULL;
 						tree_node_insert_nonbin(node,new);
 						node = new;
 					}
 					else {
 						new->op = OP_STDCLASS;
 						new->c = charclass;
 						tree_node_insert_nonbin(node,new);
 						node=new;
 					}
 					break;
 				}
 
 			case TOKEN_DOT:
 				{
 					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
 					new->op = OP_DOT;
 					tree_node_insert_nonbin(node,new);
 					node=new;
 					break;
 				}
 
 			case TOKEN_REGEX:
 			case TOKEN_DONE: {
 						 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
15b08fbb
 						 if(!leaf)
 							 return CL_EMEM;
43ecd9a1
 						 leaf->info = cli_strdup(info);
bd912dd8
 						 if(token.type==TOKEN_REGEX) {
 							 int rc;
 							 struct tree_node* new;
 							 regex_t* preg;
 							 preg=cli_malloc(sizeof(*preg));
15b08fbb
 							 if(!preg)
 								 return CL_EMEM;
5cf8cf16
 							 rc = regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
bd912dd8
 							 leaf->preg=preg;
 							 if(rc)
 								 return rc;
 							 new=cli_malloc(sizeof(*new));
15b08fbb
 							 if(!new)
 								 return CL_EMEM;
bd912dd8
 							 new->op=OP_LEAF;
 							 new->next=node;
 							 new->alternatives=0;
 							 new->u.leaf=leaf;
 							 new->listend=1;
 							 tree_node_insert_nonbin(node,new);
 						 }
 						 else {
 							 leaf->preg=NULL;
 							 node->alternatives=0;
 							 node->u.leaf=leaf;
 							 node->op=OP_LEAF;
 						 }
 						 return 0;
 					 }
 		}
 
 		bol=0;
 	}
 	return 0;
 }
 
 /* c has to be unsigned char here!! */
 static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
 {
 	struct tree_node** children;
 	int rc;
 
ec481027
 	massert(node);
 	massert(c);
 	massert(info);
bd912dd8
 
f15a8c49
 	if(!node->u.children)
 		return MATCH_FAILED;/* tree empty */
bd912dd8
 	*info = NULL;
 	len++;
 	c--;
 	for(;;) {
ec481027
 		massert(node);
bd912dd8
 		children = node->u.children;
 		switch(node->op) {
 			case OP_ROOT:
 				rc=1;
 				break;
 			case OP_PARCLOSE:
 				/*this isn't a real character, so don't move*/
 				c--;
 				len++;
 				rc=1;
 				break;
 			case OP_CHAR:
ec481027
 				massert(*c==node->c && "We know this has to match");
bd912dd8
 				rc = 1;/* *c==node->c;- we know it has matched */
 				break;
 			case OP_DOT:	
 				rc = *c!='\n';
 				break;
 			case OP_STDCLASS:
 				rc = char_class[*c]&(node->c);
 				break;
 			case OP_CUSTOMCLASS:
 			{
 				char_bitmap_p bitmap;
ec481027
 				massert(children);
bd912dd8
 				bitmap = (char_bitmap_p)node->u.bitmap[0];
 				children++;
 				rc = bitmap[*c>>3]&(1<<(*c&0x7));
 				break;
 			}
 			case OP_LEAF:
 			{
 				const struct leaf_info* leaf = node->u.leaf;
 				/*isleaf = 1;*/
 				if(leaf->preg) {
 					rc = !regexec(leaf->preg,(const char*)c,0,NULL,0);
 				}
 				else  {
ec481027
 					massert(*c==node->c && "We know this has to match[2]");
bd912dd8
 					rc = 1;
 				}
 				if(rc) {
 					*info = leaf->info;
 					return MATCH_SUCCESS;
 				}
 				break;
 			}
 			default:
 				/* impossible */
 				cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
 				exit(1);
 		}
 		len--;
 		if(!len) rc=0;
 		c++;
 		if(rc) {
 			const char csearch = *c;
 			int left = 0,right = node->alternatives-1;
 			int mid;
 			/*matched so far, go deeper*/
 			/*do a binary search between children */
ec481027
 			massert(children);
bd912dd8
 			while(left<=right) {
 				mid  = left+(right-left)/2;
 				if (children[mid]->c == csearch)
 					break;
 				else if(children[mid]->c < csearch)
 					left=mid+1;
 				else
 					right=mid-1;
 			}
 			if(left<=right) {
 				node = children[mid];
ec481027
 				massert(node);
bd912dd8
 			}
 			else {
 				if(node->alternatives) {
 					if(!children[0]->listend) {
 						node = children[0];
 						c++;
 						len--;
 					}
 					while(node && node->listend) {
 						node = node->next;/* climb up */
 						c--;
 						len++;
 					}
 					if(!node || !node->next) 
 						return MATCH_FAILED;/* reached root node */
 					node=node->next;
 					c--;
 					len++;
 				}
 				else if(node->u.children) {
 					struct tree_node* rewrite_next = NULL;
 					if(node->op==OP_PARCLOSE) 
 						rewrite_next = node;
 					node = children[0];
ec481027
 					massert(node);
 					massert(node->op!=OP_CHAR);
bd912dd8
 					if(rewrite_next)
 						node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
 									     we need to know 
 									     from which one we came, so we can find out way back
 									     should we fail to match somewhere deeper*/
 				}
 			}
 		}
 		else {
 			/* this node didn't match, try sibling, or parent (if no more siblings) */
 			while(node && node->listend) {
 				node = node->next;/* sibling of parent */
 				c--;
 				len++;
 			}
 			if(!node || !node->next) /* reached root node, it has no next */
 				return MATCH_FAILED;
5cf8cf16
 			else {
 				c--;
 				len++;
 				node=node->next;
 			}
bd912dd8
 		}
 	}
 	return MATCH_FAILED;
 }
 
 /* push node on stack, only if it isn't there already */
43ecd9a1
 static void stack_push_once(struct node_stack* stack,struct tree_node* node)
bd912dd8
 {
 	size_t i;
ec481027
 	massert(stack);
 	massert(node);
bd912dd8
 
 	for(i=0;i < stack->cnt;i++)
 		if(stack->data[i]==node)
 			return;
 	stack_push(stack,node);
 }
 
 static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
 {
 	struct tree_node **children;
ec481027
 	massert(matcher);
 	massert(node);
bd912dd8
 
 	children = tree_node_get_children(node);
 	if(node->op==OP_LEAF) {
 		struct leaf_info* leaf = node->u.leaf;
 		if(node->next && !node->listend)
 			destroy_tree_internal(matcher,node->next);
 		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
 		stack_push_once(&matcher->node_stack,node);
 		if(leaf->preg) {
 			regfree(leaf->preg);
 			free(leaf->preg);
 			leaf->preg=NULL;
 		}
 		if(leaf->info) {
 			free(leaf->info);
 			leaf->info=NULL;
 		}
 	/*	return;*/
 	}
 	if(node->alternatives) {
 		int i;
 		struct tree_node* p;
ec481027
 		massert(children);
bd912dd8
 		p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
 		for(i=0;i<node->alternatives;i++)
 			destroy_tree_internal(matcher,children[i]);
 		if(p && p!=node)
 			destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
 	}
 	else {
 		if(children) {
 			if(children[0])
 				destroy_tree_internal(matcher,children[0]);		
 		}
 	}
5cf8cf16
 	if(node->op!=OP_LEAF && node->next && !node->listend)
bd912dd8
 		destroy_tree_internal(matcher,node->next);
 	if(node->u.children)
 		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
 	if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
 		free(node->u.children[0]);
 		node->u.children[0]=NULL;
 	}
 	stack_push_once(&matcher->node_stack,node);
 }
 
 static void destroy_tree(struct regex_matcher* matcher)
 {
 	/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
 	 * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
 	 * (and push to stack only if it doesn't contain it already*/
ec481027
 	massert(matcher);
bd912dd8
 
 	stack_reset(&matcher->node_stack);
 	destroy_tree_internal(matcher,matcher->root_regex);
6e3332cf
 	destroy_tree_internal(matcher,matcher->root_regex_hostonly);
bd912dd8
 	while (matcher->node_stack.cnt) {
 		struct tree_node* node = stack_pop(&matcher->node_stack);
15b08fbb
 		if(node)
 			free(node);
bd912dd8
 	}
 }
 #ifndef NDEBUG
 static void dump_node(struct tree_node* node)
 {
 	int i;
 	struct tree_node* p,**children;
ec481027
 	massert(node);
bd912dd8
 	if(node->op==OP_LEAF) {
 		if(node->u.leaf->preg)
 			printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
 		else
 			printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
 		if(node->next && !node->listend) {
 			printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
 			dump_node(node->next);
 		}
 		return;
 	}
 	printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
 	if(node->next)
 		printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
 	printf("n%p -> {",(void*)node);/*using address of node as id*/
 	children = tree_node_get_children(node);
 	if(node->alternatives)
ec481027
 		massert(children);
bd912dd8
 	for(i=0;i<node->alternatives;i++)
 		printf("n%p ",(void*)children[i]);
 	if(node->alternatives && children[0]->op!=OP_LEAF)
 		for(p=children[0]->next;p!=node;p=p->next)
 		{
ec481027
 			massert(p);
bd912dd8
 			printf("n%p ",(void*)p);
 			if(p->op==OP_LEAF || p->listend)
 				break;
 		}
 	if(!node->alternatives && children && children[0])
 		printf("n%p ",(void*)children[0]);
 	printf("};\n");
 	printf("{rank=same;");
 	for(i=0;i<node->alternatives;i++)
 		printf("n%p ",(void*)node->u.children[i]);
 	if(node->alternatives && children[0]->op!=OP_LEAF)
 		for(p=children[0]->next;p!=node;p=p->next) 
 		{
 			printf("n%p ",(void*)p);	
 			if(p->op==OP_LEAF || p->listend)
 				break;
 		}
 	if(!node->alternatives && children && children[0])
 		printf("n%p ",(void*)children[0]);
 	printf("};\n");
 	for(i=0;i<node->alternatives;i++)
 		dump_node(children[i]);
 	if(node->alternatives && children[0]->op!=OP_LEAF)
 		for(p=children[0]->next;p!=node;p=p->next)
 		{
 			dump_node(p);
 			if(p->op==OP_LEAF || p->listend)
 				break;
 		}
 	if(!node->alternatives && children && children[0])
 		dump_node(children[0]);
 }
 
 void dump_tree(struct tree_node* root)
 {
 	/*use dot/dotty from graphviz to view it*/
ec481027
 	massert(root);
bd912dd8
 	printf("digraph tree {\n");
 	dump_node(root);
 	printf("}\n");
 }
 #endif
 
b9a533ec
 
 #else
 /*------------------------New version of regex_list.c------------------------*/
 
 /* Regex_list.c: 
  * A scalable, trie-based implementation for matching against 
  * a list of regular expressions.
  *
  * A trivial way to implement matching against a list of regular expressions 
  * would have been to construct a single regular expression, by concatenating 
  * the list with the alternate (|) operator.
  * BUT a usual DFA implementation of regular expression matching (eg.: GNU libc)
  * leads to "state explosion" when there are many (5000+) alternate (|) operators.
  * This is the reason for using a trie-based implementation.
  *
  *
  * Design considerations:
  *
  * Recursive call points: there are situations when match has to be retried on a different sub-trie, or with a different repeat count.
  * Alternate operators, and repeat/range operators (+,*,{}) are recursiv call points. When a failure is encountered during a match,
  * the function simply returns from the recursive call, and ends up at a failure point (recursive call point).
  *
  * "go to parent" below actually means, return from recursive call.
  *
dbf3c4d9
  * fail_action: we need to return to closest failure point (recursive call point),
  *  and switch current node to node pointed by fail_action
  *
b9a533ec
  * Node types:
  * 	OP_ROOT: contains information that applies to the entire trie.
  * 		it can only appear as root node, and not as child node.
  * 		On child fail: match has failed
dbf3c4d9
  * 		This is NOT a recursive call point
b9a533ec
  * 	OP_CHAR_BINARY_SEARCH: chooses a sub-trie, based on current character; 
  * 			using binary-search
  * 			On fail: go to node indicated by fail_action, or if 
  * 				fail_action is NULL, to parent
  * 			On child fail: execute fail of current node
  * 	OP_ALTERNATIVES: try matching each sub-trie, if all fails execute fail
  * 		action of current node. This is a recursive call point
  * 	OP_CHAR_REPEAT: repeat specified character a number of times in range:
  *		[min_range, max_range]; 
  *			min_range: 0 for * operator
  *				   1 for + operator
  *			max_range: remaining length of current string for *,+ operator
  *			OR: min_range, max_range as specified by the {min,max} operator
  *		On fail: fail_action, or parent if NULL
  *		On child fail: reduce match repeat count, try again on child, if
  *			repeat count<min_range, execute fail of current node
dbf3c4d9
  *		Also has a bitmap on what characters are accepted beyond it,
  *		as an optimizations for the case, when a maximum match isn't possible
b9a533ec
  *		Not recomended to use this when min_range=max_range=1
  *		This is a recursive call point
  *	OP_DOT_REPEAT: like OP_CHAR_REPEAT but accept any character
  *		Not recomended to use this when min_range=max_range=1
  *		This is a recursive call point
  *	OP_GROUP_START: start of a group "(", also specifies group flags:
  *		repeat: is_repeat, min_range, max_range
  *		This is a recursive call point if is_repeat
  *	OP_GROUP_END: end of group ")"
  *      OP_STRCMP: compare with specified string,
  *      	   it has an array of fail actions, one for each character
  *      	   default fail action: go to parent
  *      	   This was introduced from memory- and speed-efficiency
  *      	   considerations. 
  *      OP_CHAR_CLASS_REPEAT: match character with character class
  *      	min_range, max_range
  *      	For a single character class min_range=max_range=1
  *	OP_MATCH_OK: match has succeeded
  *
  * TODO: maybe we'll need a more efficient way to choose between character classes.
  *       OP_DOT_REPEAT/OP_CHAR_REPEAT needs a more efficient specification of its failure function, instead of using
  *       backtracking approach.
  *
  * The failure function/action is just for optimization, the match algorithms works even without it.
  * TODO:In this first draft fail action will always be NULL, in a later version I'll implement fail actions too.
  *
  *
  */ 
 
dbf3c4d9
 #include <string.h>
b9a533ec
 #include "cltypes.h"
 #include "others.h"
 
 /* offsetof is not ANSI C */
 #ifndef offsetof
 #   define offsetof(type,memb) ((size_t)&((type*)0)->memb)
 #endif
 
 #define container_of(ptr, type, member) ( (type *) ((char *)ptr - offsetof(type, member)) )
 #define container_of_const(ptr, type, member) ( (type *) ((const char *)ptr - offsetof(type, member)) )
 
 enum trie_node_type {
 	OP_ROOT,
 	OP_CHAR_BINARY_SEARCH,
 	OP_ALTERNATIVES,
 	OP_CHAR_REPEAT,
 	OP_DOT_REPEAT,
 	OP_CHAR_CLASS_REPEAT,
 	OP_STRCMP,
 	OP_GROUP_START,
 	OP_GROUP_END,
 	OP_MATCH_OK
 };
 
 
 /* the comon definition of a trie node */
 struct trie_node
 {
 	enum trie_node_type type;
 };
 
 struct trie_node_match {
 	struct trie_node node;
 	/* additional match info */
 };
 
 struct trie_node_root
 {
 	struct trie_node node;
 	struct trie_node* child;
 };
 
 struct trie_node_binary_search
 {
 	struct trie_node node;
 	uint8_t children_count;/* number of children to search among -1! 255 = 256 children*/	
 	struct trie_node* fail_action;
dbf3c4d9
 	unsigned char* char_choices;/* children_count elements */
 	struct trie_node** children;/*children_count elements */
b9a533ec
 };
 
 struct trie_node_alternatives
 {
 	struct trie_node node;
 	uint32_t alternatives_count;
 	/* need to support node with lots of alternatives, 
 	 * for a worst-case scenario where each line ends up as a sub-trie of OP_ALTERNATIVES*/
 	struct trie_node* fail_action;
 	struct trie_node** children;
 };
 
 struct trie_node_char_repeat
 {
 	struct trie_node node;
 	unsigned char character;
 	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
dbf3c4d9
 	struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
 						   to optimize repeat < max_range case; if its NULL
 						   there is no optimization*/
b9a533ec
 	struct trie_node* child;
 	struct trie_node* fail_action;
 };
 
 struct trie_node_dot_repeat
 {
 	struct trie_node node;
 	uint8_t range_min, range_max;/* according to POSIX we need not support more than 255 repetitions*/
dbf3c4d9
 	struct char_bitmap* bitmap_accept_after;/* bitmap of characters accepted after this, 
 						   to optimize repeat < max_range case; if its NULL
 						   there is no optimization*/
b9a533ec
 	struct trie_node* child;
 	struct trie_node* fail_action;
 };
 
 struct trie_node_group_start
 {
 	struct trie_node node;
 	uint8_t range_min, range_max;/* if range_min==range_max==1, then this is NOT a repeat, thus not a recursive call point*/
 	struct trie_node* child;
 	struct trie_node* fail_action;	
 };
 
 struct trie_node_group_end
 {
 	struct trie_node node;
 	struct trie_node* child;
 };
 
 struct trie_node_strcmp
 {
 	struct trie_node node;
 	uint8_t string_length;/* for longer strings a sequence of node_strcmp should be used */
 	unsigned char* string;
 	struct trie_node* child;
dbf3c4d9
 	struct trie_node** fail_actions;/* this has string_length elements, or NULL if no fail_actions are computed */
b9a533ec
 };
 
 struct trie_node_char_class_repeat
 {
 	struct trie_node node;
 	struct char_bitmap* bitmap;
dbf3c4d9
 	struct char_bitmap* bitmap_accept_after;
b9a533ec
 	uint8_t range_min, range_max;
 	struct trie_node* child;
 	struct trie_node* fail_action;
 };
 
dbf3c4d9
 static inline int bitmap_accepts(const struct char_bitmap* bitmap, const char c)
 {
 	/* TODO: check if c is accepted by bitmap */
 	return 0;
 }
 
 #define MATCH_FAILED 0
 #define MATCH_OK     1
b9a533ec
 
dbf3c4d9
 #define FAIL_ACTION( fail_node ) (*fail_action = (fail_node), MATCH_FAILED)
 
 
 #ifndef MIN
 #define MIN(a,b) ((a)<(b) ? (a) : (b))
 #endif
 
 static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action);
 
 static int match_repeat(const unsigned char* text, const unsigned char* text_end, const size_t range_min, const size_t repeat_start, 
 		const struct char_bitmap* bitmap_accept_after, const struct trie_node* child, const struct trie_node** fail_action,
 		const struct trie_node* this_fail_action)
b9a533ec
 {
dbf3c4d9
 	size_t i;
 	for(i = repeat_start;i > range_min;i--) {
 		if(!bitmap_accept_after || bitmap_accepts( bitmap_accept_after, text[i-1])) {
 			int rc = match_node(child, &text[i], text_end, fail_action);
 			/* ignore fail_action for now, we have the bitmap_accepts_after optimization */
 			if(rc) {
 				return MATCH_OK;
 			}
 		}						
 	}
 	if(!range_min) {
 		/* this match is optional, try child only */
 		int rc = match_node(child, text, text_end, fail_action);
 		if(rc) {
 			return MATCH_OK;
 		}
 	}
 	return FAIL_ACTION(this_fail_action);
 }
 
 /* text_end points to \0 in text */
 static int match_node(const struct trie_node* node, const unsigned char* text, const unsigned char* text_end, const struct trie_node** fail_action)
 {
 	while(node && text < text_end) {	
b9a533ec
 		switch(node->type) {
 			case OP_ROOT:
 				{	
 					const struct trie_node_root* root_node = container_of_const(node, const struct trie_node_root, node);
 					node = root_node->child;
 					break;
 				}
 			case OP_CHAR_BINARY_SEARCH:
dbf3c4d9
 				{					
b9a533ec
 					const struct trie_node_binary_search* bin_node = container_of_const(node, const struct trie_node_binary_search, node);
dbf3c4d9
 					const unsigned char csearch = *text;
 					size_t mid, left = 0, right = bin_node->children_count-1;					
 					while(left<=right) {
 						mid = left+(right-left)/2;
 						if(bin_node->char_choices[mid] == csearch)
 							break;
 						else if(bin_node->char_choices[mid] < csearch)
 							left = mid+1;
 						else
 							right = mid-1;
 					}
 					if(left <= right) {
 						/* match successful */
 						node = bin_node->children[mid];
 						++text;
 					}
 					else {
 						return FAIL_ACTION( bin_node->fail_action );
 					}
b9a533ec
 					break;
 				}
 			case OP_ALTERNATIVES:
 				{
 					const struct trie_node_alternatives* alt_node = container_of_const(node, const struct trie_node_alternatives, node);
dbf3c4d9
 					size_t i;
 					*fail_action = NULL;
 					for(i=0;i < alt_node->alternatives_count;i++) {
 						int rc = match_node(alt_node->children[i], text, text_end, fail_action);
 						if(rc) {							
 							return MATCH_OK;
 						}
 						/* supporting fail_actions is tricky,
 						 *  if we just go to the node specified, what happens if the match fails, and no
 						 *  further fail_action is specified? We should know where to continue the search.
 						 * For now fail_action isn't supported for OP_ALTERNATIVES*/						
 					}
b9a533ec
 					break;
 				}
 			case OP_CHAR_REPEAT:
 				{
 					const struct trie_node_char_repeat* char_rep_node = container_of_const(node, const struct trie_node_char_repeat, node);
dbf3c4d9
 					const size_t max_len = MIN( text_end - text, char_rep_node->range_max-1);
 					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
 					const char caccept = char_rep_node->character;
 					size_t rep;
 
 					if(max_len < char_rep_node->range_min)
 						return FAIL_ACTION(char_rep_node->fail_action);
 
 					for(rep=0;rep < max_len;rep++) {
 						if(text[rep] != caccept) {
 							break;
 						}
 					}
 
 					return match_repeat(text, text_end, char_rep_node->range_min, rep,
 							char_rep_node->bitmap_accept_after, char_rep_node->child, fail_action,
 							char_rep_node->fail_action);
b9a533ec
 				}
 			case OP_DOT_REPEAT:
 				{
 					const struct trie_node_dot_repeat* dot_rep_node = container_of_const(node, const struct trie_node_dot_repeat, node);
dbf3c4d9
 					const size_t max_len = MIN( text_end - text, dot_rep_node->range_max-1);
 					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
 
 					if(max_len < dot_rep_node->range_min)
 						return FAIL_ACTION(dot_rep_node->fail_action);
 
 					return match_repeat(text, text_end, dot_rep_node->range_min, max_len,
 							dot_rep_node->bitmap_accept_after, dot_rep_node->child, fail_action,
 							dot_rep_node->fail_action);
b9a533ec
 				}
 			case OP_CHAR_CLASS_REPEAT:
 				{
 					const struct trie_node_char_class_repeat* class_rep_node = container_of_const(node, const struct trie_node_char_class_repeat, node);
dbf3c4d9
 					const size_t max_len = MIN( text_end - text, class_rep_node->range_max-1);
 					/* todo: what about the 8 bit limitation of range_max, and what about inf (+,*)? */
 					size_t rep;
 
 					if(max_len < class_rep_node->range_min)
 						return FAIL_ACTION(class_rep_node->fail_action);
 
 					for(rep=0;rep < max_len;rep++) {
 						if(!bitmap_accepts( class_rep_node->bitmap, text[rep])) {
 							break;
 						}
 					}
 
 					return match_repeat(text, text_end, class_rep_node->range_min, rep,
 							class_rep_node->bitmap_accept_after, class_rep_node->child, fail_action,
 							class_rep_node->fail_action);
b9a533ec
 					break;
 				}
 			case OP_STRCMP:
 				{
 					const struct trie_node_strcmp* strcmp_node = container_of_const(node, const struct trie_node_strcmp, node);
dbf3c4d9
 					size_t i;
 					if(strcmp_node->fail_actions) {
 						const size_t max_len = MIN(strcmp_node->string_length, text_end-text);
 						/* we don't use strncmp, because we need the exact match-fail point */
 						for(i=0;i < max_len;i++) {
 							if(text[i] != strcmp_node->string[i]) {
 								return FAIL_ACTION( strcmp_node->fail_actions[i] );
 							}
 						}
 						if(max_len < strcmp_node->string_length) {
 							/* failed, because text was shorter */
 							return FAIL_ACTION( strcmp_node->fail_actions[max_len] );
 						}
 					}
 					else {
 						/* no fail_actions computed, some shortcuts possible on compare */
 						if((text_end - text < strcmp_node->string_length) ||
 								strncmp((const char*)text, (const char*)strcmp_node->string, strcmp_node->string_length)) {
 
 							return FAIL_ACTION( NULL );
 						}
 					}
 					/* match successful */
 					node = strcmp_node->child;
 					text += strcmp_node->string_length;
b9a533ec
 					break;
 				}
 			case OP_GROUP_START:
 				{
 					const struct trie_node_group_start* group_start_node = container_of_const(node, const struct trie_node_group_start, node);
dbf3c4d9
 					/* TODO: implement */
b9a533ec
 					break;
 				}
 			case OP_GROUP_END:
dbf3c4d9
 				{					
b9a533ec
 					const struct trie_node_group_end* group_end_node = container_of_const(node, const struct trie_node_group_end, node);
dbf3c4d9
 					/* TODO: implement */
b9a533ec
 					break;
 				}
dbf3c4d9
 			case OP_MATCH_OK:
b9a533ec
 				{
dbf3c4d9
 					return MATCH_OK;
b9a533ec
 				}
 		}
 	}
dbf3c4d9
 	/* if fail_action was NULL, or text ended*/
 	return MATCH_FAILED;
b9a533ec
 }
 
 #endif