Browse code

new implementation of the Aho-Corasick pattern matcher

git-svn: trunk@3038

Tomasz Kojm authored on 2007/04/29 03:40:59
Showing 7 changed files
... ...
@@ -1,3 +1,12 @@
1
+Sat Apr 28 19:51:22 CEST 2007 (tk)
2
+----------------------------------
3
+  * libclamav: new implementation of the Aho-Corasick pattern matcher:
4
+	       - remove static depth limitation
5
+	       - optimize memory usage
6
+	       - min/max depth can be set on per-tree basis
7
+	       - use higher max-depth by default (3)
8
+	       - much better detection of wildcarded sigs
9
+
1 10
 Tue Apr 24 13:48:04 BST 2007 (njh)
2 11
 ----------------------------------
3 12
   * libclamav/mbox.c:	Bug 366
... ...
@@ -381,13 +381,12 @@ int cli_addtypesigs(struct cl_engine *engine)
381 381
 	    return CL_EMEM;
382 382
 	}
383 383
 
384
-	root->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
385
-	if(!root->ac_root) {
386
-	    cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
384
+	if((ret = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
387 385
 	    /* No need to free previously allocated memory here - all engine
388 386
 	     * elements will be properly freed by cl_free()
389 387
 	     */
390
-	    return CL_EMEM;
388
+	    cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
389
+	    return ret;
391 390
 	}
392 391
     } else {
393 392
 	root = engine->root[0];
... ...
@@ -1,9 +1,4 @@
1 1
 /*
2
- *  C implementation of the Aho-Corasick pattern matching algorithm. It's based
3
- *  on the ScannerDaemon's version (coded in Java) by Kurt Huwig and
4
- *  http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html
5
- *  Thanks to Kurt Huwig for pointing me to this page.
6
- *
7 2
  *  Copyright (C) 2002 - 2007 Tomasz Kojm <tkojm@clamav.net>
8 3
  *
9 4
  *  This program is free software; you can redistribute it and/or modify
... ...
@@ -38,89 +33,130 @@
38 38
 #include "matcher-ac.h"
39 39
 #include "filetypes.h"
40 40
 #include "cltypes.h"
41
+#include "str.h"
41 42
 
42
-struct nodelist {
43
-    struct cli_ac_node *node;
44
-    struct nodelist *next;
45
-};
46
-
47
-static uint8_t ac_depth = AC_DEFAULT_DEPTH;
48 43
 
49 44
 int cli_ac_addpatt(struct cli_matcher *root, struct cli_ac_patt *pattern)
50 45
 {
51
-	struct cli_ac_node *pos, *next;
46
+	struct cli_ac_node *pt, *next;
52 47
 	uint8_t i;
48
+	uint16_t len = MIN(root->ac_maxdepth, pattern->length);
53 49
 
54
-    if(pattern->length < ac_depth)
50
+
51
+    for(i = 0; i < len; i++) {
52
+	if(pattern->pattern[i] & CLI_MATCH_WILDCARD) {
53
+	    len = i;
54
+	    break;
55
+	}
56
+    }
57
+
58
+    if(len < root->ac_mindepth)
55 59
 	return CL_EPATSHORT;
56 60
 
57
-    pos = root->ac_root;
61
+    pt = root->ac_root;
62
+
63
+    for(i = 0; i < len; i++) {
64
+	if(!pt->trans) {
65
+	    pt->trans = (struct cli_ac_node **) cli_calloc(256, sizeof(struct cli_ac_node *));
66
+	    if(!pt->trans) {
67
+		cli_errmsg("cli_ac_addpatt: Can't allocate memory for pt->trans\n");
68
+		return CL_EMEM;
69
+	    }
70
+	}
58 71
 
59
-    for(i = 0; i < ac_depth; i++) {
60
-	next = pos->trans[(unsigned char) (pattern->pattern[i] & 0xff)]; 
72
+	next = pt->trans[(unsigned char) (pattern->pattern[i] & 0xff)]; 
61 73
 
62 74
 	if(!next) {
63 75
 	    next = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
64 76
 	    if(!next) {
65
-		cli_errmsg("cli_ac_addpatt(): Unable to allocate AC node (%u bytes)\n", sizeof(struct cli_ac_node));
77
+		cli_errmsg("cli_ac_addpatt: Can't allocate memory for AC node\n");
66 78
 		return CL_EMEM;
67 79
 	    }
68 80
 
81
+	    if(i != len - 1) {
82
+		next->trans = (struct cli_ac_node **) cli_calloc(256, sizeof(struct cli_ac_node *));
83
+		if(!next->trans) {
84
+		    cli_errmsg("cli_ac_addpatt: Can't allocate memory for next->trans\n");
85
+		    free(next);
86
+		    return CL_EMEM;
87
+		}
88
+	    } else {
89
+		next->leaf = 1;
90
+	    }
91
+
69 92
 	    root->ac_nodes++;
70
-	    root->ac_nodetable = (struct cli_ac_node **) cli_realloc(root->ac_nodetable, (root->ac_nodes) * sizeof(struct cli_ac_node *));
71
-	    if(root->ac_nodetable == NULL) {
72
-		cli_errmsg("cli_ac_addpatt(): Unable to realloc nodetable (%u bytes)\n", (root->ac_nodes) * sizeof(struct cli_matcher *));
93
+	    root->ac_nodetable = (struct cli_ac_node **) cli_realloc(root->ac_nodetable, root->ac_nodes * sizeof(struct cli_ac_node *));
94
+	    if(!root->ac_nodetable) {
95
+		cli_errmsg("cli_ac_addpatt: Can't realloc ac_nodetable\n");
96
+		if(next->trans)
97
+		    free(next->trans);
98
+		free(next);
73 99
 		return CL_EMEM;
74 100
 	    }
75 101
 	    root->ac_nodetable[root->ac_nodes - 1] = next;
76 102
 
77
-	    pos->trans[((unsigned char) pattern->pattern[i]) & 0xff] = next;
103
+	    pt->trans[(unsigned char) (pattern->pattern[i] & 0xff)] = next;
104
+	    pt->leaf = 0;
78 105
 	}
79 106
 
80
-	pos = next;
107
+	pt = next;
81 108
     }
82 109
 
83
-    pos->islast = 1;
110
+    root->ac_patterns++;
111
+    root->ac_pattable = (struct cli_ac_patt **) cli_realloc(root->ac_pattable, root->ac_patterns * sizeof(struct cli_ac_patt *));
112
+    if(!root->ac_pattable) {
113
+	cli_errmsg("cli_ac_addpatt: Can't realloc ac_pattable\n");
114
+	return CL_EMEM;
115
+    }
116
+    root->ac_pattable[root->ac_patterns - 1] = pattern;
84 117
 
85
-    pattern->next = pos->list;
86
-    pos->list = pattern;
118
+    pt->final = 1;
119
+    pattern->depth = i;
120
+    pattern->next = pt->list;
121
+    pt->list = pattern;
87 122
 
88 123
     return CL_SUCCESS;
89 124
 }
90 125
 
91
-static int cli_enqueue(struct nodelist **bfs, struct cli_ac_node *n)
126
+struct bfs_list {
127
+    struct cli_ac_node *node;
128
+    struct bfs_list *next;
129
+};
130
+
131
+static int bfs_enqueue(struct bfs_list **bfs, struct cli_ac_node *n)
92 132
 {
93
-	struct nodelist *new;
133
+	struct bfs_list *new;
134
+
94 135
 
95
-    new = (struct nodelist *) cli_calloc(1, sizeof(struct nodelist));
96
-    if (new == NULL) {
97
-	cli_errmsg("cli_enqueue(): Unable to allocate node list (%u bytes)\n", sizeof(struct nodelist));
136
+    new = (struct bfs_list *) cli_malloc(sizeof(struct bfs_list));
137
+    if(!new) {
138
+	cli_errmsg("bfs_enqueue: Can't allocate memory for bfs_list\n");
98 139
 	return CL_EMEM;
99 140
     }
100
-
101 141
     new->next = *bfs;
102 142
     new->node = n;
103 143
     *bfs = new;
144
+
104 145
     return CL_SUCCESS;
105 146
 }
106 147
 
107
-static struct cli_ac_node *cli_dequeue(struct nodelist **bfs)
148
+static struct cli_ac_node *bfs_dequeue(struct bfs_list **bfs)
108 149
 {
109
-	struct nodelist *handler, *prev = NULL;
150
+	struct bfs_list *lpt, *prev = NULL;
110 151
 	struct cli_ac_node *pt;
111 152
 
112
-    handler = *bfs;
113 153
 
114
-    while(handler && handler->next) {
115
-	prev = handler;
116
-	handler = handler->next;
154
+    lpt = *bfs;
155
+    while(lpt && lpt->next) {
156
+	prev = lpt;
157
+	lpt = lpt->next;
117 158
     }
118 159
 
119
-    if(!handler) {
160
+    if(!lpt) {
120 161
 	return NULL;
121 162
     } else {
122
-	pt = handler->node;
123
-	free(handler);
163
+	pt = lpt->node;
164
+	free(lpt);
124 165
 	if(prev)
125 166
 	    prev->next = NULL;
126 167
 	else
... ...
@@ -130,183 +166,192 @@ static struct cli_ac_node *cli_dequeue(struct nodelist **bfs)
130 130
     }
131 131
 }
132 132
 
133
-static int cli_maketrans(struct cli_matcher *root)
133
+static int ac_maketrans(struct cli_matcher *root)
134 134
 {
135
-	struct nodelist *bfs = NULL;
136
-	struct cli_ac_node *ac_root = root->ac_root, *child, *node;
135
+	struct bfs_list *bfs = NULL;
136
+	struct cli_ac_node *ac_root = root->ac_root, *child, *node, *fail;
137
+	struct cli_ac_patt *patt;
137 138
 	int i, ret;
138 139
 
139 140
 
140
-    ac_root->fail = NULL;
141
-    if((ret = cli_enqueue(&bfs, ac_root)) != 0) {
142
-	return ret;
141
+    for(i = 0; i < 256; i++) {
142
+	node = ac_root->trans[i];
143
+	if(!node) {
144
+	    ac_root->trans[i] = ac_root;
145
+	} else {
146
+	    node->fail = ac_root;
147
+	    if((ret = bfs_enqueue(&bfs, node)))
148
+		return ret;
149
+	}
143 150
     }
144 151
 
145
-    while((node = cli_dequeue(&bfs))) {
146
-	if(node->islast)
152
+    while((node = bfs_dequeue(&bfs))) {
153
+	if(node->leaf)
147 154
 	    continue;
148 155
 
149 156
 	for(i = 0; i < 256; i++) {
150 157
 	    child = node->trans[i];
151
-	    if(!child) {
152
-		if(node->fail)
153
-		    node->trans[i] = (node->fail)->trans[i];
154
-		else
155
-		    node->trans[i] = ac_root;
156
-	    } else {
157
-		if(node->fail)
158
-		    child->fail = (node->fail)->trans[i];
159
-		else
160
-		    child->fail = ac_root;
158
+	    if(child) {
159
+		fail = node->fail;
160
+		while(fail->leaf || !fail->trans[i])
161
+		    fail = fail->fail;
161 162
 
162
-		if((ret = cli_enqueue(&bfs, child)) != 0) {
163
-		    return ret;
163
+		child->fail = fail->trans[i];
164
+
165
+		if(child->list) {
166
+		    patt = child->list;
167
+		    while(patt->next)
168
+			patt = patt->next;
169
+
170
+		    patt->next = child->fail->list;
171
+		} else {
172
+		    child->list = child->fail->list;
164 173
 		}
174
+
175
+		if(child->list)
176
+		    child->final = 1;
177
+
178
+		if((ret = bfs_enqueue(&bfs, child)) != 0)
179
+		    return ret;
165 180
 	    }
166 181
 	}
167 182
     }
183
+
168 184
     return CL_SUCCESS;
169 185
 }
170 186
 
171 187
 int cli_ac_buildtrie(struct cli_matcher *root)
172 188
 {
173
-
174 189
     if(!root)
175 190
 	return CL_EMALFDB;
176 191
 
177 192
     if(!root->ac_root) {
178
-	cli_dbgmsg("cli_ac_buildtrie(): AC pattern matcher is not initialised\n");
193
+	cli_dbgmsg("cli_ac_buildtrie: AC pattern matcher is not initialised\n");
179 194
 	return CL_SUCCESS;
180 195
     }
181 196
 
182
-    return cli_maketrans(root);
197
+    return ac_maketrans(root);
183 198
 }
184 199
 
185
-static void cli_freepatt(struct cli_ac_patt *list)
200
+int cli_ac_init(struct cli_matcher *root, uint8_t mindepth, uint8_t maxdepth)
186 201
 {
187
-	struct cli_ac_patt *handler, *prev;
188
-	int i;
189
-
190 202
 
191
-    handler = list;
203
+    root->ac_root = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
204
+    if(!root->ac_root) {
205
+	cli_errmsg("cli_ac_init: Can't allocate memory for ac_root\n");
206
+	return CL_EMEM;
207
+    }
192 208
 
193
-    while(handler) {
194
-	if(handler->prefix)
195
-	    free(handler->prefix);
196
-	else
197
-	    free(handler->pattern);
198
-	free(handler->virname);
199
-	if(handler->offset)
200
-	    free(handler->offset);
201
-	if(handler->alt) {
202
-	    free(handler->altn);
203
-	    for(i = 0; i < handler->alt; i++)
204
-		free(handler->altc[i]);
205
-	    free(handler->altc);
206
-	}
207
-	prev = handler;
208
-	handler = handler->next;
209
-	free(prev);
209
+    root->ac_root->trans = (struct cli_ac_node **) cli_calloc(256, sizeof(struct cli_ac_node *));
210
+    if(!root->ac_root->trans) {
211
+	cli_errmsg("cli_ac_init: Can't allocate memory for ac_root->trans\n");
212
+	free(root->ac_root);
213
+	return CL_EMEM;
210 214
     }
215
+
216
+    root->ac_mindepth = mindepth;
217
+    root->ac_maxdepth = maxdepth;
218
+
219
+    return CL_SUCCESS;
211 220
 }
212 221
 
213 222
 void cli_ac_free(struct cli_matcher *root)
214 223
 {
215
-	unsigned int i;
224
+	uint32_t i, j;
225
+	struct cli_ac_patt *patt;
216 226
 
217 227
 
228
+    for(i = 0; i < root->ac_patterns; i++) {
229
+	patt = root->ac_pattable[i];
230
+
231
+	if(patt->prefix)
232
+	    free(patt->prefix);
233
+	else
234
+	    free(patt->pattern);
235
+	free(patt->virname);
236
+	if(patt->offset)
237
+	    free(patt->offset);
238
+	if(patt->alt) {
239
+	    free(patt->altn);
240
+	    for(j = 0; j < patt->alt; j++)
241
+		free(patt->altc[j]);
242
+	    free(patt->altc);
243
+	}
244
+	free(patt);
245
+    }
246
+    if(root->ac_pattable)
247
+	free(root->ac_pattable);
248
+
218 249
     for(i = 0; i < root->ac_nodes; i++) {
219
-	cli_freepatt(root->ac_nodetable[i]->list);
250
+	if(!root->ac_nodetable[i]->leaf)
251
+	    free(root->ac_nodetable[i]->trans);
220 252
 	free(root->ac_nodetable[i]);
221 253
     }
222 254
 
223 255
     if(root->ac_nodetable)
224 256
 	free(root->ac_nodetable);
225 257
 
226
-    if(root->ac_root)
258
+    if(root->ac_root) {
259
+	free(root->ac_root->trans);
227 260
 	free(root->ac_root);
261
+    }
228 262
 }
229 263
 
230
-inline static int cli_findpos(const unsigned char *buffer, unsigned int depth, unsigned int offset, unsigned int length, const struct cli_ac_patt *pattern)
264
+#define AC_MATCH_CHAR(p,b)						\
265
+    switch(wc = p & CLI_MATCH_WILDCARD) {				\
266
+	case CLI_MATCH_ALTERNATIVE:					\
267
+	    found = 0;							\
268
+	    for(j = 0; j < pattern->altn[alt]; j++) {			\
269
+		if(pattern->altc[alt][j] == b) {			\
270
+		    found = 1;						\
271
+		    break;						\
272
+		}							\
273
+	    }								\
274
+	    if(!found)							\
275
+		return 0;						\
276
+	    alt++;							\
277
+	    break;							\
278
+									\
279
+	case CLI_MATCH_NIBBLE_HIGH:					\
280
+	    if((unsigned char) (p & 0x00f0) != (b & 0xf0))		\
281
+		return 0;						\
282
+	    break;							\
283
+									\
284
+	case CLI_MATCH_NIBBLE_LOW:					\
285
+	    if((unsigned char) (p & 0x000f) != (b & 0x0f))		\
286
+		return 0;						\
287
+	    break;							\
288
+									\
289
+	default:							\
290
+	    if(wc != CLI_MATCH_IGNORE && (unsigned char) p != b)	\
291
+		return 0;						\
292
+    }
293
+
294
+inline static int ac_findmatch(const unsigned char *buffer, uint32_t offset, uint32_t length, const struct cli_ac_patt *pattern)
231 295
 {
232
-	unsigned int bufferpos = offset + depth;
233
-	unsigned int postfixend = offset + length;
234
-	unsigned int i, j, alt = pattern->alt_pattern, found;
296
+	uint32_t bp;
297
+	uint16_t wc, i, j, alt = pattern->alt_pattern;
298
+	uint8_t found;
235 299
 
236 300
 
237 301
     if(pattern->prefix)
238 302
 	if(pattern->prefix_length > offset)
239 303
 	    return 0;
240 304
 
241
-    if(bufferpos >= length)
242
-	bufferpos %= length;
243
-
244
-    for(i = depth; i < pattern->length; i++) {
245
-
246
-	if(bufferpos == postfixend)
247
-	    return 0;
248
-
249
-	if((pattern->pattern[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_ALTERNATIVE) {
250
-	    found = 0;
251
-	    for(j = 0; j < pattern->altn[alt]; j++) {
252
-		if(pattern->altc[alt][j] == buffer[bufferpos]) {
253
-		    found = 1;
254
-		    break;
255
-		}
256
-	    }
257
-
258
-	    if(!found)
259
-		return 0;
260
-	    alt++;
261
-
262
-	} else if((pattern->pattern[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_NIBBLE_HIGH) {
263
-	    if((unsigned char) (pattern->pattern[i] & 0x00f0) != (buffer[bufferpos] & 0xf0))
264
-		return 0;
265
-
266
-	} else if((pattern->pattern[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_NIBBLE_LOW) {
267
-	    if((unsigned char) (pattern->pattern[i] & 0x000f) != (buffer[bufferpos] & 0x0f))
268
-		return 0;
269
-
270
-	} else if((pattern->pattern[i] & CLI_MATCH_WILDCARD) != CLI_MATCH_IGNORE && (unsigned char) pattern->pattern[i] != buffer[bufferpos])
271
-	    return 0;
272
-
273
-	bufferpos++;
305
+    bp = offset + pattern->depth;
274 306
 
275
-	if(bufferpos == length)
276
-	    bufferpos = 0;
307
+    for(i = pattern->depth; i < pattern->length; i++) {
308
+	AC_MATCH_CHAR(pattern->pattern[i],buffer[bp]);
309
+	bp++;
277 310
     }
278 311
 
279 312
     if(pattern->prefix) {
280 313
 	alt = 0;
281
-	bufferpos = offset - pattern->prefix_length;
314
+	bp = offset - pattern->prefix_length;
282 315
 
283 316
 	for(i = 0; i < pattern->prefix_length; i++) {
284
-
285
-	    if((pattern->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_ALTERNATIVE) {
286
-		found = 0;
287
-		for(j = 0; j < pattern->altn[alt]; j++) {
288
-		    if(pattern->altc[alt][j] == buffer[bufferpos]) {
289
-			found = 1;
290
-			break;
291
-		    }
292
-		}
293
-
294
-		if(!found)
295
-		    return 0;
296
-		alt++;
297
-
298
-	    } else if((pattern->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_NIBBLE_HIGH) {
299
-		if((unsigned char) (pattern->prefix[i] & 0x00f0) != (buffer[bufferpos] & 0xf0))
300
-		    return 0;
301
-
302
-	    } else if((pattern->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_NIBBLE_LOW) {
303
-		if((unsigned char) (pattern->prefix[i] & 0x000f) != (buffer[bufferpos] & 0x0f))
304
-		    return 0;
305
-
306
-	    } else if(!(pattern->prefix[i] & CLI_MATCH_IGNORE) && (unsigned char) pattern->prefix[i] != buffer[bufferpos])
307
-		return 0;
308
-
309
-	    bufferpos++;
317
+	    AC_MATCH_CHAR(pattern->prefix[i],buffer[bp]);
318
+	    bp++;
310 319
 	}
311 320
     }
312 321
 
... ...
@@ -315,11 +360,9 @@ inline static int cli_findpos(const unsigned char *buffer, unsigned int depth, u
315 315
 
316 316
 int cli_ac_initdata(struct cli_ac_data *data, uint32_t partsigs, uint8_t tracklen)
317 317
 {
318
-	unsigned int i, j;
319
-
320 318
 
321 319
     if(!data) {
322
-	cli_errmsg("cli_ac_init(): data == NULL\n");
320
+	cli_errmsg("cli_ac_init: data == NULL\n");
323 321
 	return CL_ENULLARG;
324 322
     }
325 323
 
... ...
@@ -328,126 +371,77 @@ int cli_ac_initdata(struct cli_ac_data *data, uint32_t partsigs, uint8_t trackle
328 328
     if(!partsigs)
329 329
 	return CL_SUCCESS;
330 330
 
331
-    data->inioff = (off_t *) cli_malloc(partsigs * sizeof(off_t));
332
-    if(!data->inioff) {
333
-	cli_errmsg("cli_ac_init(): unable to cli_malloc(%u)\n", partsigs * sizeof(off_t));
334
-	return CL_EMEM;
335
-    }
336
-    memset(data->inioff, -1, partsigs * sizeof(off_t));
337
-
338
-    data->partcnt = (uint16_t *) cli_calloc(partsigs, sizeof(uint16_t));
339
-
340
-    if(!data->partcnt) {
341
-	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(unsigned int));
342
-	free(data->inioff);
331
+    data->offmatrix = (int32_t ***) cli_calloc(partsigs, sizeof(int32_t **));
332
+    if(!data->offmatrix) {
333
+	cli_errmsg("cli_ac_init: Can't allocate memory for data->offmatrix\n");
343 334
 	return CL_EMEM;
344 335
     }
345 336
 
346
-    data->offcnt = (uint8_t *) cli_calloc(partsigs, sizeof(uint8_t));
337
+    return CL_SUCCESS;
338
+}
347 339
 
348
-    if(!data->offcnt) {
349
-	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(uint8_t));
350
-	free(data->inioff);
351
-	free(data->partcnt);
352
-	return CL_EMEM;
353
-    }
340
+void cli_ac_freedata(struct cli_ac_data *data)
341
+{
342
+	uint32_t i;
354 343
 
355
-    data->offidx = (uint8_t *) cli_calloc(partsigs, sizeof(uint8_t));
356 344
 
357
-    if(!data->offidx) {
358
-	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(uint8_t));
359
-	free(data->inioff);
360
-	free(data->partcnt);
361
-	free(data->offcnt);
362
-	return CL_EMEM;
345
+    if(data && data->partsigs) {
346
+	for(i = 0; i < data->partsigs; i++) {
347
+	    if(data->offmatrix[i]) {
348
+		free(data->offmatrix[i][0]);
349
+		free(data->offmatrix[i]);
350
+	    }
351
+	}
352
+	free(data->offmatrix);
363 353
     }
354
+}
364 355
 
365
-    data->maxshift = (int32_t *) cli_malloc(partsigs * sizeof(int32_t));
366
-
367
-    if(!data->maxshift) {
368
-	cli_errmsg("cli_ac_init(): unable to cli_malloc(%u)\n", partsigs * sizeof(int));
369
-	free(data->inioff);
370
-	free(data->partcnt);
371
-	free(data->offcnt);
372
-	free(data->offidx);
373
-	return CL_EMEM;
374
-    }
356
+inline static int ac_addtype(struct cli_matched_type **list, cli_file_t type, off_t offset)
357
+{
358
+	struct cli_matched_type *tnode, *tnode_last;
375 359
 
376
-    memset(data->maxshift, -1, partsigs * sizeof(int32_t));
377 360
 
378
-    data->partoff = (uint32_t **) cli_calloc(partsigs, sizeof(uint32_t *));
361
+    if(*list && (*list)->cnt >= MAX_EMBEDDED_OBJ)
362
+	return CL_SUCCESS;
379 363
 
380
-    if(!data->partoff) {
381
-	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(unsigned int));
382
-	free(data->inioff);
383
-	free(data->partcnt);
384
-	free(data->offcnt);
385
-	free(data->offidx);
386
-	free(data->maxshift);
364
+    if(!(tnode = cli_calloc(1, sizeof(struct cli_matched_type)))) {
365
+	cli_errmsg("cli_ac_addtype: Can't allocate memory for new type node\n");
387 366
 	return CL_EMEM;
388 367
     }
389 368
 
390
-    /* The number of multipart signatures is rather small so we already
391
-     * allocate the memory for all parts here instead of using a runtime
392
-     * allocation in cli_ac_scanbuff()
393
-     */
369
+    tnode->type = type;
370
+    tnode->offset = offset;
394 371
 
395
-    for(i = 0; i < partsigs; i++) {
396
-	data->partoff[i] = (uint32_t *) cli_calloc(tracklen, sizeof(uint32_t));
372
+    tnode_last = *list;
373
+    while(tnode_last && tnode_last->next)
374
+	tnode_last = tnode_last->next;
397 375
 
398
-	if(!data->partoff[i]) {
399
-	    for(j = 0; j < i; j++)
400
-		free(data->partoff[j]);
401
-
402
-	    free(data->partoff);
403
-	    free(data->inioff);
404
-	    free(data->partcnt);
405
-	    free(data->offcnt);
406
-	    free(data->offidx);
407
-	    free(data->maxshift);
408
-	    cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", tracklen, sizeof(unsigned int));
409
-	    return CL_EMEM;
410
-	}
411
-    }
376
+    if(tnode_last)
377
+	tnode_last->next = tnode;
378
+    else
379
+	*list = tnode;
412 380
 
381
+    (*list)->cnt++;
413 382
     return CL_SUCCESS;
414 383
 }
415 384
 
416
-void cli_ac_freedata(struct cli_ac_data *data)
417
-{
418
-	unsigned int i;
419
-
420
-
421
-    if(data && data->partsigs) {
422
-	free(data->inioff);
423
-	free(data->partcnt);
424
-	free(data->offcnt);
425
-	free(data->offidx);
426
-	free(data->maxshift);
427
-
428
-	for(i = 0; i < data->partsigs; i++)
429
-	    free(data->partoff[i]);
430
-
431
-	free(data->partoff);
432
-    }
433
-}
434
-
435 385
 int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **virname, const struct cli_matcher *root, struct cli_ac_data *mdata, uint8_t otfrec, uint32_t offset, cli_file_t ftype, int fd, struct cli_matched_type **ftoffset)
436 386
 {
437 387
 	struct cli_ac_node *current;
438 388
 	struct cli_ac_patt *pt;
439
-	int type = CL_CLEAN, j;
440
-        uint32_t i, position, curroff;
441
-	uint8_t offnum, found;
442
-	struct cli_matched_type *tnode, *tnode_last = NULL;
389
+        uint32_t i, bp, realoff;
390
+	uint16_t j;
391
+	int32_t **offmatrix;
392
+	uint8_t found;
443 393
 	struct cli_target_info info;
394
+	int type = CL_CLEAN;
444 395
 
445 396
 
446 397
     if(!root->ac_root)
447 398
 	return CL_CLEAN;
448 399
 
449 400
     if(!mdata) {
450
-	cli_errmsg("cli_ac_scanbuff(): mdata == NULL\n");
401
+	cli_errmsg("cli_ac_scanbuff: mdata == NULL\n");
451 402
 	return CL_ENULLARG;
452 403
     }
453 404
 
... ...
@@ -455,18 +449,21 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
455 455
     current = root->ac_root;
456 456
 
457 457
     for(i = 0; i < length; i++)  {
458
-	current = current->trans[buffer[i] & 0xff];
459 458
 
460
-	if(current->islast) {
461
-	    position = i - ac_depth + 1;
459
+	while(current->leaf || !current->trans[buffer[i]])
460
+	    current = current->fail;
462 461
 
462
+	current = current->trans[buffer[i]];
463
+
464
+	if(current->final) {
463 465
 	    pt = current->list;
464 466
 	    while(pt) {
465
-		if(cli_findpos(buffer, ac_depth, position, length, pt)) {
466
-		    curroff = offset + position - pt->prefix_length;
467
+		bp = i + 1 - pt->depth;
468
+		if(ac_findmatch(buffer, bp, length, pt)) {
469
+		    realoff = offset + bp - pt->prefix_length;
467 470
 
468 471
 		    if((pt->offset || pt->target) && (!pt->sigid || pt->partno == 1)) {
469
-			if((fd == -1 && !ftype) || !cli_validatesig(ftype, pt->offset, curroff, &info, fd, pt->virname)) {
472
+			if((fd == -1 && !ftype) || !cli_validatesig(ftype, pt->offset, realoff, &info, fd, pt->virname)) {
470 473
 			    pt = pt->next;
471 474
 			    continue;
472 475
 			}
... ...
@@ -474,100 +471,85 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
474 474
 
475 475
 		    if(pt->sigid) { /* it's a partial signature */
476 476
 
477
-			if(pt->partno == 1)
478
-			    mdata->inioff[pt->sigid - 1] = curroff;
479
-
480
-			if(mdata->partcnt[pt->sigid - 1] + 1 == pt->partno) {
481
-			    offnum = mdata->offcnt[pt->sigid - 1];
482
-			    if(offnum < AC_DEFAULT_TRACKLEN) {
483
-				mdata->partoff[pt->sigid - 1][offnum] = curroff + pt->length;
484
-
485
-				if(mdata->maxshift[pt->sigid - 1] == -1 || ((int) (mdata->partoff[pt->sigid - 1][offnum] - mdata->partoff[pt->sigid - 1][0]) <= mdata->maxshift[pt->sigid - 1]))
486
-				    mdata->offcnt[pt->sigid - 1]++;
487
-			    } else {
488
-				if(mdata->maxshift[pt->sigid - 1] == -1 || ((int) (curroff + pt->length - mdata->partoff[pt->sigid - 1][0]) <= mdata->maxshift[pt->sigid - 1])) {
489
-				    if(!(mdata->offidx[pt->sigid - 1] %= AC_DEFAULT_TRACKLEN))
490
-					mdata->offidx[pt->sigid - 1]++;
477
+			if(!mdata->offmatrix[pt->sigid - 1]) {
478
+			    mdata->offmatrix[pt->sigid - 1] = cli_malloc(pt->parts * sizeof(int32_t *));
479
+			    if(!mdata->offmatrix[pt->sigid - 1]) {
480
+				cli_errmsg("cli_ac_scanbuff: Can't allocate memory for mdata->offmatrix[%u]\n", pt->sigid - 1);
481
+				return CL_EMEM;
482
+			    }
491 483
 
492
-				    mdata->partoff[pt->sigid - 1][mdata->offidx[pt->sigid - 1]] = curroff + pt->length;
493
-				    mdata->offidx[pt->sigid - 1]++;
494
-				}
484
+			    mdata->offmatrix[pt->sigid - 1][0] = cli_malloc(pt->parts * (AC_DEFAULT_TRACKLEN + 1) * sizeof(int32_t));
485
+			    if(!mdata->offmatrix[pt->sigid - 1][0]) {
486
+				cli_errmsg("cli_ac_scanbuff: Can't allocate memory for mdata->offmatrix[%u][0]\n", pt->sigid - 1);
487
+				free(mdata->offmatrix[pt->sigid - 1]);
488
+				mdata->offmatrix[pt->sigid - 1] = NULL;
489
+				return CL_EMEM;
490
+			    }
491
+			    memset(mdata->offmatrix[pt->sigid - 1][0], -1, pt->parts * (AC_DEFAULT_TRACKLEN + 1) * sizeof(int32_t));
492
+			    mdata->offmatrix[pt->sigid - 1][0][0] = 0;
493
+			    for(j = 1; j < pt->parts; j++) {
494
+				mdata->offmatrix[pt->sigid - 1][j] = mdata->offmatrix[pt->sigid - 1][0] + j * (AC_DEFAULT_TRACKLEN + 1);
495
+				 mdata->offmatrix[pt->sigid - 1][j][0] = 0;
495 496
 			    }
497
+			}
498
+			offmatrix = mdata->offmatrix[pt->sigid - 1];
496 499
 
497
-			} else if(mdata->partcnt[pt->sigid - 1] + 2 == pt->partno) {
500
+			if(pt->partno != 1) {
498 501
 			    found = 0;
499
-			    for(j = mdata->offcnt[pt->sigid - 1] - 1; j >= 0; j--) {
502
+			    for(j = 1; j <= AC_DEFAULT_TRACKLEN && offmatrix[pt->partno - 2][j] != -1; j++) {
500 503
 				found = 1;
501 504
 				if(pt->maxdist)
502
-				    if(curroff - mdata->partoff[pt->sigid - 1][j] > pt->maxdist)
505
+				    if(realoff - offmatrix[pt->partno - 2][j] > pt->maxdist)
503 506
 					found = 0;
504 507
 
505 508
 				if(found && pt->mindist)
506
-				    if(curroff - mdata->partoff[pt->sigid - 1][j] < pt->mindist)
509
+				    if(realoff - offmatrix[pt->partno - 2][j] < pt->mindist)
507 510
 					found = 0;
508 511
 
509 512
 				if(found)
510 513
 				    break;
511 514
 			    }
515
+			}
512 516
 
513
-			    if(found) {
514
-				if(pt->maxdist)
515
-				    mdata->maxshift[pt->sigid - 1] = mdata->partoff[pt->sigid - 1][j] + pt->maxdist - curroff;
516
-				else
517
-				    mdata->maxshift[pt->sigid - 1] = -1;
518
-
519
-				mdata->partoff[pt->sigid - 1][0] = curroff + pt->length;
520
-				mdata->offcnt[pt->sigid - 1] = 1;
521
-
522
-				if(++mdata->partcnt[pt->sigid - 1] + 1 == pt->parts) {
523
-				    if(pt->type) {
524
-					if(otfrec) {
525
-					    if(pt->type > type || pt->type >= CL_TYPE_SFX || pt->type == CL_TYPE_MSEXE) {
526
-						cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, (unsigned int) mdata->inioff[pt->sigid - 1]);
527
-						mdata->offcnt[pt->sigid - 1] = 0;
528
-						mdata->offidx[pt->sigid - 1] = 0;
529
-						mdata->partcnt[pt->sigid - 1] = 0;
530
-						mdata->maxshift[pt->sigid - 1] = -1;
531
-
532
-						type = pt->type;
533
-						if(ftoffset && (!*ftoffset || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ) && ((ftype == CL_TYPE_MSEXE && type >= CL_TYPE_SFX) || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP) && type == CL_TYPE_MSEXE)))  {
534
-						    if(!(tnode = cli_calloc(1, sizeof(struct cli_matched_type)))) {
535
-							cli_errmsg("cli_ac_scanbuff(): Can't allocate memory for new type node\n");
536
-							if(info.exeinfo.section)
537
-							    free(info.exeinfo.section);
538
-							return CL_EMEM;
539
-						    }
540
-
541
-						    tnode->type = type;
542
-						    tnode->offset = mdata->inioff[pt->sigid - 1];
543
-
544
-						    if(*ftoffset && !tnode_last) {
545
-							tnode_last = *ftoffset;
546
-							while(tnode_last->next)
547
-							    tnode_last = tnode_last->next;
548
-						    }
549
-
550
-						    if(tnode_last) {
551
-							tnode_last->next = tnode;
552
-							tnode_last = tnode;
553
-						    } else {
554
-							*ftoffset = tnode;
555
-							tnode_last = tnode;
556
-						    }
557
-
558
-						    (*ftoffset)->cnt++;
517
+			if(pt->partno == 1 || (found && (pt->partno != pt->parts))) {
518
+			    offmatrix[pt->partno - 1][0] %= (AC_DEFAULT_TRACKLEN + 1);
519
+			    if(!offmatrix[pt->partno - 1][0])
520
+				offmatrix[pt->partno - 1][0]++;
521
+
522
+			    offmatrix[pt->partno - 1][offmatrix[pt->partno - 1][0]] = realoff + pt->length;
523
+			    if(pt->partno) /* save realoff for the first part */
524
+				offmatrix[pt->parts - 1][offmatrix[pt->partno - 1][0]] = realoff;
525
+			} else if(found && pt->partno == pt->parts) {
526
+			    if(pt->type) {
527
+				if(otfrec) {
528
+				    if(pt->type > type || pt->type >= CL_TYPE_SFX || pt->type == CL_TYPE_MSEXE) {
529
+					cli_dbgmsg("Matched signature for file type %s\n", pt->virname);
530
+					type = pt->type;
531
+					if(ftoffset && (!*ftoffset || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ) && ((ftype == CL_TYPE_MSEXE && type >= CL_TYPE_SFX) || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP) && type == CL_TYPE_MSEXE)))  {
532
+					    /* FIXME: we don't know which offset of the first part is the correct one */
533
+					    for(j = 1; j <= AC_DEFAULT_TRACKLEN && offmatrix[0][j] != -1; j++) {
534
+						if(ac_addtype(ftoffset, type, offmatrix[pt->parts - 1][j])) {
535
+						    if(info.exeinfo.section)
536
+							free(info.exeinfo.section);
537
+						    return CL_EMEM;
559 538
 						}
560 539
 					    }
561 540
 					}
562
-				    } else {
563
-					if(virname)
564
-					    *virname = pt->virname;
565 541
 
566
-					if(info.exeinfo.section)
567
-					    free(info.exeinfo.section);
568
-					return CL_VIRUS;
542
+					memset(offmatrix[0], -1, pt->parts * (AC_DEFAULT_TRACKLEN + 1) * sizeof(int32_t));
543
+					for(j = 0; j < pt->parts; j++)
544
+					    offmatrix[j][0] = 0;
569 545
 				    }
570 546
 				}
547
+
548
+			    } else { /* !pt->type */
549
+				if(virname)
550
+				    *virname = pt->virname;
551
+
552
+				if(info.exeinfo.section)
553
+				    free(info.exeinfo.section);
554
+
555
+				return CL_VIRUS;
571 556
 			    }
572 557
 			}
573 558
 
... ...
@@ -575,33 +557,15 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
575 575
 			if(pt->type) {
576 576
 			    if(otfrec) {
577 577
 				if(pt->type > type || pt->type >= CL_TYPE_SFX || pt->type == CL_TYPE_MSEXE) {
578
-				    cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, curroff);
578
+				    cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, realoff);
579 579
 				    type = pt->type;
580 580
 				    if(ftoffset && (!*ftoffset || (*ftoffset)->cnt < MAX_EMBEDDED_OBJ) && ((ftype == CL_TYPE_MSEXE && type >= CL_TYPE_SFX) || ((ftype == CL_TYPE_MSEXE || ftype == CL_TYPE_ZIP) && type == CL_TYPE_MSEXE)))  {
581
-					if(!(tnode = cli_calloc(1, sizeof(struct cli_matched_type)))) {
582
-					    cli_errmsg("cli_ac_scanbuff(): Can't allocate memory for new type node\n");
581
+
582
+					if(ac_addtype(ftoffset, type, realoff)) {
583 583
 					    if(info.exeinfo.section)
584 584
 						free(info.exeinfo.section);
585 585
 					    return CL_EMEM;
586 586
 					}
587
-					tnode->type = type;
588
-					tnode->offset = curroff;
589
-
590
-					if(*ftoffset && !tnode_last) {
591
-					    tnode_last = *ftoffset;
592
-					    while(tnode_last->next)
593
-						tnode_last = tnode_last->next;
594
-					}
595
-
596
-					if(tnode_last) {
597
-					    tnode_last->next = tnode;
598
-					    tnode_last = tnode;
599
-					} else {
600
-					    *ftoffset = tnode;
601
-					    tnode_last = tnode;
602
-					}
603
-
604
-					(*ftoffset)->cnt++;
605 587
 				    }
606 588
 				}
607 589
 			    }
... ...
@@ -618,8 +582,6 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
618 618
 
619 619
 		pt = pt->next;
620 620
 	    }
621
-
622
-	    current = current->fail;
623 621
 	}
624 622
     }
625 623
 
... ...
@@ -628,3 +590,244 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
628 628
 
629 629
     return otfrec ? type : CL_CLEAN;
630 630
 }
631
+
632
+/* FIXME: clean up the code. Builds one cli_ac_patt from hexsig (a whole signature or one part of a multi-part one) and hands it to cli_ac_addpatt(). */
633
+int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, uint32_t sigid, uint16_t parts, uint16_t partno, uint16_t type, uint32_t mindist, uint32_t maxdist, const char *offset, uint8_t target)
634
+{ /* returns CL_SUCCESS, CL_EPATSHORT, CL_EMEM, CL_EMALFDB, or the non-zero code reported by cli_ac_addpatt() */
635
+	struct cli_ac_patt *new;
636
+	char *pt, *hex = NULL;
637
+	uint16_t i, j, ppos = 0, pend;
638
+	uint8_t wprefix = 0, error = 0, namelen, plen = 0; /* NOTE(review): namelen is 8-bit but is assigned from strlen() results below, so name lengths above 255 do not fit */
639
+	int ret;
640
+	/* FREE_ALT releases altn, every altc[] entry, altc and the rewritten hex text, but only when at least one alternative group was recorded (new->alt != 0) */
641
+#define FREE_ALT			\
642
+    if(new->alt) {			\
643
+	free(new->altn);		\
644
+	for(i = 0; i < new->alt; i++)	\
645
+	    free(new->altc[i]);		\
646
+	free(new->altc);		\
647
+	free(hex);			\
648
+    }
649
+
650
+    if(strlen(hexsig) / 2 < root->ac_mindepth) /* two hex characters per pattern unit; measured on the raw text, before "(...)" groups are collapsed */
651
+	return CL_EPATSHORT;
652
+
653
+    if((new = (struct cli_ac_patt *) cli_calloc(1, sizeof(struct cli_ac_patt))) == NULL)
654
+	return CL_EMEM;
655
+    /* the caller-supplied metadata is stored in the new pattern unchanged */
656
+    new->type = type;
657
+    new->sigid = sigid;
658
+    new->parts = parts;
659
+    new->partno = partno;
660
+    new->mindist = mindist;
661
+    new->maxdist = maxdist;
662
+    new->target = target;
663
+    if(offset) { /* offset is optional; the pattern keeps its own copy */
664
+	new->offset = cli_strdup(offset);
665
+	if(!new->offset) {
666
+	    free(new);
667
+	    return CL_EMEM;
668
+	}
669
+    }
670
+
671
+    if(strchr(hexsig, '(')) { /* signature contains "(a|b|...)" alternative groups */
672
+	    char *hexcpy, *hexnew, *start, *h, *c;
673
+
674
+	if(!(hexcpy = cli_strdup(hexsig))) { /* scratch copy: the parser below overwrites '(' and ')' with NUL */
675
+	    if(new->offset)
676
+		free(new->offset);
677
+	    free(new);
678
+	    return CL_EMEM;
679
+	}
680
+
681
+	if(!(hexnew = (char *) cli_calloc(strlen(hexsig) + 1, 1))) { /* receives the signature with every group collapsed to "()" */
682
+	    free(hexcpy);
683
+	    if(new->offset)
684
+		free(new->offset);
685
+	    free(new);
686
+	    return CL_EMEM;
687
+	}
688
+
689
+	start = pt = hexcpy;
690
+	while((pt = strchr(start, '('))) { /* one iteration per alternative group */
691
+	    *pt++ = 0; /* terminate the text preceding the group; pt now points at the group body */
692
+
693
+	    if(!start) { /* NOTE(review): start was already passed to strchr() in the loop condition, so this check cannot fire */
694
+		error = 1;
695
+		break;
696
+	    }
697
+
698
+	    strcat(hexnew, start);
699
+	    strcat(hexnew, "()"); /* NOTE(review): if '(' is the last character of hexsig, this needs strlen(hexsig) + 2 bytes but hexnew has strlen(hexsig) + 1 */
700
+
701
+	    if(!(start = strchr(pt, ')'))) { /* unterminated group; start is left NULL */
702
+		error = 1;
703
+		break;
704
+	    }
705
+	    *start++ = 0; /* pt is now the NUL-terminated group body, start the text after ')' */
706
+
707
+	    new->alt++;
708
+	    new->altn = (uint16_t *) cli_realloc(new->altn, new->alt * sizeof(uint16_t)); /* NOTE(review): result is dereferenced on the next line without a NULL check, and the old pointer is overwritten */
709
+	    new->altn[new->alt - 1] = 0;
710
+	    new->altc = (unsigned char **) cli_realloc(new->altc, new->alt * sizeof(char *)); /* NOTE(review): same unchecked-realloc pattern as altn */
711
+	    new->altc[new->alt - 1] = NULL;
712
+
713
+	    for(i = 0; i < strlen(pt); i++) /* count the '|' separators inside the group */
714
+		if(pt[i] == '|')
715
+		    new->altn[new->alt - 1]++;
716
+
717
+	    if(!new->altn[new->alt - 1]) { /* a group without any '|' is rejected */
718
+		error = 1;
719
+		break;
720
+	    } else
721
+		new->altn[new->alt - 1]++; /* N separators delimit N + 1 alternatives */
722
+
723
+	    if(!(new->altc[new->alt - 1] = (unsigned char *) cli_calloc(new->altn[new->alt - 1], 1))) { /* one byte of storage per alternative */
724
+		error = 1;
725
+		break;
726
+	    }
727
+
728
+	    for(i = 0; i < new->altn[new->alt - 1]; i++) {
729
+		if((h = cli_strtok(pt, i, "|")) == NULL) { /* i-th '|'-separated alternative; the result is freed below */
730
+		    error = 1;
731
+		    break;
732
+		}
733
+
734
+		if((c = cli_hex2str(h)) == NULL) {
735
+		    free(h);
736
+		    error = 1;
737
+		    break;
738
+		}
739
+
740
+		new->altc[new->alt - 1][i] = *c; /* only the first decoded byte of each alternative is kept */
741
+		free(c);
742
+		free(h);
743
+	    }
744
+
745
+	    if(error)
746
+		break;
747
+	}
748
+
749
+	if(start) /* append whatever follows the last group; start is NULL when a ')' was missing */
750
+	    strcat(hexnew, start);
751
+
752
+	hex = hexnew;
753
+	free(hexcpy);
754
+
755
+	if(error) { /* NOTE(review): if the very first group has no ')', new->alt is still 0, FREE_ALT does nothing and hexnew (now in hex) is not freed */
756
+	    FREE_ALT;
757
+	    if(new->offset)
758
+		free(new->offset);
759
+	    free(new);
760
+	    return CL_EMALFDB;
761
+	}
762
+    }
763
+
764
+    if((new->pattern = cli_hex2ui(new->alt ? hex : hexsig)) == NULL) { /* convert the (possibly rewritten) hex text into 16-bit pattern units */
765
+	FREE_ALT;
766
+	if(new->offset)
767
+	    free(new->offset);
768
+	free(new);
769
+	return CL_EMALFDB;
770
+    }
771
+
772
+    new->length = strlen(new->alt ? hex : hexsig) / 2;
773
+
774
+    for(i = 0; i < root->ac_maxdepth && i < new->length; i++) { /* is there a wildcard unit within the first ac_maxdepth positions? */
775
+	if(new->pattern[i] & CLI_MATCH_WILDCARD) {
776
+	    wprefix = 1;
777
+	    break;
778
+	}
779
+    }
780
+
781
+    if(wprefix) { /* find the first longest wildcard-free run (capped at ac_maxdepth) and re-base the pattern at its start */
782
+	pend = new->length - root->ac_mindepth + 1; /* last start index that still leaves ac_mindepth units */
783
+	for(i = 0; i < pend; i++) {
784
+	    for(j = i; j < i + root->ac_maxdepth && j < new->length; j++) {
785
+		if(new->pattern[j] & CLI_MATCH_WILDCARD) {
786
+		    break;
787
+		} else {
788
+		    if(j - i + 1 > plen) { /* strictly longer run than the best so far */
789
+			plen = j - i + 1;
790
+			ppos = i;
791
+		    }
792
+		}
793
+		if(plen >= root->ac_maxdepth)
794
+		    break;
795
+	    }
796
+	    if(plen >= root->ac_maxdepth) /* a full-depth run cannot be improved on */
797
+		break;
798
+	}
799
+
800
+	if(plen < root->ac_mindepth) {
801
+	    cli_errmsg("cli_ac_addsig: Can't find a static subpattern of length %u\n", root->ac_mindepth);
802
+	    FREE_ALT;
803
+	    if(new->offset)
804
+		free(new->offset);
805
+	    free(new->pattern);
806
+	    free(new);
807
+	    return CL_EMALFDB;
808
+	}
809
+	/* prefix keeps the start of the allocated buffer; pattern becomes a pointer ppos units into it */
810
+	new->prefix = new->pattern;
811
+	new->prefix_length = ppos;
812
+	new->pattern = &new->prefix[ppos];
813
+	new->length -= ppos;
814
+
815
+	for(i = 0; i < new->prefix_length; i++) /* count the alternative placeholders that ended up in the prefix */
816
+	    if((new->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_ALTERNATIVE)
817
+		new->alt_pattern++;
818
+    }
819
+
820
+    if(new->length > root->maxpatlen) /* track the longest pattern length seen for this root */
821
+	root->maxpatlen = new->length;
822
+
823
+    if((pt = strstr(virname, " (Clam)"))) /* the stored name is cut at the first " (Clam)" */
824
+	namelen = strlen(virname) - strlen(pt);
825
+    else
826
+	namelen = strlen(virname);
827
+
828
+    if(!namelen) {
829
+	cli_errmsg("cli_ac_addsig: No virus name\n");
830
+	if(new->prefix) /* when set, prefix is the start of the buffer that pattern points into */
831
+	    free(new->prefix);
832
+	else
833
+	    free(new->pattern);
834
+	FREE_ALT;
835
+	if(new->offset)
836
+	    free(new->offset);
837
+	free(new);
838
+	return CL_EMALFDB;
839
+    }
840
+
841
+    if((new->virname = cli_calloc(namelen + 1, sizeof(char))) == NULL) {
842
+	if(new->prefix)
843
+	    free(new->prefix);
844
+	else
845
+	    free(new->pattern);
846
+	FREE_ALT;
847
+	if(new->offset)
848
+	    free(new->offset);
849
+	free(new);
850
+	return CL_EMEM;
851
+    }
852
+    strncpy(new->virname, virname, namelen); /* buffer holds namelen + 1 bytes; stays NUL-terminated assuming cli_calloc zero-fills like calloc */
853
+
854
+    if((ret = cli_ac_addpatt(root, new))) { /* non-zero means the pattern was not added, so everything is released here */
855
+	if(new->prefix)
856
+	    free(new->prefix);
857
+	else
858
+	    free(new->pattern);
859
+	free(new->virname);
860
+	FREE_ALT;
861
+	if(new->offset)
862
+	    free(new->offset);
863
+	free(new);
864
+	return ret;
865
+    }
866
+
867
+    if(new->alt) /* the rewritten hex text is no longer referenced after conversion */
868
+	free(hex);
869
+
870
+    return CL_SUCCESS;
871
+}
... ...
@@ -24,24 +24,20 @@
24 24
 #include "filetypes.h"
25 25
 #include "cltypes.h"
26 26
 
27
-#define AC_DEFAULT_DEPTH 2
27
+#define AC_DEFAULT_MIN_DEPTH 2
28
+#define AC_DEFAULT_MAX_DEPTH 3
28 29
 #define AC_DEFAULT_TRACKLEN 8
29 30
 
30 31
 struct cli_ac_data { /* state set up by cli_ac_initdata(), released by cli_ac_freedata() and passed to cli_ac_scanbuff() as mdata */
31 32
     uint32_t partsigs; /* presumably the partsigs value given to cli_ac_initdata() - TODO confirm */
32
-    off_t *inioff;
33
-    uint16_t *partcnt;
34
-    uint32_t **partoff;
35
-    uint8_t *offcnt;
36
-    uint8_t *offidx;
37
-    int32_t *maxshift;
33
+    int32_t ***offmatrix; /* replaces the six arrays removed above; layout is not visible here - presumably [signature][part][slot], confirm against cli_ac_initdata() */
38 34
 };
39 35
 
40 36
 struct cli_ac_patt {
41 37
     uint16_t *pattern, *prefix, length, prefix_length;
38
+    uint8_t depth;
42 39
     uint32_t mindist, maxdist;
43 40
     char *virname, *offset;
44
-    const char *viralias;
45 41
     uint32_t sigid;
46 42
     uint16_t parts, partno, alt, *altn, alt_pattern;
47 43
     uint8_t target;
... ...
@@ -51,9 +47,9 @@ struct cli_ac_patt {
51 51
 };
52 52
 
53 53
 struct cli_ac_node { /* one node of the Aho-Corasick trie */
54
-    uint8_t islast;
54
+    uint8_t leaf, final; /* two flags replace the single islast flag; NOTE(review): their exact meaning is not visible in this chunk */
55 55
     struct cli_ac_patt *list; /* patterns attached to this node, chained through their next pointers */
56
-    struct cli_ac_node *trans[256], *fail;
56
+    struct cli_ac_node **trans, *fail; /* trans is now a pointer rather than an embedded 256-entry array - presumably allocated only where needed, confirm in matcher-ac.c */
57 57
 };
58 58
 
59 59
 #include "matcher.h"
... ...
@@ -63,6 +59,9 @@ int cli_ac_initdata(struct cli_ac_data *data, uint32_t partsigs, uint8_t trackle
63 63
 void cli_ac_freedata(struct cli_ac_data *data);
64 64
 int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **virname, const struct cli_matcher *root, struct cli_ac_data *mdata, uint8_t otfrec, uint32_t offset, cli_file_t ftype, int fd, struct cli_matched_type **ftoffset);
65 65
 int cli_ac_buildtrie(struct cli_matcher *root);
66
+int cli_ac_init(struct cli_matcher *root, uint8_t mindepth, uint8_t maxdepth);
66 67
 void cli_ac_free(struct cli_matcher *root);
68
+int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, uint32_t sigid, uint16_t parts, uint16_t partno, uint16_t type, uint32_t mindist, uint32_t maxdist, const char *offset, uint8_t target);
69
+
67 70
 
68 71
 #endif
... ...
@@ -46,9 +46,10 @@ struct cli_matcher {
46 46
     struct cli_bm_patt **bm_suffix;
47 47
 
48 48
     /* Extended Aho-Corasick */
49
-    uint8_t ac_depth;
49
+    uint8_t ac_mindepth, ac_maxdepth;
50 50
     struct cli_ac_node *ac_root, **ac_nodetable;
51
-    uint32_t ac_partsigs, ac_nodes;
51
+    struct cli_ac_patt **ac_pattable;
52
+    uint32_t ac_partsigs, ac_nodes, ac_patterns;
52 53
 };
53 54
 
54 55
 #define CL_TARGET_TABLE_SIZE 7
... ...
@@ -82,237 +82,6 @@ static pthread_mutex_t cli_ref_mutex = PTHREAD_MUTEX_INITIALIZER;
82 82
 int cl_loaddb(const char *filename, struct cl_engine **engine, unsigned int *signo);
83 83
 int cl_loaddbdir(const char *dirname, struct cl_engine **engine, unsigned int *signo);
84 84
 
85
-/* TODO: clean up the code */
86
-
87
-static int cli_ac_addsig(struct cli_matcher *root, const char *virname, const char *hexsig, int sigid, int parts, int partno, unsigned short type, unsigned int mindist, unsigned int maxdist, const char *offset, unsigned short target)
88
-{
89
-	struct cli_ac_patt *new;
90
-	char *pt, *hex = NULL;
91
-	int virlen, ret, error = 0;
92
-	unsigned int i, j, wprefix = 0;
93
-
94
-#define FREE_ALT			\
95
-    if(new->alt) {			\
96
-	free(new->altn);		\
97
-	for(i = 0; i < new->alt; i++)	\
98
-	    free(new->altc[i]);		\
99
-	free(new->altc);		\
100
-	free(hex);			\
101
-    }
102
-
103
-    if(strlen(hexsig) / 2 < AC_DEFAULT_DEPTH)
104
-	return CL_EPATSHORT;
105
-
106
-    if((new = (struct cli_ac_patt *) cli_calloc(1, sizeof(struct cli_ac_patt))) == NULL)
107
-	return CL_EMEM;
108
-
109
-    new->type = type;
110
-    new->sigid = sigid;
111
-    new->parts = parts;
112
-    new->partno = partno;
113
-    new->mindist = mindist;
114
-    new->maxdist = maxdist;
115
-    new->target = target;
116
-    if(offset) {
117
-	new->offset = cli_strdup(offset);
118
-	if(!new->offset)
119
-	    return CL_EMEM;
120
-    }
121
-
122
-    if(strchr(hexsig, '(')) {
123
-	    char *hexcpy, *hexnew, *start, *h, *c;
124
-
125
-	if(!(hexcpy = cli_strdup(hexsig))) {
126
-	    if(new->offset)
127
-		free(new->offset);
128
-	    free(new);
129
-	    return CL_EMEM;
130
-	}
131
-
132
-	if(!(hexnew = (char *) cli_calloc(strlen(hexsig) + 1, 1))) {
133
-	    free(hexcpy);
134
-	    if(new->offset)
135
-		free(new->offset);
136
-	    free(new);
137
-	    return CL_EMEM;
138
-	}
139
-
140
-	start = pt = hexcpy;
141
-	while((pt = strchr(start, '('))) {
142
-	    *pt++ = 0;
143
-
144
-	    if(!start) {
145
-		error = 1;
146
-		break;
147
-	    }
148
-
149
-	    strcat(hexnew, start);
150
-	    strcat(hexnew, "@@");
151
-
152
-	    if(!(start = strchr(pt, ')'))) {
153
-		error = 1;
154
-		break;
155
-	    }
156
-	    *start++ = 0;
157
-
158
-	    new->alt++;
159
-	    new->altn = (unsigned short int *) cli_realloc(new->altn, new->alt * sizeof(unsigned short int));
160
-	    new->altn[new->alt - 1] = 0;
161
-	    new->altc = (unsigned char **) cli_realloc(new->altc, new->alt * sizeof(char *));
162
-	    new->altc[new->alt - 1] = NULL;
163
-
164
-	    for(i = 0; i < strlen(pt); i++)
165
-		if(pt[i] == '|')
166
-		    new->altn[new->alt - 1]++;
167
-
168
-	    if(!new->altn[new->alt - 1]) {
169
-		error = 1;
170
-		break;
171
-	    } else
172
-		new->altn[new->alt - 1]++;
173
-
174
-	    if(!(new->altc[new->alt - 1] = (unsigned char *) cli_calloc(new->altn[new->alt - 1], 1))) {
175
-		error = 1;
176
-		break;
177
-	    }
178
-
179
-	    for(i = 0; i < new->altn[new->alt - 1]; i++) {
180
-		if((h = cli_strtok(pt, i, "|")) == NULL) {
181
-		    error = 1;
182
-		    break;
183
-		}
184
-
185
-		if((c = cli_hex2str(h)) == NULL) {
186
-		    free(h);
187
-		    error = 1;
188
-		    break;
189
-		}
190
-
191
-		new->altc[new->alt - 1][i] = *c;
192
-		free(c);
193
-		free(h);
194
-	    }
195
-
196
-	    if(error)
197
-		break;
198
-	}
199
-
200
-	if(start)
201
-	    strcat(hexnew, start);
202
-
203
-	hex = hexnew;
204
-	free(hexcpy);
205
-
206
-	if(error) {
207
-	    FREE_ALT;
208
-	    if(new->offset)
209
-		free(new->offset);
210
-	    free(new);
211
-	    return CL_EMALFDB;
212
-	}
213
-    }
214
-
215
-    if((new->pattern = cli_hex2ui(new->alt ? hex : hexsig)) == NULL) {
216
-	FREE_ALT;
217
-	if(new->offset)
218
-	    free(new->offset);
219
-	free(new);
220
-	return CL_EMALFDB;
221
-    }
222
-
223
-    new->length = strlen(new->alt ? hex : hexsig) / 2;
224
-
225
-    for(i = 0; i < AC_DEFAULT_DEPTH; i++) {
226
-	if(new->pattern[i] & CLI_MATCH_WILDCARD) {
227
-	    wprefix = 1;
228
-	    break;
229
-	}
230
-    }
231
-
232
-    if(wprefix) {
233
-	for(; i < (uint16_t) (new->length - AC_DEFAULT_DEPTH + 1); i++) {
234
-	    wprefix = 0;
235
-	    for(j = i; j < i + AC_DEFAULT_DEPTH; j++) {
236
-		if(new->pattern[j] & CLI_MATCH_WILDCARD) {
237
-		    wprefix = 1;
238
-		    break;
239
-		}
240
-	    }
241
-	    if(!wprefix)
242
-		break;
243
-	}
244
-
245
-	if(wprefix) {
246
-	    FREE_ALT;
247
-	    if(new->offset)
248
-		free(new->offset);
249
-	    free(new->pattern);
250
-	    free(new);
251
-	    return CL_EMALFDB;
252
-	}
253
-
254
-	new->prefix = new->pattern;
255
-	new->prefix_length = i;
256
-	new->pattern = &new->prefix[i];
257
-	new->length -= i;
258
-
259
-	for(i = 0; i < new->prefix_length; i++)
260
-	    if((new->prefix[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_ALTERNATIVE)
261
-		new->alt_pattern++;
262
-    }
263
-
264
-    if(new->length > root->maxpatlen)
265
-	root->maxpatlen = new->length;
266
-
267
-    if((pt = strstr(virname, "(Clam)")))
268
-	virlen = strlen(virname) - strlen(pt) - 1;
269
-    else
270
-	virlen = strlen(virname);
271
-
272
-    if(virlen <= 0) {
273
-	if(new->prefix)
274
-	    free(new->prefix);
275
-	else
276
-	    free(new->pattern);
277
-	FREE_ALT;
278
-	if(new->offset)
279
-	    free(new->offset);
280
-	free(new);
281
-	return CL_EMALFDB;
282
-    }
283
-
284
-    if((new->virname = cli_calloc(virlen + 1, sizeof(char))) == NULL) {
285
-	if(new->prefix)
286
-	    free(new->prefix);
287
-	else
288
-	    free(new->pattern);
289
-	FREE_ALT;
290
-	if(new->offset)
291
-	    free(new->offset);
292
-	free(new);
293
-	return CL_EMEM;
294
-    }
295
-
296
-    strncpy(new->virname, virname, virlen);
297
-
298
-    if((ret = cli_ac_addpatt(root, new))) {
299
-	if(new->prefix)
300
-	    free(new->prefix);
301
-	else
302
-	    free(new->pattern);
303
-	free(new->virname);
304
-	FREE_ALT;
305
-	if(new->offset)
306
-	    free(new->offset);
307
-	free(new);
308
-	return ret;
309
-    }
310
-
311
-    if(new->alt)
312
-	free(hex);
313
-
314
-    return CL_SUCCESS;
315
-}
316 85
 
317 86
 int cli_parse_add(struct cli_matcher *root, const char *virname, const char *hexsig, unsigned short type, const char *offset, unsigned short target)
318 87
 {
... ...
@@ -570,11 +339,10 @@ static int cli_initroots(struct cl_engine *engine, unsigned int options)
570 570
 	    }
571 571
 
572 572
 	    cli_dbgmsg("Initialising AC pattern matcher of root[%d]\n", i);
573
-	    root->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
574
-	    if(!root->ac_root) {
573
+	    if((ret = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
575 574
 		/* no need to free previously allocated memory here */
576 575
 		cli_errmsg("Can't initialise AC pattern matcher\n");
577
-		return CL_EMEM;
576
+		return ret;
578 577
 	    }
579 578
 
580 579
 	    if(!root->ac_only) {
... ...
@@ -97,7 +97,7 @@ uint16_t *cli_hex2ui(const char *hex)
97 97
 	    }
98 98
 	    val |= CLI_MATCH_NIBBLE_LOW;
99 99
 
100
-	} else if(hex[i] == '@') {
100
+	} else if(hex[i] == '(') {
101 101
 	    val |= CLI_MATCH_ALTERNATIVE;
102 102
 
103 103
 	} else {