Browse code

pattern matcher accuracy improvements

git-svn: trunk@2505

Tomasz Kojm authored on 2006/11/16 00:26:54
Showing 11 changed files
... ...
@@ -1,3 +1,10 @@
1
+Wed Nov 15 16:18:09 CET 2006 (tk)
2
+---------------------------------
3
+  * libclamav: + the AC matcher now keeps track of partial matches to improve
4
+		 the accuracy of signatures with range wildcards
5
+	       + add cli_ac_initdata() and cli_ac_freedata()
6
+	       + fix some signedness warnings
7
+
1 8
 Tue Nov 14 13:49:58 GMT 2006 (trog)
2 9
 -----------------------------------
3 10
   * libclamav/ole2_extract.c: Fix Solaris endian issue. (bb#89)
... ...
@@ -104,7 +104,8 @@ extern "C"
104 104
 
105 105
 /* internal structures */
106 106
 struct cli_bm_patt {
107
-    char *pattern, *virname, *offset;
107
+    unsigned char *pattern;
108
+    char *virname, *offset;
108 109
     const char *viralias;
109 110
     unsigned int length;
110 111
     unsigned short target;
... ...
@@ -26,6 +26,9 @@
26 26
 #include <string.h>
27 27
 #include <stdlib.h>
28 28
 #include <ctype.h>
29
+#ifdef	HAVE_UNISTD_H
30
+#include <unistd.h>
31
+#endif
29 32
 
30 33
 #include "clamav.h"
31 34
 #include "filetypes.h"
... ...
@@ -233,13 +236,11 @@ int is_tar(unsigned char *buf, unsigned int nbytes);
233 233
 
234 234
 cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
235 235
 {
236
-	char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded;
237
-	unsigned char *bigbuff;
236
+	unsigned char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded, *bigbuff;
238 237
 	int bread, sret;
239 238
 	cli_file_t ret = CL_TYPE_UNKNOWN_DATA;
240 239
 	struct cli_matcher *root;
241
-	int *partcnt;
242
-	unsigned long int *partoff;
240
+	struct cli_ac_data mdata;
243 241
 
244 242
 
245 243
     memset(smallbuff, 0, sizeof(smallbuff));
... ...
@@ -251,33 +252,28 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
251 251
 	if(!root)
252 252
 	    return ret;
253 253
 
254
-	if((partcnt = (int *) cli_calloc(root->ac_partsigs + 1, sizeof(int))) == NULL) {
255
-	    cli_warnmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(int));
254
+	if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
256 255
 	    return ret;
257
-	}
258 256
 
259
-	if((partoff = (unsigned long int *) cli_calloc(root->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
260
-	    cli_dbgmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(unsigned long int));
261
-	    free(partcnt);
262
-	    return ret;
263
-	}
257
+	sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
258
+
259
+	cli_ac_freedata(&mdata);
264 260
 
265
-	sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
266 261
 	if(sret >= CL_TYPENO) {
267 262
 	    ret = sret;
268 263
 	} else {
269
-	    memset(partcnt, 0, (root->ac_partsigs + 1) * sizeof(int));
270
-	    memset(partoff, 0, (root->ac_partsigs + 1) * sizeof(unsigned long int));
264
+	    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
265
+		return ret;
266
+
271 267
 	    decoded = cli_utf16toascii(smallbuff, bread);
272 268
 	    if(decoded) {
273
-		sret = cli_ac_scanbuff(decoded, strlen(decoded), NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
269
+		sret = cli_ac_scanbuff(decoded, strlen(decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
274 270
 		free(decoded);
275 271
 		if(sret == CL_TYPE_HTML)
276 272
 		    ret = CL_TYPE_HTML_UTF16;
277 273
 	    }
274
+	    cli_ac_freedata(&mdata);
278 275
 	}
279
-	free(partcnt);
280
-	free(partoff);
281 276
     }
282 277
 
283 278
     if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
... ...
@@ -63,14 +63,14 @@ int cli_ac_addpatt(struct cli_matcher *root, struct cli_ac_patt *pattern)
63 63
 	if(!next) {
64 64
 	    next = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
65 65
 	    if(!next) {
66
-		cli_dbgmsg("Unable to allocate AC node (%d)\n", sizeof(struct cli_ac_node));
66
+		cli_errmsg("cli_ac_addpatt(): Unable to allocate AC node (%u bytes)\n", sizeof(struct cli_ac_node));
67 67
 		return CL_EMEM;
68 68
 	    }
69 69
 
70 70
 	    root->ac_nodes++;
71 71
 	    root->ac_nodetable = (struct cli_ac_node **) cli_realloc(root->ac_nodetable, (root->ac_nodes) * sizeof(struct cli_ac_node *));
72 72
 	    if(root->ac_nodetable == NULL) {
73
-		cli_dbgmsg("Unable to realloc nodetable (%d)\n", (root->ac_nodes) * sizeof(struct cli_matcher *));
73
+		cli_errmsg("cli_ac_addpatt(): Unable to realloc nodetable (%u bytes)\n", (root->ac_nodes) * sizeof(struct cli_matcher *));
74 74
 		return CL_EMEM;
75 75
 	    }
76 76
 	    root->ac_nodetable[root->ac_nodes - 1] = next;
... ...
@@ -95,7 +95,7 @@ static int cli_enqueue(struct nodelist **bfs, struct cli_ac_node *n)
95 95
 
96 96
     new = (struct nodelist *) cli_calloc(1, sizeof(struct nodelist));
97 97
     if (new == NULL) {
98
-	cli_dbgmsg("Unable to allocate node list (%d)\n", sizeof(struct nodelist));
98
+	cli_errmsg("cli_enqueue(): Unable to allocate node list (%u bytes)\n", sizeof(struct nodelist));
99 99
 	return CL_EMEM;
100 100
     }
101 101
 
... ...
@@ -176,7 +176,7 @@ int cli_ac_buildtrie(struct cli_matcher *root)
176 176
 	return CL_EMALFDB;
177 177
 
178 178
     if(!root->ac_root) {
179
-	cli_dbgmsg("AC pattern matcher not initialised\n");
179
+	cli_dbgmsg("cli_ac_buildtrie(): AC pattern matcher is not initialised\n");
180 180
 	return CL_SUCCESS;
181 181
     }
182 182
 
... ...
@@ -228,7 +228,7 @@ void cli_ac_free(struct cli_matcher *root)
228 228
 	free(root->ac_root);
229 229
 }
230 230
 
231
-inline static int cli_findpos(const char *buffer, unsigned int depth, unsigned int offset, unsigned int length, const struct cli_ac_patt *pattern)
231
+inline static int cli_findpos(const unsigned char *buffer, unsigned int depth, unsigned int offset, unsigned int length, const struct cli_ac_patt *pattern)
232 232
 {
233 233
 	unsigned int bufferpos = offset + depth;
234 234
 	unsigned int postfixend = offset + length;
... ...
@@ -260,7 +260,7 @@ inline static int cli_findpos(const char *buffer, unsigned int depth, unsigned i
260 260
 		return 0;
261 261
 	    alt++;
262 262
 
263
-	} else if(pattern->pattern[i] != CLI_IGN && (char) pattern->pattern[i] != buffer[bufferpos])
263
+	} else if(pattern->pattern[i] != CLI_IGN && (unsigned char) pattern->pattern[i] != buffer[bufferpos])
264 264
 	    return 0;
265 265
 
266 266
 	bufferpos++;
... ...
@@ -288,7 +288,7 @@ inline static int cli_findpos(const char *buffer, unsigned int depth, unsigned i
288 288
 		    return 0;
289 289
 		alt++;
290 290
 
291
-	    } else if(pattern->prefix[i] != CLI_IGN && (char) pattern->prefix[i] != buffer[bufferpos])
291
+	    } else if(pattern->prefix[i] != CLI_IGN && (unsigned char) pattern->prefix[i] != buffer[bufferpos])
292 292
 		return 0;
293 293
 
294 294
 	    bufferpos++;
... ...
@@ -298,27 +298,119 @@ inline static int cli_findpos(const char *buffer, unsigned int depth, unsigned i
298 298
     return 1;
299 299
 }
300 300
 
301
-int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, int *partcnt, unsigned short otfrec, unsigned long int offset, unsigned long int *partoff, unsigned short ftype, int fd, struct cli_matched_type **ftoffset)
301
+int cli_ac_initdata(struct cli_ac_data *data, unsigned int partsigs, unsigned int tracklen)
302
+{
303
+	unsigned int i, j;
304
+
305
+
306
+    if(!data) {
307
+	cli_errmsg("cli_ac_init(): data == NULL\n");
308
+	return CL_ENULLARG;
309
+    }
310
+
311
+    data->partsigs = partsigs;
312
+
313
+    if(!partsigs)
314
+	return CL_SUCCESS;
315
+
316
+    data->partcnt = (unsigned int *) cli_calloc(partsigs, sizeof(unsigned int));
317
+
318
+    if(!data->partcnt) {
319
+	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(unsigned int));
320
+	return CL_EMEM;
321
+    }
322
+
323
+    data->offcnt = (unsigned int *) cli_calloc(partsigs, sizeof(unsigned int));
324
+
325
+    if(!data->offcnt) {
326
+	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(unsigned int));
327
+	free(data->partcnt);
328
+	return CL_EMEM;
329
+    }
330
+
331
+    data->maxshift = (int *) cli_malloc(partsigs * sizeof(int));
332
+
333
+    if(!data->maxshift) {
334
+	cli_errmsg("cli_ac_init(): unable to cli_malloc(%u)\n", partsigs * sizeof(int));
335
+	free(data->partcnt);
336
+	free(data->offcnt);
337
+	return CL_EMEM;
338
+    }
339
+
340
+    memset(data->maxshift, -1, partsigs * sizeof(int));
341
+
342
+    data->partoff = (unsigned int **) cli_calloc(partsigs, sizeof(unsigned int *));
343
+
344
+    if(!data->partoff) {
345
+	cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", partsigs, sizeof(unsigned int));
346
+	free(data->partcnt);
347
+	free(data->offcnt);
348
+	free(data->maxshift);
349
+	return CL_EMEM;
350
+    }
351
+
352
+    /* The number of multipart signatures is rather small so we already
353
+     * allocate the memory for all parts here instead of using a runtime
354
+     * allocation in cli_ac_scanbuff()
355
+     */
356
+
357
+    for(i = 0; i < partsigs; i++) {
358
+	data->partoff[i] = (unsigned int *) cli_calloc(tracklen, sizeof(unsigned int));
359
+
360
+	if(!data->partoff[i]) {
361
+	    for(j = 0; j < i; j++)
362
+		free(data->partoff[j]);
363
+
364
+	    free(data->partoff);
365
+	    free(data->partcnt);
366
+	    free(data->offcnt);
367
+	    free(data->maxshift);
368
+	    cli_errmsg("cli_ac_init(): unable to cli_calloc(%u, %u)\n", tracklen, sizeof(unsigned int));
369
+	    return CL_EMEM;
370
+	}
371
+    }
372
+
373
+    return CL_SUCCESS;
374
+}
375
+
376
+void cli_ac_freedata(struct cli_ac_data *data)
377
+{
378
+	unsigned int i;
379
+
380
+
381
+    if(data && data->partsigs) {
382
+	free(data->partcnt);
383
+	free(data->offcnt);
384
+	free(data->maxshift);
385
+
386
+	for(i = 0; i < data->partsigs; i++)
387
+	    free(data->partoff[i]);
388
+
389
+	free(data->partoff);
390
+    }
391
+}
392
+
393
+int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, struct cli_ac_data *mdata, unsigned short otfrec, unsigned long int offset, unsigned short ftype, int fd, struct cli_matched_type **ftoffset)
302 394
 {
303 395
 	struct cli_ac_node *current;
304 396
 	struct cli_ac_patt *pt;
305
-	int type = CL_CLEAN, dist, t;
306
-        unsigned int i, position;
397
+	int type = CL_CLEAN, t, j;
398
+        unsigned int i, position, idx, found, curroff;
307 399
 	struct cli_matched_type *tnode;
308 400
 
309 401
 
310 402
     if(!root->ac_root)
311 403
 	return CL_CLEAN;
312 404
 
313
-    if(!partcnt || !partoff) {
314
-	cli_dbgmsg("cli_ac_scanbuff(): partcnt == NULL || partoff == NULL\n");
405
+    if(!mdata) {
406
+	cli_errmsg("cli_ac_scanbuff(): mdata == NULL\n");
315 407
 	return CL_ENULLARG;
316 408
     }
317 409
 
318 410
     current = root->ac_root;
319 411
 
320 412
     for(i = 0; i < length; i++)  {
321
-	current = current->trans[(unsigned char) buffer[i] & 0xff];
413
+	current = current->trans[buffer[i] & 0xff];
322 414
 
323 415
 	if(current->islast) {
324 416
 	    position = i - ac_depth + 1;
... ...
@@ -326,46 +418,67 @@ int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char
326 326
 	    pt = current->list;
327 327
 	    while(pt) {
328 328
 		if(cli_findpos(buffer, ac_depth, position, length, pt)) {
329
+		    curroff = offset + position - pt->prefix_length;
330
+
329 331
 		    if((pt->offset || pt->target) && (!pt->sigid || pt->partno == 1)) {
330 332
 			if(ftype == CL_TYPE_UNKNOWN_TEXT)
331 333
 			    t = type;
332 334
 			else
333 335
 			    t = ftype;
334 336
 
335
-			if((fd == -1 && !t) || !cli_validatesig(t, pt->offset, offset + position - pt->prefix_length, fd, pt->virname)) {
337
+			if((fd == -1 && !t) || !cli_validatesig(t, pt->offset, curroff, fd, pt->virname)) {
336 338
 			    pt = pt->next;
337 339
 			    continue;
338 340
 			}
339 341
 		    }
340 342
 
341 343
 		    if(pt->sigid) { /* it's a partial signature */
342
-			if(partcnt[pt->sigid] + 1 == pt->partno) {
343
-			    dist = 1;
344
-			    if(pt->maxdist)
345
-				if((offset + i - pt->prefix_length) - partoff[pt->sigid] > pt->maxdist)
346
-				    dist = 0;
347 344
 
348
-			    if(dist && pt->mindist)
349
-				if((offset + i - pt->prefix_length) - partoff[pt->sigid] < pt->mindist)
350
-				    dist = 0;
345
+			if(mdata->partcnt[pt->sigid - 1] + 1 == pt->partno) {
346
+			    idx = mdata->offcnt[pt->sigid - 1];
347
+			    if(idx < AC_DEFAULT_TRACKLEN) {
348
+				mdata->partoff[pt->sigid - 1][idx] = curroff + pt->length;
349
+
350
+				if(mdata->maxshift[pt->sigid - 1] == -1 || ((int) (mdata->partoff[pt->sigid - 1][idx] - mdata->partoff[pt->sigid - 1][0]) <= mdata->maxshift[pt->sigid - 1]))
351
+				    mdata->offcnt[pt->sigid - 1]++;
352
+			    }
351 353
 
352
-			    if(dist) {
353
-				partoff[pt->sigid] = offset + i + pt->length;
354
+			} else if(mdata->partcnt[pt->sigid - 1] + 2 == pt->partno) {
355
+			    found = 0;
356
+			    for(j = mdata->offcnt[pt->sigid - 1] - 1; j >= 0; j--) {
357
+				found = 1;
358
+				if(pt->maxdist)
359
+				    if(curroff - mdata->partoff[pt->sigid - 1][j] > pt->maxdist)
360
+					found = 0;
354 361
 
355
-				if(++partcnt[pt->sigid] == pt->parts) { /* the last one */
362
+				if(found && pt->mindist)
363
+				    if(curroff - mdata->partoff[pt->sigid - 1][j] < pt->mindist)
364
+					found = 0;
365
+
366
+				if(found)
367
+				    break;
368
+			    }
369
+
370
+			    if(found) {
371
+				mdata->maxshift[pt->sigid - 1] = mdata->partoff[pt->sigid - 1][j] + pt->maxdist - curroff;
372
+
373
+				mdata->partoff[pt->sigid - 1][0] = curroff + pt->length;
374
+				mdata->offcnt[pt->sigid - 1] = 1;
375
+
376
+				if(++mdata->partcnt[pt->sigid - 1] + 1 == pt->parts) {
356 377
 				    if(pt->type) {
357 378
 					if(otfrec) {
358 379
 					    if(pt->type > type || pt->type >= CL_TYPE_SFX) {
359
-						cli_dbgmsg("Matched signature for file type %s at %d\n", pt->virname, offset + position - pt->prefix_length);
380
+						cli_dbgmsg("Matched signature for file type %s\n", pt->virname);
360 381
 						type = pt->type;
361 382
 						if(ftoffset && (!*ftoffset || (*ftoffset)->cnt < SFX_MAX_TESTS) && ftype == CL_TYPE_MSEXE && type >= CL_TYPE_SFX) {
362 383
 						    if(!(tnode = cli_calloc(1, sizeof(struct cli_matched_type)))) {
363
-							cli_errmsg("Can't alloc memory for new type node\n");
384
+							cli_errmsg("cli_ac_scanbuff(): Can't allocate memory for new type node\n");
364 385
 							return CL_EMEM;
365 386
 						    }
366 387
 
367 388
 						    tnode->type = type;
368
-						    tnode->offset = offset + position - pt->prefix_length;
389
+						    tnode->offset = -1; /* we don't remember the offset of the first part */
369 390
 
370 391
 						    if(*ftoffset)
371 392
 							tnode->cnt = (*ftoffset)->cnt + 1;
... ...
@@ -374,7 +487,6 @@ int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char
374 374
 
375 375
 						    tnode->next = *ftoffset;
376 376
 						    *ftoffset = tnode;
377
-
378 377
 						}
379 378
 					    }
380 379
 					}
... ...
@@ -392,15 +504,15 @@ int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char
392 392
 			if(pt->type) {
393 393
 			    if(otfrec) {
394 394
 				if(pt->type > type || pt->type >= CL_TYPE_SFX) {
395
-				    cli_dbgmsg("Matched signature for file type %s at %d\n", pt->virname, offset + position - pt->prefix_length);
395
+				    cli_dbgmsg("Matched signature for file type %s at %u\n", pt->virname, curroff);
396 396
 				    type = pt->type;
397 397
 				    if(ftoffset && (!*ftoffset ||(*ftoffset)->cnt < SFX_MAX_TESTS) && ftype == CL_TYPE_MSEXE && type >= CL_TYPE_SFX) {
398 398
 					if(!(tnode = cli_calloc(1, sizeof(struct cli_matched_type)))) {
399
-					    cli_errmsg("Can't alloc memory for new type node\n");
399
+					    cli_errmsg("cli_ac_scanbuff(): Can't allocate memory for new type node\n");
400 400
 					    return CL_EMEM;
401 401
 					}
402 402
 					tnode->type = type;
403
-					tnode->offset = offset + position - pt->prefix_length;
403
+					tnode->offset = curroff;
404 404
 
405 405
 					if(*ftoffset)
406 406
 					    tnode->cnt = (*ftoffset)->cnt + 1;
... ...
@@ -25,9 +25,20 @@
25 25
 #include "filetypes.h"
26 26
 
27 27
 #define AC_DEFAULT_DEPTH 2
28
+#define AC_DEFAULT_TRACKLEN 8
29
+
30
+struct cli_ac_data { /* per-scan state for multipart (partial) AC signatures; set up by cli_ac_initdata(), released by cli_ac_freedata() */
31
+    unsigned int partsigs; /* number of slots in each array below (the partsigs argument of cli_ac_initdata()) */
32
+    unsigned int *partcnt; /* per signature (index sigid - 1): counter compared against pt->partno and incremented in cli_ac_scanbuff() */
33
+    unsigned int **partoff; /* per signature: array of tracklen offsets; cli_ac_scanbuff() stores curroff + pt->length here */
34
+    unsigned int *offcnt; /* per signature: number of entries currently used in partoff[i] */
35
+    int *maxshift; /* per signature: initialised to -1; otherwise limits how far a stored offset may lie from partoff[i][0] */
36
+};
28 37
 
29 38
 int cli_ac_addpatt(struct cli_matcher *root, struct cli_ac_patt *pattern);
30
-int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, int *partcnt, unsigned short otfrec, unsigned long int offset, unsigned long int *partoff, unsigned short ftype, int fd, struct cli_matched_type **ftoffset);
39
+int cli_ac_initdata(struct cli_ac_data *data, unsigned int partsigs, unsigned int histlen);
40
+void cli_ac_freedata(struct cli_ac_data *data);
41
+int cli_ac_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, struct cli_ac_data *mdata, unsigned short otfrec, unsigned long int offset, unsigned short ftype, int fd, struct cli_matched_type **ftoffset);
31 42
 int cli_ac_buildtrie(struct cli_matcher *root);
32 43
 void cli_ac_free(struct cli_matcher *root);
33 44
 void cli_ac_setdepth(unsigned int depth);
... ...
@@ -32,15 +32,14 @@
32 32
 /* #define BM_TEST_OFFSET	5 */
33 33
 #define BM_BLOCK_SIZE	3
34 34
 
35
-#define HASH(a,b,c) 211 * (unsigned char) a + 37 * (unsigned char) b + (unsigned char) c
36
-#define DHASH(a,b,c) 211 * a + 37 * b + c
/* Hash of a three-byte block; HASH(256, 256, 256) is also used as the size
 * of the tables.  Every argument is parenthesised so that expressions such
 * as HASH(x + 1, y, z) expand with the intended precedence.  The previous
 * per-argument (unsigned char) casts are gone, so callers are expected to
 * pass unsigned byte values.
 */
#define HASH(a,b,c) (211 * (a) + 37 * (b) + (c))
37 36
 
38 37
 
39 38
 int cli_bm_addpatt(struct cli_matcher *root, struct cli_bm_patt *pattern)
40 39
 {
41 40
 	int i;
42 41
 	uint16_t idx;
43
-	const char *pt = pattern->pattern;
42
+	const unsigned char *pt = pattern->pattern;
44 43
 	struct cli_bm_patt *prev, *next = NULL;
45 44
 
46 45
 
... ...
@@ -80,7 +79,7 @@ int cli_bm_addpatt(struct cli_matcher *root, struct cli_bm_patt *pattern)
80 80
 int cli_bm_init(struct cli_matcher *root)
81 81
 {
82 82
 	unsigned int i;
83
-	unsigned int size = DHASH(256, 256, 256);
83
+	unsigned int size = HASH(256, 256, 256);
84 84
 
85 85
 
86 86
     cli_dbgmsg("in cli_bm_init()\n");
... ...
@@ -104,7 +103,7 @@ void cli_bm_free(struct cli_matcher *root)
104 104
 {
105 105
 	struct cli_bm_patt *b1, *b2;
106 106
 	unsigned int i;
107
-	unsigned int size = DHASH(256, 256, 256);
107
+	unsigned int size = HASH(256, 256, 256);
108 108
 
109 109
 
110 110
     if(root->bm_shift)
... ...
@@ -129,14 +128,14 @@ void cli_bm_free(struct cli_matcher *root)
129 129
     }
130 130
 }
131 131
 
132
-int cli_bm_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype, int fd)
132
+int cli_bm_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype, int fd)
133 133
 {
134 134
 	unsigned int i, j, shift, off, found = 0;
135 135
 	int idxtest;
136 136
 	uint16_t idx;
137 137
 	struct cli_bm_patt *p;
138
-	const char *bp;
139
-	char prefix;
138
+	const unsigned char *bp;
139
+	unsigned char prefix;
140 140
 
141 141
 
142 142
     if(!root->bm_shift)
... ...
@@ -26,7 +26,7 @@
26 26
 
27 27
 int cli_bm_addpatt(struct cli_matcher *root, struct cli_bm_patt *pattern);
28 28
 int cli_bm_init(struct cli_matcher *root);
29
-int cli_bm_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype, int fd);
29
+int cli_bm_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cli_matcher *root, unsigned long int offset, unsigned short ftype, int fd);
30 30
 void cli_bm_free(struct cli_matcher *root);
31 31
 
32 32
 #endif
... ...
@@ -52,10 +52,10 @@ extern short cli_debug_flag;
52 52
 #endif
53 53
 
54 54
 
55
-int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_engine *engine, unsigned short ftype)
55
+int cli_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cl_engine *engine, unsigned short ftype)
56 56
 {
57
-	int ret = CL_CLEAN, i, tid = 0, *partcnt;
58
-	unsigned long int *partoff;
57
+	int ret = CL_CLEAN, i, tid = 0;
58
+	struct cli_ac_data mdata;
59 59
 	struct cli_matcher *groot, *troot = NULL;
60 60
 #ifdef HAVE_NCORE
61 61
 	void *streamhandle;
... ...
@@ -209,43 +209,26 @@ int cli_scanbuff(const char *buffer, unsigned int length, const char **virname,
209 209
 
210 210
     if(troot) {
211 211
 
212
-	if((partcnt = (int *) cli_calloc(troot->ac_partsigs + 1, sizeof(int))) == NULL) {
213
-	    cli_dbgmsg("cli_scanbuff(): unable to cli_calloc(%d, %d)\n", troot->ac_partsigs + 1, sizeof(int));
214
-	    return CL_EMEM;
215
-	}
216
-
217
-	if((partoff = (unsigned long int *) cli_calloc(troot->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
218
-	    cli_dbgmsg("cli_scanbuff(): unable to cli_calloc(%d, %d)\n", troot->ac_partsigs + 1, sizeof(unsigned long int));
219
-	    free(partcnt);
220
-	    return CL_EMEM;
221
-	}
212
+	if((ret = cli_ac_initdata(&mdata, troot->ac_partsigs, AC_DEFAULT_TRACKLEN)))
213
+	    return ret;
222 214
 
223 215
 	if(troot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, troot, 0, ftype, -1)) != CL_VIRUS)
224
-	    ret = cli_ac_scanbuff(buffer, length, virname, troot, partcnt, 0, 0, partoff, ftype, -1, NULL);
216
+	    ret = cli_ac_scanbuff(buffer, length, virname, troot, &mdata, 0, 0, ftype, -1, NULL);
225 217
 
226
-	free(partcnt);
227
-	free(partoff);
218
+	cli_ac_freedata(&mdata);
228 219
 
229 220
 	if(ret == CL_VIRUS)
230 221
 	    return ret;
231 222
     }
232 223
 
233
-    if((partcnt = (int *) cli_calloc(groot->ac_partsigs + 1, sizeof(int))) == NULL) {
234
-	cli_dbgmsg("cli_scanbuff(): unable to cli_calloc(%d, %d)\n", groot->ac_partsigs + 1, sizeof(int));
235
-	return CL_EMEM;
236
-    }
237
-
238
-    if((partoff = (unsigned long int *) cli_calloc(groot->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
239
-	cli_dbgmsg("cli_scanbuff(): unable to cli_calloc(%d, %d)\n", groot->ac_partsigs + 1, sizeof(unsigned long int));
240
-	free(partcnt);
241
-	return CL_EMEM;
242
-    }
224
+    if((ret = cli_ac_initdata(&mdata, groot->ac_partsigs, AC_DEFAULT_TRACKLEN)))
225
+	return ret;
243 226
 
244 227
     if(groot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, groot, 0, ftype, -1)) != CL_VIRUS)
245
-	ret = cli_ac_scanbuff(buffer, length, virname, groot, partcnt, 0, 0, partoff, ftype, -1, NULL);
228
+	ret = cli_ac_scanbuff(buffer, length, virname, groot, &mdata, 0, 0, ftype, -1, NULL);
229
+
230
+    cli_ac_freedata(&mdata);
246 231
 
247
-    free(partcnt);
248
-    free(partoff);
249 232
     return ret;
250 233
 }
251 234
 
... ...
@@ -408,10 +391,11 @@ int cli_validatesig(unsigned short ftype, const char *offstr, unsigned long int
408 408
 
409 409
 int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short ftype, struct cli_matched_type **ftoffset)
410 410
 {
411
- 	char *buffer, *buff, *endbl, *pt;
412
-	int ret = CL_CLEAN, *gpartcnt = NULL, *tpartcnt = NULL, type = CL_CLEAN, i, tid = 0, bytes;
411
+ 	unsigned char *buffer, *buff, *endbl, *upt;
412
+	int ret = CL_CLEAN, type = CL_CLEAN, i, tid = 0, bytes;
413 413
 	unsigned int buffersize, length, maxpatlen, shift = 0;
414
-	unsigned long int *gpartoff = NULL, *tpartoff = NULL, offset = 0;
414
+	unsigned long int offset = 0;
415
+	struct cli_ac_data gdata, tdata;
415 416
 	MD5_CTX md5ctx;
416 417
 	unsigned char digest[16];
417 418
 	struct cli_md5_node *md5_node;
... ...
@@ -423,6 +407,7 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
423 423
 	uint32_t datamask[2] = { 0xffffffff, 0xffffffff };
424 424
 	int count, hret;
425 425
 	off_t origoff;
426
+	char *pt;
426 427
 #endif
427 428
 
428 429
 
... ...
@@ -644,42 +629,17 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
644 644
 
645 645
     /* prepare the buffer */
646 646
     buffersize = maxpatlen + SCANBUFF;
647
-    if(!(buffer = (char *) cli_calloc(buffersize, sizeof(char)))) {
647
+    if(!(buffer = (unsigned char *) cli_calloc(buffersize, sizeof(unsigned char)))) {
648 648
 	cli_dbgmsg("cli_scandesc(): unable to cli_calloc(%d)\n", buffersize);
649 649
 	return CL_EMEM;
650 650
     }
651 651
 
652
-    if((gpartcnt = (int *) cli_calloc(groot->ac_partsigs + 1, sizeof(int))) == NULL) {
653
-	cli_dbgmsg("cli_scandesc(): unable to cli_calloc(%d, %d)\n", groot->ac_partsigs + 1, sizeof(int));
654
-	free(buffer);
655
-	return CL_EMEM;
656
-    }
657
-
658
-    if((gpartoff = (unsigned long int *) cli_calloc(groot->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
659
-	cli_dbgmsg("cli_scandesc(): unable to cli_calloc(%d, %d)\n", groot->ac_partsigs + 1, sizeof(unsigned long int));
660
-	free(buffer);
661
-	free(gpartcnt);
662
-	return CL_EMEM;
663
-    }
652
+    if((ret = cli_ac_initdata(&gdata, groot->ac_partsigs, AC_DEFAULT_TRACKLEN)))
653
+	return ret;
664 654
 
665 655
     if(troot) {
666
-
667
-	if((tpartcnt = (int *) cli_calloc(troot->ac_partsigs + 1, sizeof(int))) == NULL) {
668
-	    cli_dbgmsg("cli_scandesc(): unable to cli_calloc(%d, %d)\n", troot->ac_partsigs + 1, sizeof(int));
669
-	    free(buffer);
670
-	    free(gpartcnt);
671
-	    free(gpartoff);
672
-	    return CL_EMEM;
673
-	}
674
-
675
-	if((tpartoff = (unsigned long int *) cli_calloc(troot->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
676
-	    cli_dbgmsg("cli_scandesc(): unable to cli_calloc(%d, %d)\n", troot->ac_partsigs + 1, sizeof(unsigned long int));
677
-	    free(buffer);
678
-	    free(gpartcnt);
679
-	    free(gpartoff);
680
-	    free(tpartcnt);
681
-	    return CL_EMEM;
682
-	}
656
+	if((ret = cli_ac_initdata(&tdata, troot->ac_partsigs, AC_DEFAULT_TRACKLEN)))
657
+	    return ret;
683 658
     }
684 659
 
685 660
     if(ctx->engine->md5_hlist)
... ...
@@ -689,29 +649,27 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
689 689
     buff = buffer;
690 690
     buff += maxpatlen; /* pointer to read data block */
691 691
     endbl = buff + SCANBUFF - maxpatlen; /* pointer to the last block
692
-						* length of maxpatlen
693
-						*/
692
+					  * length of maxpatlen
693
+					  */
694 694
 
695
-    pt = buff;
695
+    upt = buff;
696 696
     while((bytes = cli_readn(desc, buff + shift, SCANBUFF - shift)) > 0) {
697 697
 
698 698
 	if(ctx->scanned)
699 699
 	    *ctx->scanned += bytes / CL_COUNT_PRECISION;
700 700
 
701 701
 	length = shift + bytes;
702
-	if(pt == buffer)
702
+	if(upt == buffer)
703 703
 	    length += maxpatlen;
704 704
 
705 705
 	if(troot) {
706
-	    if(troot->ac_only || (ret = cli_bm_scanbuff(pt, length, ctx->virname, troot, offset, ftype, desc)) != CL_VIRUS)
707
-		ret = cli_ac_scanbuff(pt, length, ctx->virname, troot, tpartcnt, otfrec, offset, tpartoff, ftype, desc, ftoffset);
706
+	    if(troot->ac_only || (ret = cli_bm_scanbuff(upt, length, ctx->virname, troot, offset, ftype, desc)) != CL_VIRUS)
707
+		ret = cli_ac_scanbuff(upt, length, ctx->virname, troot, &tdata, otfrec, offset, ftype, desc, ftoffset);
708 708
 
709 709
 	    if(ret == CL_VIRUS) {
710 710
 		free(buffer);
711
-		free(gpartcnt);
712
-		free(gpartoff);
713
-		free(tpartcnt);
714
-		free(tpartoff);
711
+		cli_ac_freedata(&gdata);
712
+		cli_ac_freedata(&tdata);
715 713
 
716 714
 		lseek(desc, 0, SEEK_SET);
717 715
 		if(cli_checkfp(desc, ctx->engine))
... ...
@@ -721,17 +679,14 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
721 721
 	    }
722 722
 	}
723 723
 
724
-	if(groot->ac_only || (ret = cli_bm_scanbuff(pt, length, ctx->virname, groot, offset, ftype, desc)) != CL_VIRUS)
725
-	    ret = cli_ac_scanbuff(pt, length, ctx->virname, groot, gpartcnt, otfrec, offset, gpartoff, ftype, desc, ftoffset);
724
+	if(groot->ac_only || (ret = cli_bm_scanbuff(upt, length, ctx->virname, groot, offset, ftype, desc)) != CL_VIRUS)
725
+	    ret = cli_ac_scanbuff(upt, length, ctx->virname, groot, &gdata, otfrec, offset, ftype, desc, ftoffset);
726 726
 
727 727
 	if(ret == CL_VIRUS) {
728 728
 	    free(buffer);
729
-	    free(gpartcnt);
730
-	    free(gpartoff);
731
-	    if(troot) {
732
-		free(tpartcnt);
733
-		free(tpartoff);
734
-	    }
729
+	    cli_ac_freedata(&gdata);
730
+	    if(troot)
731
+		cli_ac_freedata(&tdata);
735 732
 	    lseek(desc, 0, SEEK_SET);
736 733
 	    if(cli_checkfp(desc, ctx->engine))
737 734
 		return CL_CLEAN;
... ...
@@ -750,8 +705,8 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
750 750
 	    memmove(buffer, endbl, maxpatlen);
751 751
 	    offset += SCANBUFF;
752 752
 
753
-	    if(pt == buff) {
754
-		pt = buffer;
753
+	    if(upt == buff) {
754
+		upt = buffer;
755 755
 		offset -= maxpatlen;
756 756
 	    }
757 757
 
... ...
@@ -764,12 +719,9 @@ int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short f
764 764
     }
765 765
 
766 766
     free(buffer);
767
-    free(gpartcnt);
768
-    free(gpartoff);
769
-    if(troot) {
770
-	free(tpartcnt);
771
-	free(tpartoff);
772
-    }
767
+    cli_ac_freedata(&gdata);
768
+    if(troot)
769
+	cli_ac_freedata(&tdata);
773 770
 
774 771
     if(ctx->engine->md5_hlist) {
775 772
 	MD5_Final(digest, &md5ctx);
... ...
@@ -28,7 +28,7 @@
28 28
 
29 29
 int cli_scandesc(int desc, cli_ctx *ctx, unsigned short otfrec, unsigned short ftype, struct cli_matched_type **ftoffset);
30 30
 
31
-int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_engine *engine, unsigned short ftype);
31
+int cli_scanbuff(const unsigned char *buffer, unsigned int length, const char **virname, const struct cl_engine *engine, unsigned short ftype);
32 32
 
33 33
 int cli_validatesig(unsigned short ftype, const char *offstr, unsigned long int fileoff, int desc, const char *virname);
34 34
 
... ...
@@ -19,6 +19,9 @@
19 19
  *  MA 02110-1301, USA.
20 20
  *
21 21
  *  $Log: regex_list.c,v $
22
+ *  Revision 1.15  2006/11/15 15:26:54  tkojm
23
+ *  pattern matcher accuracy improvements
24
+ *
22 25
  *  Revision 1.14  2006/11/05 18:16:56  acab
23 26
  *  Patch for bug 52 from Edvin
24 27
  *
... ...
@@ -350,8 +353,8 @@ int regex_list_match(struct regex_matcher* matcher,const char* real_url,const ch
350 350
 		size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1;
351 351
 		char*  buffer = cli_malloc(buffer_len+1);
352 352
 		size_t i;
353
-		int partcnt,rc = 0;
354
-		unsigned long int partoff;
353
+		int rc = 0;
354
+		struct cli_ac_data mdata;
355 355
 
356 356
 		if(!buffer)
357 357
 			return CL_EMEM;
... ...
@@ -364,13 +367,18 @@ int regex_list_match(struct regex_matcher* matcher,const char* real_url,const ch
364 364
 		}
365 365
 		cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
366 366
 
367
-		if(hostOnly)
367
+		if(hostOnly) {
368
+			if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
369
+				return rc;
370
+			rc = 0;
371
+
368 372
 			for(i = 0; i < matcher->root_hosts_cnt; i++) {
369
-				if(( rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&partcnt,0,0,&partoff,0,-1,NULL) ))
373
+				if(( rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL) ))
370 374
 					break;
371 375
 			}
372
-		else
376
+		} else
373 377
 			rc = 0;
378
+    
374 379
 		if(!rc && !hostOnly) 
375 380
 			rc = match_node(matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
376 381
 		free(buffer);
... ...
@@ -956,7 +956,7 @@ static int cli_vba_scandir(const char *dirname, cli_ctx *ctx)
956 956
 		if(ctx->scanned)
957 957
 		    *ctx->scanned += data_len / CL_COUNT_PRECISION;
958 958
 
959
-		if(cli_scanbuff((char *) data, data_len, ctx->virname, ctx->engine, CL_TYPE_MSOLE2) == CL_VIRUS) {
959
+		if(cli_scanbuff(data, data_len, ctx->virname, ctx->engine, CL_TYPE_MSOLE2) == CL_VIRUS) {
960 960
 		    free(data);
961 961
 		    ret = CL_VIRUS;
962 962
 		    break;
... ...
@@ -1003,7 +1003,7 @@ static int cli_vba_scandir(const char *dirname, cli_ctx *ctx)
1003 1003
 		} else {
1004 1004
 			if(ctx->scanned)
1005 1005
 			    *ctx->scanned += vba_project->length[i] / CL_COUNT_PRECISION;
1006
-			if(cli_scanbuff((char *) data, vba_project->length[i], ctx->virname, ctx->engine, CL_TYPE_MSOLE2) == CL_VIRUS) {
1006
+			if(cli_scanbuff(data, vba_project->length[i], ctx->virname, ctx->engine, CL_TYPE_MSOLE2) == CL_VIRUS) {
1007 1007
 				free(data);
1008 1008
 				ret = CL_VIRUS;
1009 1009
 				break;