Browse code

make use of hostkey prefix entries

git-svn: trunk@4929

Török Edvin authored on 2009/03/12 05:06:35
Showing 6 changed files
... ...
@@ -1,3 +1,9 @@
1
+Wed Mar 11 22:06:30 EET 2009 (edwin)
2
+------------------------------------
3
+ * libclamav/phishcheck.c, libclamav/regex_list.c,
4
+ libclamav/regex_list.h, unit_tests/check_regex.c,
5
+ unit_tests/input/daily.gdb: make use of hostkey prefix entries
6
+
1 7
 Wed Mar 11 21:27:32 EET 2009 (edwin)
2 8
 ------------------------------------
3 9
  * clamd/others.c, sigtool/Makefile.in: fix previous commit
... ...
@@ -1172,7 +1172,7 @@ static int whitelist_check(const struct cl_engine* engine,struct url_check* urls
1172 1172
 	return whitelist_match(engine,urls->realLink.data,urls->displayLink.data,hostOnly);
1173 1173
 }
1174 1174
 
1175
-static int hash_match(const struct regex_matcher *rlist, const char *host, size_t hlen, const char *path, size_t plen)
1175
+static int hash_match(const struct regex_matcher *rlist, const char *host, size_t hlen, const char *path, size_t plen, int *prefix_matched)
1176 1176
 {
1177 1177
 	const char *virname;
1178 1178
 #if 0
... ...
@@ -1198,9 +1198,15 @@ static int hash_match(const struct regex_matcher *rlist, const char *host, size_
1198 1198
 		h[2*i+1] = hexchars[sha256_dig[i]&0xf];
1199 1199
 	    }
1200 1200
 	    h[64]='\0';
1201
-	    cli_dbgmsg("Looking up hash %s for %s%s\n", h, host, path);
1202
-	    if(SO_search(&rlist->sha256_filter, sha256_dig, 32) != -1 &&
1203
-	       cli_bm_scanbuff(sha256_dig, 32, &virname, &rlist->sha256_hashes,0,0,-1) == CL_VIRUS) {
1201
+	    cli_dbgmsg("Looking up hash %s for %s(%u)%s(%u)\n", h, host, hlen, path, plen);
1202
+	    if (prefix_matched) {
1203
+		if (cli_bm_scanbuff(sha256_dig, 4, &virname, &rlist->hostkey_prefix,0,0,-1) == CL_VIRUS) {
1204
+		    cli_dbgmsg("prefix matched\n", virname);
1205
+		    *prefix_matched = 1;
1206
+		} else
1207
+		    return CL_SUCCESS;
1208
+	    }
1209
+	    if (cli_bm_scanbuff(sha256_dig, 32, &virname, &rlist->sha256_hashes,0,0,-1) == CL_VIRUS) {
1204 1210
 		switch(*virname) {
1205 1211
 		    case '1':
1206 1212
 			return CL_PHISH_HASH1;
... ...
@@ -1316,10 +1322,11 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1316 1316
 	size_t path_len;
1317 1317
 	size_t host_len;
1318 1318
 	char *p;
1319
-	int rc;
1319
+	int rc, prefix_matched=0;
1320 1320
 	const char *lp[COMPONENTS+1];
1321 1321
 	size_t pp[COMPONENTS+2];
1322 1322
 	char urlbuff[URL_MAX_LEN+3];/* htmlnorm truncates at 1024 bytes + terminating null + slash + host end null */
1323
+	unsigned count;
1323 1324
 
1324 1325
 	if(!rlist || !rlist->sha256_hashes.bm_patterns) {
1325 1326
 		return CL_SUCCESS;
... ...
@@ -1358,15 +1365,27 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1358 1358
 		}
1359 1359
 	} else
1360 1360
 		k = 1;
1361
-
1362
-	for(ji=j;ji < COMPONENTS+1; ji++) {
1363
-		for(ki=0;ki < k; ki++) {
1364
-			assert(pp[ki] <= path_len);
1365
-			rc = hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki]);
1366
-			if(rc) {
1367
-				return rc;
1368
-			}
1361
+	count = 0;
1362
+	for(ki=k;ki > 0;) {
1363
+	    --ki;
1364
+	    for(ji=COMPONENTS+1;ji > j;) {
1365
+		/* look up the last 2 and 3 components of the host as hostkey prefixes;
1366
+		 * if neither prefix matches, short-circuit the remaining lookups */
1367
+		int need_prefixmatch = (count<2 && !prefix_matched) &&
1368
+				       rlist->hostkey_prefix.bm_patterns;
1369
+		--ji;
1370
+		assert(pp[ki] <= path_len);
1371
+		rc = hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki], 
1372
+				need_prefixmatch ? &prefix_matched : NULL);
1373
+		if(rc) {
1374
+		    return rc;
1375
+		}
1376
+		count++;
1377
+		if (count == 2 && !prefix_matched && rlist->hostkey_prefix.bm_patterns) {
1378
+		    cli_dbgmsg("hostkey prefix not matched, short-circuiting lookups\n");
1379
+		    return CL_SUCCESS;
1369 1380
 		}
1381
+	    }
1370 1382
 	}
1371 1383
 	return CL_SUCCESS;
1372 1384
 }
... ...
@@ -1394,8 +1413,11 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1394 1394
 	}
1395 1395
 
1396 1396
 	if(( rc = url_hash_match(engine->domainlist_matcher, urls->realLink.data, strlen(urls->realLink.data)) )) {
1397
+	    if (rc == CL_PHISH_CLEAN)
1398
+		cli_dbgmsg("not analyzing, not a real url: %s\n", urls->realLink.data);
1399
+	    else
1397 1400
 		cli_dbgmsg("Hash matched for: %s\n", urls->realLink.data);
1398
-		return rc;
1401
+	    return rc;
1399 1402
 	}
1400 1403
 
1401 1404
 	if((rc = cleanupURLs(urls))) {
... ...
@@ -372,12 +372,15 @@ int init_regex_list(struct regex_matcher* matcher)
372 372
 	}
373 373
 #ifdef USE_MPOOL
374 374
 	matcher->sha256_hashes.mempool = mp;
375
+	matcher->hostkey_prefix.mempool = mp;
375 376
 #endif
376 377
 	if((rc = cli_bm_init(&matcher->sha256_hashes))) {
377 378
 		return rc;
378 379
 	}
380
+	if((rc = cli_bm_init(&matcher->hostkey_prefix))) {
381
+		return rc;
382
+	}
379 383
 	SO_init(&matcher->filter);
380
-	SO_init(&matcher->sha256_filter);
381 384
 	return CL_SUCCESS;
382 385
 }
383 386
 
... ...
@@ -424,10 +427,11 @@ static int functionality_level_check(char* line)
424 424
 	}
425 425
 }
426 426
 
427
-static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl)
427
+static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl, int is_prefix)
428 428
 {
429 429
 	int rc;
430 430
 	struct cli_bm_patt *pat = mpool_calloc(matcher->mempool, 1, sizeof(*pat));
431
+	struct cli_matcher *bm;
431 432
 	if(!pat)
432 433
 		return CL_EMEM;
433 434
 	pat->pattern = (unsigned char*)cli_mpool_hex2str(matcher->mempool, pattern);
... ...
@@ -440,8 +444,14 @@ static int add_hash(struct regex_matcher *matcher, char* pattern, const char fl)
440 440
 		return CL_EMEM;
441 441
 	}
442 442
 	*pat->virname = fl;
443
-	SO_preprocess_add(&matcher->sha256_filter, pat->pattern, pat->length);
444
-	if((rc = cli_bm_addpatt(&matcher->sha256_hashes, pat))) {
443
+	if (is_prefix) {
444
+	    pat->length=4;
445
+	    bm = &matcher->hostkey_prefix;
446
+	} else {
447
+	    bm = &matcher->sha256_hashes;
448
+	}
449
+
450
+	if((rc = cli_bm_addpatt(bm, pat))) {
445 451
 		cli_errmsg("add_hash: failed to add BM pattern\n");
446 452
 		free(pat->pattern);
447 453
 		free(pat->virname);
... ...
@@ -542,15 +552,12 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int *sign
542 542
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
543 543
 		} else if (buffer[0] == 'S' && !is_whitelist) {
544 544
 			pattern[pattern_len] = '\0';
545
-			if(*pattern=='F' && pattern[1]==':') {
545
+			if((pattern[0]=='F' || pattern[0]=='P') && pattern[1]==':') {
546 546
 			    pattern += 2;
547
-			    if (( rc = add_hash(matcher, pattern, flags[0]) )) {
547
+			    if (( rc = add_hash(matcher, pattern, flags[0], pattern[-2] == 'P') )) {
548 548
 				cli_errmsg("Error loading at line: %d\n", line);
549 549
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
550 550
 			    }
551
-			} else if (*pattern=='P' && pattern[1]==':') {
552
-			    pattern += 2;
553
-			    /* TODO: hostkey prefix */
554 551
 			} else {
555 552
 			    cli_errmsg("Error loading line: %d, %c\n", line, *pattern);
556 553
 			    return CL_EMALFDB;
... ...
@@ -617,6 +624,7 @@ void regex_list_done(struct regex_matcher* matcher)
617 617
 		}
618 618
 		hashtab_free(&matcher->suffix_hash);
619 619
 		cli_bm_free(&matcher->sha256_hashes);
620
+		cli_bm_free(&matcher->hostkey_prefix);
620 621
 	}
621 622
 }
622 623
 
... ...
@@ -52,7 +52,7 @@ struct regex_matcher {
52 52
 	regex_t **all_pregs;
53 53
 	struct cli_matcher suffixes;
54 54
 	struct cli_matcher sha256_hashes;
55
-	struct filter sha256_filter;
55
+	struct cli_matcher hostkey_prefix;
56 56
 	struct filter filter;
57 57
 #ifdef USE_MPOOL
58 58
 	mpool_t *mempool;
... ...
@@ -313,7 +313,7 @@ static void psetup_impl(int load2)
313 313
 		fail_unless(rc == 0, "load_regex_matcher");
314 314
 		fclose(f);
315 315
 
316
-		fail_unless_fmt(signo == 2, "Incorrect number of signatures: %u, expected %u", signo, 2);
316
+		fail_unless_fmt(signo == 4, "Incorrect number of signatures: %u, expected %u", signo, 4);
317 317
 	}
318 318
 	loaded_2 = load2;
319 319
 
... ...
@@ -1,2 +1,4 @@
1
+S:P:d1b8a025
1 2
 S:F:d1b8a0251d7555d016b6468ae623e4b1e830c7efccc54966d09447a3d0a85c60
3
+S2:P:7f6fd541
2 4
 S2:F:7f6fd541e625e7bc5d5a64f166e47ecfe13735464a74d160b48265c162a71089