Browse code

Improve Trojan.Swizzor.Gen detection: compute per-file statistics in addition to per-string statistics. It is amazing how a much simpler rule can do the same job better.

git-svn: trunk@4539

Török Edvin authored on 2008/12/06 23:49:00
Showing 4 changed files
... ...
@@ -1,3 +1,10 @@
1
+Sat Dec  6 16:54:43 EET 2008 (edwin)
2
+------------------------------------
3
+ * libclamav/pe.c, libclamav/special.c, libclamav/special.h: Improve
4
+ Trojan.Swizzor.Gen detection: do per file statistics in addition to
5
+ per string. It is amazing how a much simpler rule can do the same
6
+ job better.
7
+
1 8
 Thu Dec  4 17:43:01 CET 2008 (acab)
2 9
 -----------------------------------
3 10
  * clamav-milter: r4519:r4536 merge new clamav milter
... ...
@@ -1215,15 +1215,21 @@ int cli_scanpe(int desc, cli_ctx *ctx)
1215 1215
 
1216 1216
     /* Trojan.Swizzor.Gen */
1217 1217
     if (SCAN_ALGO && (DCONF & PE_CONF_SWIZZOR) && nsections > 1 && fsize > 64*1024 && fsize < 4*1024*1024) {
1218
-	    int ret = CL_CLEAN;
1219 1218
 	    if(dirs[2].Size) {
1220
-		    struct swizz_stats stats;
1221
-		    unsigned int m = 10000;
1222
-		    memset(&stats, 0, sizeof(stats));
1223
-		    cli_parseres_special(EC32(dirs[2].VirtualAddress), EC32(dirs[2].VirtualAddress), desc, exe_sections, nsections, fsize, hdr_size, 0, 0, &m, &stats);
1224
-		    if (cli_detect_swizz(&stats) == CL_VIRUS) {
1225
-			    *ctx->virname = "Trojan.Swizzor.Gen";
1226
-			    ret = CL_VIRUS;
1219
+		    struct swizz_stats *stats = cli_calloc(1, sizeof(*stats));
1220
+		    unsigned int m = 1000;
1221
+		    int ret = CL_CLEAN;
1222
+
1223
+		    if (!stats)
1224
+			    ret = CL_EMEM;
1225
+		    else {
1226
+			    cli_parseres_special(EC32(dirs[2].VirtualAddress), EC32(dirs[2].VirtualAddress), desc, exe_sections, nsections, fsize, hdr_size, 0, 0, &m, stats);
1227
+			    if ((ret = cli_detect_swizz(stats)) == CL_VIRUS) {
1228
+				    *ctx->virname = "Trojan.Swizzor.Gen";
1229
+			    }
1230
+			    free(stats);
1231
+		    }
1232
+		    if (ret != CL_CLEAN) {
1227 1233
 			    free(exe_sections);
1228 1234
 			    return ret;
1229 1235
 		    }
... ...
@@ -360,46 +360,12 @@ static inline int swizz_j48(const uint16_t n[])
360 360
 {
361 361
 	cli_dbgmsg("swizz_j48: %u, %u, %u\n",n[0],n[1],n[2]);
362 362
 	/* rules based on J48 tree */
363
-	if (n[0] <= 924)
364
-		return CL_CLEAN;
365
-	if (n[0] <= 940) {
366
-		return (n[2] > 1 && n[2] <= 8) ? CL_VIRUS : CL_CLEAN;
367
-	}
368
-	if (n[2] <= 14) {
369
-		if (n[2] <= 0) {
370
-			if (n[0] <= 999)
371
-				return CL_CLEAN;
372
-			if (n[0] <= 1012) {
373
-				if (n[1] <= 23) {
374
-					if (n[0] <= 1003)
375
-						return CL_CLEAN;
376
-					return (n[1] <= 19 && n[0] > 1007 && n[1] > 15) || (n[1] > 19) ? CL_VIRUS : CL_CLEAN;
377
-				}
378
-				return CL_VIRUS;
379
-			}
380
-			return n[1] == 0 ? CL_CLEAN : CL_VIRUS;
381
-		}
382
-		if (n[2] <= 8)
383
-			return CL_VIRUS;
384
-		if (n[0] <= 954)
385
-			return CL_CLEAN;
386
-		if (n[2] <= 10)
387
-			return CL_VIRUS;
388
-		if (n[2] <= 12) {
389
-			if (n[0] <= 1011) {
390
-				if (n[1] <=32)
391
-					return CL_VIRUS;
392
-				return (n[2] <= 11 || n[1] > 51) ? CL_VIRUS : CL_CLEAN;
393
-			}
394
-			return CL_CLEAN;
395
-		}
396
-		if (n[1] <= 52) {
397
-			return (n[1] <= 43 && n[1] > 6 &&
398
-					(n[2] <= 13 || n[1] <= 30 || n[1] > 40))
399
-				? CL_CLEAN : CL_VIRUS;
400
-		}
401
-	}
402
-	return CL_CLEAN;
363
+	if (n[0] <= 945 || !n[1])
364
+		return 0;
365
+	if (n[0] <= 1006)
366
+		return (n[2] > 0 && n[2] <= 6);
367
+	else
368
+		return n[1] <= 10;
403 369
 }
404 370
 
405 371
 void cli_detect_swizz_str(const unsigned char *str, uint32_t len, struct swizz_stats *stats, int blob)
... ...
@@ -440,8 +406,10 @@ void cli_detect_swizz_str(const unsigned char *str, uint32_t len, struct swizz_s
440 440
 	for(i=0;i<j-2;i++) {
441 441
 		if (stri[i] != ' ' && stri[i+1] != ' ' && stri[i+2] != ' ') {
442 442
 			uint16_t idx = (stri[i] - 'a')*676 + (stri[i+1] - 'a')*26 + (stri[i+2] - 'a');
443
-			if (idx < sizeof(ngrams))
443
+			if (idx < sizeof(ngrams)) {
444 444
 				ngrams[idx]++;
445
+				stats->gngrams[idx]++;
446
+			}
445 447
 		} else if (stri[i] == ' ')
446 448
 			words++;
447 449
 	}
... ...
@@ -461,21 +429,72 @@ void cli_detect_swizz_str(const unsigned char *str, uint32_t len, struct swizz_s
461 461
 		uint32_t v = ngram_cnts[i];
462 462
 		ngram_cnts[i] = (v<<10)/all;
463 463
 	}
464
-	ret = swizz_j48(ngram_cnts);
464
+	ret = swizz_j48(ngram_cnts) ? CL_VIRUS : CL_CLEAN;
465 465
 	cli_dbgmsg("cli_detect_swizz_str: %s, %u words\n", ret == CL_VIRUS ? "suspicious" : "ok", words);
466 466
 	if (ret == CL_VIRUS)
467 467
 		stats->suspicious += j;
468 468
 	stats->total += j;
469 469
 }
470 470
 
471
+static inline swizz_j48_global(const uint32_t gn[])
472
+{
473
+	if (gn[0] <= 24185) {
474
+		return gn[0] > 22980 && gn[8] > 0 && gn[8] <= 97;
475
+	}
476
+	if (!gn[8]) {
477
+		if (gn[4] <= 311) {
478
+			if (!gn[4]) {
479
+				return gn[1] > 0 &&
480
+					((gn[0] <= 26579 && gn[3] > 0) ||
481
+					 (gn[0] > 28672 && gn[0] <= 30506));
482
+			}
483
+			if (gn[5] <= 616) {
484
+				if (gn[6] <= 104) {
485
+					return gn[9] <= 167;
486
+				}
487
+				return gn[6] <= 286;
488
+			}
489
+		}
490
+		return 0;
491
+	}
492
+	return 1;
493
+}
494
+
471 495
 int cli_detect_swizz(struct swizz_stats *stats)
472 496
 {
497
+	uint32_t gn[10];
498
+	uint32_t all = 0;
499
+	unsigned i;
500
+	int global_swizz = CL_CLEAN;
501
+
473 502
 	cli_dbgmsg("cli_detect_swizz: %lu/%lu, version:%d, manifest: %d \n",
474 503
 			(unsigned long)stats->suspicious, (unsigned long)stats->total,
475 504
 			stats->has_version, stats->has_manifest);
476
-	/* not all have version/manifest */
477
-	if (stats->total > 128 && stats->suspicious > 2*stats->total/10) {
478
-		return CL_VIRUS;
505
+	memset(gn, 0, sizeof(gn));
506
+	for(i=0;i<17576;i++) {
507
+		uint8_t v = stats->gngrams[i];
508
+		if (v > 10) v = 10;
509
+		if (v) {
510
+			gn[v-1]++;
511
+			all++;
512
+		}
513
+	}
514
+	if (all) {
515
+		/* normalize */
516
+		cli_dbgmsg("cli_detect_swizz: gn: ");
517
+		for(i=0;i<sizeof(gn)/sizeof(gn[0]);i++) {
518
+			uint32_t v = gn[i];
519
+			gn[i] = (v<<15)/all;
520
+			if (cli_debug_flag)
521
+			cli_dbgmsg("%lu, ", (unsigned long)gn[i]);
522
+		}
523
+		global_swizz = swizz_j48_global(gn) ? CL_VIRUS : CL_CLEAN;
524
+		cli_dbgmsg("\ncli_detect_swizz: global: %s\n", global_swizz ? "suspicious" : "clean");
479 525
 	}
480
-	return CL_CLEAN;
526
+
527
+	if (stats->total <= 337)
528
+		return CL_CLEAN;
529
+	if (stats->suspicious<<10 > 20*stats->total)
530
+		return CL_VIRUS;
531
+	return global_swizz;
481 532
 }
... ...
@@ -23,6 +23,7 @@
23 23
 
24 24
 #include "others.h"
25 25
 struct swizz_stats {
26
+	uint16_t gngrams[17576];
26 27
 	uint32_t total;
27 28
 	uint32_t suspicious;
28 29
 	int has_version;