Browse code

Fuzzy logic match for unknown encoding types

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@1035 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/10/23 00:58:16
Showing 2 changed files
... ...
@@ -1,6 +1,12 @@
1
+Fri Oct 22 16:57:13 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav/message.c:	If an unknown encoding type is found take
4
+  			a best guess for the decoder. If none can be guessed
5
+			invoke all decoders
6
+
1 7
 Thu Oct 21 11:14:35 BST 2004 (njh)
2 8
 ----------------------------------
3
-  libclamav/mbox.c:	PARTIAL MODE: Extend yesterday's Solaris fix to BeOS
9
+  * libclamav/mbox.c:	PARTIAL MODE: Extend yesterday's Solaris fix to BeOS
4 10
 
5 11
 Wed Oct 20 23:51:46 CEST 2004 (tk)
6 12
 ----------------------------------
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: message.c,v $
20
+ * Revision 1.105  2004/10/22 15:53:45  nigelhorne
21
+ * Fuzzy logic match for unknown encoding types
22
+ *
20 23
  * Revision 1.104  2004/10/19 13:53:55  nigelhorne
21 24
  * Don't add trailing NUL bytes
22 25
  *
... ...
@@ -309,7 +312,7 @@
309 309
  * uuencodebegin() no longer static
310 310
  *
311 311
  */
312
-static	char	const	rcsid[] = "$Id: message.c,v 1.104 2004/10/19 13:53:55 nigelhorne Exp $";
312
+static	char	const	rcsid[] = "$Id: message.c,v 1.105 2004/10/22 15:53:45 nigelhorne Exp $";
313 313
 
314 314
 #if HAVE_CONFIG_H
315 315
 #include "clamav-config.h"
... ...
@@ -371,6 +374,7 @@ static	const	char	*messageGetArgument(const message *m, int arg);
371 371
 static	void	*messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy)(void *), void (*setFilename)(void *, const char *, const char *), void (*addData)(void *, const unsigned char *, size_t), void *(*exportText)(const text *, void *));
372 372
 static	int	usefulArg(const char *arg);
373 373
 static	void	messageDedup(message *m);
374
+static	int	simil(const char *str1, const char *str2);
374 375
 
375 376
 /*
376 377
  * These maps are ordered in decreasing likelihood of their appearance
... ...
@@ -600,7 +604,7 @@ messageSetMimeSubtype(message *m, const char *subtype)
600 600
 const char *
601 601
 messageGetMimeSubtype(const message *m)
602 602
 {
603
-	return((m->mimeSubtype) ? m->mimeSubtype : "");
603
+	return (m->mimeSubtype) ? m->mimeSubtype : "";
604 604
 }
605 605
 
606 606
 void
... ...
@@ -633,7 +637,7 @@ messageSetDispositionType(message *m, const char *disptype)
633 633
 const char *
634 634
 messageGetDispositionType(const message *m)
635 635
 {
636
-	return((m->mimeDispositionType) ? m->mimeDispositionType : "");
636
+	return (m->mimeDispositionType) ? m->mimeDispositionType : "";
637 637
 }
638 638
 
639 639
 /*
... ...
@@ -867,7 +871,7 @@ messageGetArgument(const message *m, int arg)
867 867
 	assert(arg >= 0);
868 868
 	assert(arg < m->numberOfArguments);
869 869
 
870
-	return((m->mimeArguments[arg]) ? m->mimeArguments[arg] : "");
870
+	return (m->mimeArguments[arg]) ? m->mimeArguments[arg] : "";
871 871
 }
872 872
 
873 873
 /*
... ...
@@ -958,6 +962,9 @@ messageSetEncoding(message *m, const char *enctype)
958 958
 	 */
959 959
 	i = 0;
960 960
 	while((type = cli_strtok(enctype, i++, " \t")) != NULL) {
961
+		int highestSimil = 0;
962
+		const char *closest = NULL;
963
+
961 964
 		for(e = encoding_map; e->string; e++)
962 965
 			if(strcasecmp(type, e->string) == 0) {
963 966
 				int j;
... ...
@@ -969,31 +976,44 @@ messageSetEncoding(message *m, const char *enctype)
969 969
 						break;
970 970
 					}
971 971
 				}
972
-				if(j < m->numberOfEncTypes)
973
-					break;
972
+
974 973
 				et = (encoding_type *)cli_realloc(m->encodingTypes, (m->numberOfEncTypes + 1) * sizeof(encoding_type));
975
-				if(et == NULL) {
976
-					free(type);
977
-					return;
978
-				}
974
+				if(et == NULL)
975
+					break;
979 976
 
980 977
 				m->encodingTypes = et;
981 978
 				m->encodingTypes[m->numberOfEncTypes++] = e->type;
982 979
 
983 980
 				cli_dbgmsg("Encoding type %d is \"%s\"\n", m->numberOfEncTypes, type);
984 981
 				break;
982
+
983
+			} else {
984
+				const int sim = simil(type, e->string);
985
+
986
+				if(sim > highestSimil) {
987
+					closest = e->string;
988
+					highestSimil = sim;
989
+				}
985 990
 			}
986 991
 
987 992
 		if(e->string == NULL) {
988
-			cli_warnmsg("Unknown encoding type \"%s\" - report to bugs@clamav.net\n", type);
989
-			free(type);
990 993
 			/*
991
-			 * Err on the side of safety, enable all decoding
992
-			 * modules
994
+			 * 50% is arbitrary. For example 7bi will match as
995
+			 * 66% certain to be 7bit
993 996
 			 */
994
-			messageSetEncoding(m, "base64");
995
-			messageSetEncoding(m, "quoted-printable");
996
-			break;
997
+			if(closest && (highestSimil >= 50)) {
998
+				cli_warnmsg("Unknown encoding type \"%s\" - guessing as %s (%u%% certainty)\n",
999
+					type, closest, highestSimil);
1000
+				messageSetEncoding(m, closest);
1001
+			} else {
1002
+				cli_warnmsg("Unknown encoding type \"%s\" - report to bugs@clamav.net\n", type);
1003
+				/*
1004
+				 * Err on the side of safety, enable all
1005
+				 * decoding modules
1006
+				 */
1007
+				messageSetEncoding(m, "base64");
1008
+				messageSetEncoding(m, "quoted-printable");
1009
+			}
997 1010
 		}
998 1011
 
999 1012
 		free(type);
... ...
@@ -2404,7 +2424,7 @@ base64(char c)
2404 2404
 static unsigned char
2405 2405
 uudecode(char c)
2406 2406
 {
2407
-	return(c - ' ');
2407
+	return c - ' ';
2408 2408
 }
2409 2409
 
2410 2410
 /*
... ...
@@ -2499,3 +2519,183 @@ messageDedup(message *m)
2499 2499
 	}
2500 2500
 	m->dedupedThisFar = t1;
2501 2501
 }
2502
+
2503
+/*
2504
+ * common/simil:
2505
+ *	From Computing Magazine 20/8/92
2506
+ * Returns a percentage from 0 to 100 - how similar are 2 strings?
2507
+ * 100 for exact match, < 0 for error
2508
+ */
2509
+
2510
/*
 * Declarations used by simil() and its helpers.
 *
 * ELEMENT1 is one node of the singly linked stack that push() and pop()
 * maintain; LINK1 is a pointer to such a node.
 */
typedef	struct	pstr_list {
	char	*d1;			/* string held by this node (push() stores a strdup() copy) */
	struct	pstr_list	*next;	/* node below this one, NULL at the bottom */
} ELEMENT1;
typedef	ELEMENT1		*LINK1;

/* Longest string (including the terminating NUL) that simil() will handle */
#define	MAX_PATTERN_SIZ	40

/* Status codes; all negative so they cannot be confused with a 0..100 score */
#define	OUT_OF_MEMORY	(-2)
#define	FAILURE	(-3)
#define	SUCCESS	(-4)
#define	ARRAY_OVERFLOW	(-5)

static	unsigned	int	compare(char *ls1, char **rs1, char *ls2, char **rs2);
static	int	push(LINK1 *top, const char *string);
static	int	pop(LINK1 *top, char *buffer);
2527
+
2528
+static int
2529
+simil(const char *str1, const char *str2)
2530
+{
2531
+	LINK1 top = NULL;
2532
+	unsigned int score = 0;
2533
+	unsigned int common, total, len1;
2534
+	unsigned int len2;
2535
+	char ls1[MAX_PATTERN_SIZ], ls2[MAX_PATTERN_SIZ];
2536
+	char *rs1 = NULL, *rs2 = NULL;
2537
+	char *s1, *s2;
2538
+
2539
+	if(strcasecmp(str1, str2) == 0)
2540
+		return 100;
2541
+
2542
+	if((s1 = strdup(str1)) == NULL)
2543
+		return OUT_OF_MEMORY;
2544
+	if((s2 = strdup(str2)) == NULL) {
2545
+		free(s1);
2546
+		return OUT_OF_MEMORY;
2547
+	}
2548
+
2549
+	if(((total = strstrip(s1)) > MAX_PATTERN_SIZ - 1) || ((len2 = strstrip(s2)) > MAX_PATTERN_SIZ - 1)) {
2550
+		free(s1);
2551
+		free(s2);
2552
+		return ARRAY_OVERFLOW;
2553
+	}
2554
+
2555
+	total += len2;
2556
+
2557
+	if(push(&top, s1) == OUT_OF_MEMORY)
2558
+		return OUT_OF_MEMORY;
2559
+	if(push(&top, s2) == OUT_OF_MEMORY)
2560
+		return OUT_OF_MEMORY;
2561
+
2562
+	while(pop(&top, ls2) == SUCCESS) {
2563
+		pop(&top, ls1);
2564
+		common = compare(ls1, &rs1, ls2, &rs2);
2565
+		if(common > 0) {
2566
+			score += common;
2567
+			len1 = strlen(ls1);
2568
+			len2 = strlen(ls2);
2569
+
2570
+			if((len1 > 1 && len2 >= 1) || (len2 > 1 && len1 >= 1))
2571
+				if((push(&top, ls1) == OUT_OF_MEMORY) || (push(&top, ls2) == OUT_OF_MEMORY)) {
2572
+					free(s1);
2573
+					free(s2);
2574
+					return OUT_OF_MEMORY;
2575
+				}
2576
+			len1 = strlen(rs1);
2577
+			len2 = strlen(rs2);
2578
+
2579
+			if((len1 > 1 && len2 >= 1) || (len2 > 1 && len1 >= 1))
2580
+				if((push(&top, rs1) == OUT_OF_MEMORY) || (push(&top, rs2) == OUT_OF_MEMORY)) {
2581
+					free(s1);
2582
+					free(s2);
2583
+					return OUT_OF_MEMORY;
2584
+				}
2585
+		}
2586
+	}
2587
+	free(s1);
2588
+	free(s2);
2589
+	return (total > 0) ? ((score * 200) / total) : 0;
2590
+}
2591
+
2592
/*
 * compare:
 *	Scan ls1 and ls2 for runs of characters that they share (characters
 *	are compared with tolower()) and remember the longest run seen.
 *
 * When a common run is found:
 *	- ls1 and ls2 are truncated in place at the start of the run, so each
 *	  is left holding only the text that preceded it;
 *	- *rs1 and *rs2 are set to point just past the run inside ls1 and ls2,
 *	  i.e. at the text that followed it.
 * When nothing is shared the strings and *rs1 / *rs2 are left untouched.
 *
 * Returns the length of the remembered run, 0 if there is none.
 */
static unsigned int
compare(char *ls1, char **rs1, char *ls2, char **rs2)
{
	unsigned int common, diff, maxchars = 0;
	char *s1, *s2;
	char *maxs1 = NULL, *maxs2 = NULL, *maxe1 = NULL, *maxe2 = NULL;
	char *cs1, *cs2, *start1, *end1, *end2;

	end1 = ls1 + strlen(ls1);
	end2 = ls2 + strlen(ls2);
	start1 = ls1;

	for(;;) {
		s1 = start1;
		s2 = ls2;

		if(s1 < end1) {
			while(s1 < end1 && s2 < end2) {
				/*
				 * The casts matter: passing a negative char
				 * (any byte >= 0x80 where char is signed) to
				 * tolower() is undefined behaviour
				 */
				if(tolower((unsigned char)*s1) == tolower((unsigned char)*s2)) {
					cs1 = s1;
					cs2 = s2;
					common = 0;
					do {
						if(s1 == end1 || s2 == end2)
							break;
						s1++;
						s2++;
						common++;
					} while(tolower((unsigned char)*s1) == tolower((unsigned char)*s2));

					if(common > maxchars) {
						diff = common - maxchars;
						maxchars = common;
						maxs1 = cs1;
						maxs2 = cs2;
						maxe1 = s1;
						maxe2 = s2;
						/*
						 * A longer run cannot start in the
						 * last maxchars characters, so stop
						 * looking there
						 */
						end1 -= diff;
						end2 -= diff;
					} else
						s1 -= common;
				} else
					s2++;
			}
			start1++;
		} else
			break;
	}
	/* maxchars is non-zero exactly when a run was recorded above */
	if(maxchars > 0) {
		*maxs1 = '\0';
		*maxs2 = '\0';
		*rs1 = maxe1;
		*rs2 = maxe2;
	}
	return maxchars;
}
2652
+
2653
+static int
2654
+push(LINK1 *top, const char *string)
2655
+{
2656
+	LINK1 element;
2657
+
2658
+	if((element = (LINK1)cli_malloc(sizeof(ELEMENT1))) == NULL)
2659
+		return OUT_OF_MEMORY;
2660
+	if((element->d1 = strdup(string)) == NULL)
2661
+		return OUT_OF_MEMORY;
2662
+	element->next = *top;
2663
+	*top = element;
2664
+
2665
+	return SUCCESS;
2666
+}
2667
+
2668
+static int
2669
+pop(LINK1 *top, char *buffer)
2670
+{
2671
+	LINK1 t1;
2672
+
2673
+	if((t1 = *top) != NULL) {
2674
+		(void)strcpy(buffer, t1->d1);
2675
+		*top = t1->next;
2676
+		free(t1->d1);
2677
+		free((char *)t1);
2678
+		return SUCCESS;
2679
+	}
2680
+	return FAILURE;
2681
+}