git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@1035 77e5149b-7576-45b1-b177-96237e5ba77b
Nigel Horne authored on 2004/10/23 00:58:16... | ... |
@@ -1,6 +1,12 @@ |
1 |
+Fri Oct 22 16:57:13 BST 2004 (njh) |
|
2 |
+---------------------------------- |
|
3 |
+ * libclamav/message.c: If an unknown encoding type is found take |
|
4 |
+ a best guess for the decoder. If none can be guessed |
|
5 |
+ invoke all decoders |
|
6 |
+ |
|
1 | 7 |
Thu Oct 21 11:14:35 BST 2004 (njh) |
2 | 8 |
---------------------------------- |
3 |
- libclamav/mbox.c: PARTIAL MODE: Extend yesterday's Solaris fix to BeOS |
|
9 |
+ * libclamav/mbox.c: PARTIAL MODE: Extend yesterday's Solaris fix to BeOS |
|
4 | 10 |
|
5 | 11 |
Wed Oct 20 23:51:46 CEST 2004 (tk) |
6 | 12 |
---------------------------------- |
... | ... |
@@ -17,6 +17,9 @@ |
17 | 17 |
* |
18 | 18 |
* Change History: |
19 | 19 |
* $Log: message.c,v $ |
20 |
+ * Revision 1.105 2004/10/22 15:53:45 nigelhorne |
|
21 |
+ * Fuzzy logic match for unknown encoding types |
|
22 |
+ * |
|
20 | 23 |
* Revision 1.104 2004/10/19 13:53:55 nigelhorne |
21 | 24 |
* Don't add trailing NUL bytes |
22 | 25 |
* |
... | ... |
@@ -309,7 +312,7 @@ |
309 | 309 |
* uuencodebegin() no longer static |
310 | 310 |
* |
311 | 311 |
*/ |
312 |
-static char const rcsid[] = "$Id: message.c,v 1.104 2004/10/19 13:53:55 nigelhorne Exp $"; |
|
312 |
+static char const rcsid[] = "$Id: message.c,v 1.105 2004/10/22 15:53:45 nigelhorne Exp $"; |
|
313 | 313 |
|
314 | 314 |
#if HAVE_CONFIG_H |
315 | 315 |
#include "clamav-config.h" |
... | ... |
@@ -371,6 +374,7 @@ static const char *messageGetArgument(const message *m, int arg); |
371 | 371 |
static void *messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy)(void *), void (*setFilename)(void *, const char *, const char *), void (*addData)(void *, const unsigned char *, size_t), void *(*exportText)(const text *, void *)); |
372 | 372 |
static int usefulArg(const char *arg); |
373 | 373 |
static void messageDedup(message *m); |
374 |
+static int simil(const char *str1, const char *str2); |
|
374 | 375 |
|
375 | 376 |
/* |
376 | 377 |
 * These maps are ordered in decreasing likelihood of their appearance |
... | ... |
@@ -600,7 +604,7 @@ messageSetMimeSubtype(message *m, const char *subtype) |
600 | 600 |
const char * |
601 | 601 |
messageGetMimeSubtype(const message *m) |
602 | 602 |
{ |
603 |
- return((m->mimeSubtype) ? m->mimeSubtype : ""); |
|
603 |
+ return (m->mimeSubtype) ? m->mimeSubtype : ""; |
|
604 | 604 |
} |
605 | 605 |
|
606 | 606 |
void |
... | ... |
@@ -633,7 +637,7 @@ messageSetDispositionType(message *m, const char *disptype) |
633 | 633 |
const char * |
634 | 634 |
messageGetDispositionType(const message *m) |
635 | 635 |
{ |
636 |
- return((m->mimeDispositionType) ? m->mimeDispositionType : ""); |
|
636 |
+ return (m->mimeDispositionType) ? m->mimeDispositionType : ""; |
|
637 | 637 |
} |
638 | 638 |
|
639 | 639 |
/* |
... | ... |
@@ -867,7 +871,7 @@ messageGetArgument(const message *m, int arg) |
867 | 867 |
assert(arg >= 0); |
868 | 868 |
assert(arg < m->numberOfArguments); |
869 | 869 |
|
870 |
- return((m->mimeArguments[arg]) ? m->mimeArguments[arg] : ""); |
|
870 |
+ return (m->mimeArguments[arg]) ? m->mimeArguments[arg] : ""; |
|
871 | 871 |
} |
872 | 872 |
|
873 | 873 |
/* |
... | ... |
@@ -958,6 +962,9 @@ messageSetEncoding(message *m, const char *enctype) |
958 | 958 |
*/ |
959 | 959 |
i = 0; |
960 | 960 |
while((type = cli_strtok(enctype, i++, " \t")) != NULL) { |
961 |
+ int highestSimil = 0; |
|
962 |
+ const char *closest = NULL; |
|
963 |
+ |
|
961 | 964 |
for(e = encoding_map; e->string; e++) |
962 | 965 |
if(strcasecmp(type, e->string) == 0) { |
963 | 966 |
int j; |
... | ... |
@@ -969,31 +976,44 @@ messageSetEncoding(message *m, const char *enctype) |
969 | 969 |
break; |
970 | 970 |
} |
971 | 971 |
} |
972 |
- if(j < m->numberOfEncTypes) |
|
973 |
- break; |
|
972 |
+ |
|
974 | 973 |
et = (encoding_type *)cli_realloc(m->encodingTypes, (m->numberOfEncTypes + 1) * sizeof(encoding_type)); |
975 |
- if(et == NULL) { |
|
976 |
- free(type); |
|
977 |
- return; |
|
978 |
- } |
|
974 |
+ if(et == NULL) |
|
975 |
+ break; |
|
979 | 976 |
|
980 | 977 |
m->encodingTypes = et; |
981 | 978 |
m->encodingTypes[m->numberOfEncTypes++] = e->type; |
982 | 979 |
|
983 | 980 |
cli_dbgmsg("Encoding type %d is \"%s\"\n", m->numberOfEncTypes, type); |
984 | 981 |
break; |
982 |
+ |
|
983 |
+ } else { |
|
984 |
+ const int sim = simil(type, e->string); |
|
985 |
+ |
|
986 |
+ if(sim > highestSimil) { |
|
987 |
+ closest = e->string; |
|
988 |
+ highestSimil = sim; |
|
989 |
+ } |
|
985 | 990 |
} |
986 | 991 |
|
987 | 992 |
if(e->string == NULL) { |
988 |
- cli_warnmsg("Unknown encoding type \"%s\" - report to bugs@clamav.net\n", type); |
|
989 |
- free(type); |
|
990 | 993 |
/* |
991 |
- * Err on the side of safety, enable all decoding |
|
992 |
- * modules |
|
994 |
+ * 50% is arbitrary. For example 7bi will match as |
|
995 |
+ * 66% certain to be 7bit |
|
993 | 996 |
*/ |
994 |
- messageSetEncoding(m, "base64"); |
|
995 |
- messageSetEncoding(m, "quoted-printable"); |
|
996 |
- break; |
|
997 |
+ if(closest && (highestSimil >= 50)) { |
|
998 |
+ cli_warnmsg("Unknown encoding type \"%s\" - guessing as %s (%u%% certainty)\n", |
|
999 |
+ type, closest, highestSimil); |
|
1000 |
+ messageSetEncoding(m, closest); |
|
1001 |
+ } else { |
|
1002 |
+ cli_warnmsg("Unknown encoding type \"%s\" - report to bugs@clamav.net\n", type); |
|
1003 |
+ /* |
|
1004 |
+ * Err on the side of safety, enable all |
|
1005 |
+ * decoding modules |
|
1006 |
+ */ |
|
1007 |
+ messageSetEncoding(m, "base64"); |
|
1008 |
+ messageSetEncoding(m, "quoted-printable"); |
|
1009 |
+ } |
|
997 | 1010 |
} |
998 | 1011 |
|
999 | 1012 |
free(type); |
... | ... |
@@ -2404,7 +2424,7 @@ base64(char c) |
2404 | 2404 |
static unsigned char |
2405 | 2405 |
uudecode(char c) |
2406 | 2406 |
{ |
2407 |
- return(c - ' '); |
|
2407 |
+ return c - ' '; |
|
2408 | 2408 |
} |
2409 | 2409 |
|
2410 | 2410 |
/* |
... | ... |
@@ -2499,3 +2519,183 @@ messageDedup(message *m) |
2499 | 2499 |
} |
2500 | 2500 |
m->dedupedThisFar = t1; |
2501 | 2501 |
} |
2502 |
+ |
|
2503 |
+/* |
|
2504 |
+ * common/simil: |
|
2505 |
+ * From Computing Magazine 20/8/92 |
|
2506 |
+ * Returns a percentage from 0 to 100 - how similar are 2 strings? |
|
2507 |
+ * 100 for exact match, < 0 for error |
|
2508 |
+ */ |
|
2509 |
+ |
|
2510 |
+struct pstr_list { /* internal stack */ |
|
2511 |
+ char *d1; |
|
2512 |
+ struct pstr_list *next; |
|
2513 |
+}; |
|
2514 |
+ |
|
2515 |
+#define OUT_OF_MEMORY (-2) |
|
2516 |
+#define FAILURE (-3) |
|
2517 |
+#define SUCCESS (-4) |
|
2518 |
+#define ARRAY_OVERFLOW (-5) |
|
2519 |
+typedef struct pstr_list ELEMENT1; |
|
2520 |
+typedef ELEMENT1 *LINK1; |
|
2521 |
+ |
|
2522 |
+static int push(LINK1 *top, const char *string); |
|
2523 |
+static int pop(LINK1 *top, char *buffer); |
|
2524 |
+static unsigned int compare(char *ls1, char **rs1, char *ls2, char **rs2); |
|
2525 |
+ |
|
2526 |
+#define MAX_PATTERN_SIZ 40 /* maximum string lengths */ |
|
2527 |
+ |
|
2528 |
+static int |
|
2529 |
+simil(const char *str1, const char *str2) |
|
2530 |
+{ |
|
2531 |
+ LINK1 top = NULL; |
|
2532 |
+ unsigned int score = 0; |
|
2533 |
+ unsigned int common, total, len1; |
|
2534 |
+ unsigned int len2; |
|
2535 |
+ char ls1[MAX_PATTERN_SIZ], ls2[MAX_PATTERN_SIZ]; |
|
2536 |
+ char *rs1 = NULL, *rs2 = NULL; |
|
2537 |
+ char *s1, *s2; |
|
2538 |
+ |
|
2539 |
+ if(strcasecmp(str1, str2) == 0) |
|
2540 |
+ return 100; |
|
2541 |
+ |
|
2542 |
+ if((s1 = strdup(str1)) == NULL) |
|
2543 |
+ return OUT_OF_MEMORY; |
|
2544 |
+ if((s2 = strdup(str2)) == NULL) { |
|
2545 |
+ free(s1); |
|
2546 |
+ return OUT_OF_MEMORY; |
|
2547 |
+ } |
|
2548 |
+ |
|
2549 |
+ if(((total = strstrip(s1)) > MAX_PATTERN_SIZ - 1) || ((len2 = strstrip(s2)) > MAX_PATTERN_SIZ - 1)) { |
|
2550 |
+ free(s1); |
|
2551 |
+ free(s2); |
|
2552 |
+ return ARRAY_OVERFLOW; |
|
2553 |
+ } |
|
2554 |
+ |
|
2555 |
+ total += len2; |
|
2556 |
+ |
|
2557 |
+ if(push(&top, s1) == OUT_OF_MEMORY) |
|
2558 |
+ return OUT_OF_MEMORY; |
|
2559 |
+ if(push(&top, s2) == OUT_OF_MEMORY) |
|
2560 |
+ return OUT_OF_MEMORY; |
|
2561 |
+ |
|
2562 |
+ while(pop(&top, ls2) == SUCCESS) { |
|
2563 |
+ pop(&top, ls1); |
|
2564 |
+ common = compare(ls1, &rs1, ls2, &rs2); |
|
2565 |
+ if(common > 0) { |
|
2566 |
+ score += common; |
|
2567 |
+ len1 = strlen(ls1); |
|
2568 |
+ len2 = strlen(ls2); |
|
2569 |
+ |
|
2570 |
+ if((len1 > 1 && len2 >= 1) || (len2 > 1 && len1 >= 1)) |
|
2571 |
+ if((push(&top, ls1) == OUT_OF_MEMORY) || (push(&top, ls2) == OUT_OF_MEMORY)) { |
|
2572 |
+ free(s1); |
|
2573 |
+ free(s2); |
|
2574 |
+ return OUT_OF_MEMORY; |
|
2575 |
+ } |
|
2576 |
+ len1 = strlen(rs1); |
|
2577 |
+ len2 = strlen(rs2); |
|
2578 |
+ |
|
2579 |
+ if((len1 > 1 && len2 >= 1) || (len2 > 1 && len1 >= 1)) |
|
2580 |
+ if((push(&top, rs1) == OUT_OF_MEMORY) || (push(&top, rs2) == OUT_OF_MEMORY)) { |
|
2581 |
+ free(s1); |
|
2582 |
+ free(s2); |
|
2583 |
+ return OUT_OF_MEMORY; |
|
2584 |
+ } |
|
2585 |
+ } |
|
2586 |
+ } |
|
2587 |
+ free(s1); |
|
2588 |
+ free(s2); |
|
2589 |
+ return (total > 0) ? ((score * 200) / total) : 0; |
|
2590 |
+} |
|
2591 |
+ |
|
2592 |
+static unsigned int |
|
2593 |
+compare(char *ls1, char **rs1, char *ls2, char **rs2) |
|
2594 |
+{ |
|
2595 |
+ unsigned int common, diff, maxchars = 0; |
|
2596 |
+ bool some_similarity = FALSE; |
|
2597 |
+ char *s1, *s2; |
|
2598 |
+ char *maxs1 = NULL, *maxs2 = NULL, *maxe1 = NULL, *maxe2 = NULL; |
|
2599 |
+ char *cs1, *cs2, *start1, *end1, *end2; |
|
2600 |
+ |
|
2601 |
+ end1 = ls1 + strlen(ls1); |
|
2602 |
+ end2 = ls2 + strlen(ls2); |
|
2603 |
+ start1 = ls1; |
|
2604 |
+ |
|
2605 |
+ for(;;) { |
|
2606 |
+ s1 = start1; |
|
2607 |
+ s2 = ls2; |
|
2608 |
+ |
|
2609 |
+ if(s1 < end1) { |
|
2610 |
+ while(s1 < end1 && s2 < end2) { |
|
2611 |
+ if(tolower(*s1) == tolower(*s2)) { |
|
2612 |
+ some_similarity = TRUE; |
|
2613 |
+ cs1 = s1; |
|
2614 |
+ cs2 = s2; |
|
2615 |
+ common = 0; |
|
2616 |
+ do |
|
2617 |
+ if(s1 == end1 || s2 == end2) |
|
2618 |
+ break; |
|
2619 |
+ else { |
|
2620 |
+ s1++; |
|
2621 |
+ s2++; |
|
2622 |
+ common++; |
|
2623 |
+ } |
|
2624 |
+ while(tolower(*s1) == tolower(*s2)); |
|
2625 |
+ |
|
2626 |
+ if(common > maxchars) { |
|
2627 |
+ diff = common - maxchars; |
|
2628 |
+ maxchars = common; |
|
2629 |
+ maxs1 = cs1; |
|
2630 |
+ maxs2 = cs2; |
|
2631 |
+ maxe1 = s1; |
|
2632 |
+ maxe2 = s2; |
|
2633 |
+ end1 -= diff; |
|
2634 |
+ end2 -= diff; |
|
2635 |
+ } else |
|
2636 |
+ s1 -= common; |
|
2637 |
+ } else |
|
2638 |
+ s2++; |
|
2639 |
+ } |
|
2640 |
+ start1++; |
|
2641 |
+ } else |
|
2642 |
+ break; |
|
2643 |
+ } |
|
2644 |
+ if(some_similarity) { |
|
2645 |
+ *maxs1 = '\0'; |
|
2646 |
+ *maxs2 = '\0'; |
|
2647 |
+ *rs1 = maxe1; |
|
2648 |
+ *rs2 = maxe2; |
|
2649 |
+ } |
|
2650 |
+ return maxchars; |
|
2651 |
+} |
|
2652 |
+ |
|
2653 |
+static int |
|
2654 |
+push(LINK1 *top, const char *string) |
|
2655 |
+{ |
|
2656 |
+ LINK1 element; |
|
2657 |
+ |
|
2658 |
+ if((element = (LINK1)cli_malloc(sizeof(ELEMENT1))) == NULL) |
|
2659 |
+ return OUT_OF_MEMORY; |
|
2660 |
+ if((element->d1 = strdup(string)) == NULL) |
|
2661 |
+ return OUT_OF_MEMORY; |
|
2662 |
+ element->next = *top; |
|
2663 |
+ *top = element; |
|
2664 |
+ |
|
2665 |
+ return SUCCESS; |
|
2666 |
+} |
|
2667 |
+ |
|
2668 |
+static int |
|
2669 |
+pop(LINK1 *top, char *buffer) |
|
2670 |
+{ |
|
2671 |
+ LINK1 t1; |
|
2672 |
+ |
|
2673 |
+ if((t1 = *top) != NULL) { |
|
2674 |
+ (void)strcpy(buffer, t1->d1); |
|
2675 |
+ *top = t1->next; |
|
2676 |
+ free(t1->d1); |
|
2677 |
+ free((char *)t1); |
|
2678 |
+ return SUCCESS; |
|
2679 |
+ } |
|
2680 |
+ return FAILURE; |
|
2681 |
+} |