git-svn: trunk@2253
Nigel Horne authored on 2006/09/14 06:40:03... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Wed Sep 13 22:38:22 BST 2006 (njh) |
|
2 |
+---------------------------------- |
|
3 |
+ * libclamav/mbox.c: Committed ACAB's merge of Edvin's Phish code, |
|
4 |
+ configure --enable-experimental to use it. |
|
5 |
+ |
|
1 | 6 |
Wed Sep 13 19:41:20 CEST 2006 (acab) |
2 | 7 |
------------------------------------ |
3 | 8 |
* libclamav/mbox.c: checkURLs reprototyped to prepare the merge of the |
... | ... |
@@ -16,7 +16,7 @@ |
16 | 16 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
17 | 17 |
* MA 02110-1301, USA. |
18 | 18 |
*/ |
19 |
-static char const rcsid[] = "$Id: mbox.c,v 1.329 2006/09/13 17:43:57 acab Exp $"; |
|
19 |
+static char const rcsid[] = "$Id: mbox.c,v 1.330 2006/09/13 21:37:49 njh Exp $"; |
|
20 | 20 |
|
21 | 21 |
#if HAVE_CONFIG_H |
22 | 22 |
#include "clamav-config.h" |
... | ... |
@@ -129,9 +129,16 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; |
129 | 129 |
*/ |
130 | 130 |
#endif |
131 | 131 |
|
132 |
+#if defined(FOLLOWURLS) || defined(CL_EXPERIMENTAL) |
|
133 |
+#include "htmlnorm.h" |
|
134 |
+#endif |
|
135 |
+ |
|
136 |
+#ifdef CL_EXPERIMENTAL |
|
137 |
+#include "phishcheck.h" |
|
138 |
+#endif |
|
139 |
+ |
|
132 | 140 |
#ifdef FOLLOWURLS |
133 | 141 |
|
134 |
-#include "htmlnorm.h" |
|
135 | 142 |
|
136 | 143 |
#ifdef WITH_CURL /* Set in configure */ |
137 | 144 |
/* |
... | ... |
@@ -228,7 +235,14 @@ static message *do_multipart(message *mainMessage, message **messages, int i, in |
228 | 228 |
static int count_quotes(const char *buf); |
229 | 229 |
static bool next_is_folded_header(const text *t); |
230 | 230 |
|
231 |
-static void checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html); |
|
231 |
+static void checkURLs(message *m, mbox_ctx *mctx,int *rc,int is_html); |
|
232 |
+ |
|
233 |
+#ifdef CL_EXPERIMENTAL |
|
234 |
+static void do_checkURLs(message *m, const char *dir,tag_arguments_t* hrefs); |
|
235 |
+static blob* getHrefs(message* m,tag_arguments_t* hrefs); |
|
236 |
+static void hrefs_done(blob *b,tag_arguments_t* hrefs); |
|
237 |
+#endif |
|
238 |
+ |
|
232 | 239 |
#ifdef WITH_CURL |
233 | 240 |
struct arg { |
234 | 241 |
const char *url; |
... | ... |
@@ -379,6 +393,8 @@ static void add_to_map(const char *offset, const char *word); |
379 | 379 |
static const char *find_in_map(const char *offset, const char *word); |
380 | 380 |
static void free_map(void); |
381 | 381 |
|
382 |
+ |
|
383 |
+ |
|
382 | 384 |
/* |
383 | 385 |
* This could be the future. Instead of parsing and decoding it just decodes. |
384 | 386 |
* |
... | ... |
@@ -1595,7 +1611,13 @@ parseEmailFile(FILE *fin, const table_t *rfc821, const char *firstLine, const ch |
1595 | 1595 |
break; |
1596 | 1596 |
} else { |
1597 | 1597 |
if(line == NULL) { |
1598 |
- if(lastBodyLineWasBlank) { |
|
1598 |
+ /* |
|
1599 |
+ * Although this would save time and RAM, some |
|
1600 |
+ * phish signatures have been built which need |
|
1601 |
+ * the blank lines |
|
1602 |
+ */ |
|
1603 |
+ if(lastBodyLineWasBlank && |
|
1604 |
+ (messageGetMimeType(ret) != TEXT)) { |
|
1599 | 1605 |
cli_dbgmsg("Ignoring consecutive blank lines in the body\n"); |
1600 | 1606 |
continue; |
1601 | 1607 |
} |
... | ... |
@@ -1916,7 +1938,9 @@ parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx) |
1916 | 1916 |
message *mainMessage = messageIn; |
1917 | 1917 |
fileblob *fb; |
1918 | 1918 |
bool infected = FALSE; |
1919 |
- |
|
1919 |
+#ifdef CL_EXPERIMENTAL |
|
1920 |
+ const int doPhishingScan = !(mctx->ctx->options&CL_SCAN_NOPHISHING); /* || (mctx->ctx->options&CL_SCAN_PHISHING_GA_TRAIN) || (mctx->ctx->options&CL_SCAN_PHISHING_GA); kept here for the GA MERGE */ |
|
1921 |
+#endif |
|
1920 | 1922 |
cli_dbgmsg("in parseEmailBody\n"); |
1921 | 1923 |
|
1922 | 1924 |
/* Anything left to be parsed? */ |
... | ... |
@@ -1964,16 +1988,29 @@ parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx) |
1964 | 1964 |
case NOMIME: |
1965 | 1965 |
cli_dbgmsg("Not a mime encoded message\n"); |
1966 | 1966 |
aText = textAddMessage(aText, mainMessage); |
1967 |
+#ifdef CL_EXPERIMENTAL |
|
1968 |
+ if(!doPhishingScan) /*else: fall-through: some phishing mails claim they are text/plain, when they are indeed html*/ |
|
1969 |
+#endif |
|
1967 | 1970 |
break; |
1968 | 1971 |
case TEXT: |
1969 | 1972 |
/* text/plain has been preprocessed as no encoding */ |
1973 |
+#ifdef CL_EXPERIMENTAL |
|
1974 |
+ if(subtype==HTML || doPhishingScan) { |
|
1975 |
+#else |
|
1970 | 1976 |
if((mctx->ctx->options&CL_SCAN_MAILURL) && (subtype == HTML)) |
1977 |
+#endif |
|
1971 | 1978 |
/* |
1972 | 1979 |
* It would be better to save and scan the |
1973 | 1980 |
* file and only checkURLs if it's found to be |
1974 | 1981 |
* clean |
1975 | 1982 |
*/ |
1976 |
- checkURLs(mainMessage, mctx, &rc, 1); |
|
1983 |
+ checkURLs(mainMessage, mctx, &rc,subtype==HTML);/* there might be html sent without subtype html too, |
|
1984 |
+ so scan them for phishing too*/ |
|
1985 |
+#ifdef CL_EXPERIMENTAL |
|
1986 |
+ if(rc==3) |
|
1987 |
+ infected=TRUE; |
|
1988 |
+ } |
|
1989 |
+#endif |
|
1977 | 1990 |
break; |
1978 | 1991 |
case MULTIPART: |
1979 | 1992 |
cli_dbgmsg("Content-type 'multipart' handler\n"); |
... | ... |
@@ -3701,9 +3738,234 @@ rfc1341(message *m, const char *dir) |
3701 | 3701 |
} |
3702 | 3702 |
#endif |
3703 | 3703 |
|
3704 |
+#ifdef CL_EXPERIMENTAL |
|
3705 |
+static void |
|
3706 |
+hrefs_done(blob *b, tag_arguments_t *hrefs) |
|
3707 |
+{ |
|
3708 |
+ if(b) |
|
3709 |
+ blobDestroy(b); |
|
3710 |
+ html_tag_arg_free(hrefs); |
|
3711 |
+} |
|
3712 |
+ |
|
3713 |
+/* |
|
3714 |
+ * This used to be part of checkURLs, split out, because phishingScan needs it |
|
3715 |
+ * too, and phishingScan might be used in situations where checkURLs is |
|
3716 |
+ * disabled (see ifdef) |
|
3717 |
+ */ |
|
3718 |
+static blob * |
|
3719 |
+getHrefs(message *m, tag_arguments_t *hrefs) |
|
3720 |
+{ |
|
3721 |
+ blob *b = messageToBlob(m,0); |
|
3722 |
+ size_t len; |
|
3723 |
+ |
|
3724 |
+ if(b == NULL) |
|
3725 |
+ return NULL; |
|
3726 |
+ |
|
3727 |
+ len = blobGetDataSize(b); |
|
3728 |
+ |
|
3729 |
+ if(len == 0) { |
|
3730 |
+ blobDestroy(b); |
|
3731 |
+ return NULL; |
|
3732 |
+ } |
|
3733 |
+ |
|
3734 |
+ /* TODO: make this size customisable */ |
|
3735 |
+ if(len > 100*1024) { |
|
3736 |
+ cli_warnmsg("Viruses pointed to by URL not scanned in large message\n"); |
|
3737 |
+ blobDestroy(b); |
|
3738 |
+ return NULL; |
|
3739 |
+ } |
|
3740 |
+ |
|
3741 |
+ blobClose(b); |
|
3742 |
+ |
|
3743 |
+ hrefs->count = 0; |
|
3744 |
+ hrefs->tag = hrefs->value = NULL; |
|
3745 |
+ hrefs->contents = NULL; |
|
3746 |
+ |
|
3747 |
+ cli_dbgmsg("checkURLs: calling html_normalise_mem\n"); |
|
3748 |
+ if(!html_normalise_mem(blobGetData(b), len, NULL, hrefs)) { |
|
3749 |
+ blobDestroy(b); |
|
3750 |
+ return NULL; |
|
3751 |
+ } |
|
3752 |
+ cli_dbgmsg("checkURLs: html_normalise_mem returned\n"); |
|
3753 |
+ |
|
3754 |
+ /* TODO: Do we need to call remove_html_comments? */ |
|
3755 |
+ return b; |
|
3756 |
+} |
|
3757 |
+ |
|
3758 |
+static void |
|
3759 |
+checkURLs(message *mainMessage, mbox_ctx *mctx, int *rc, int is_html) |
|
3760 |
+{ |
|
3761 |
+ tag_arguments_t hrefs; |
|
3762 |
+ blob *b; |
|
3763 |
+ |
|
3764 |
+ hrefs.scanContents = (!(mctx->ctx->options&CL_SCAN_NOPHISHING)); /* aCaB: stripped GA related stuff */ |
|
3765 |
+ |
|
3766 |
+#if (!defined(FOLLOWURLS)) || (FOLLOWURLS <= 0) |
|
3767 |
+ if(!hrefs.scanContents) |
|
3768 |
+ /* |
|
3769 |
+ * Don't waste time extracting hrefs (parsing html), nobody |
|
3770 |
+ * will need it |
|
3771 |
+ */ |
|
3772 |
+ return; |
|
3773 |
+#endif |
|
3774 |
+ |
|
3775 |
+ hrefs.count = 0; |
|
3776 |
+ hrefs.tag = hrefs.value = NULL; |
|
3777 |
+ hrefs.contents = NULL; |
|
3778 |
+ |
|
3779 |
+ b = getHrefs(mainMessage, &hrefs); |
|
3780 |
+ if(b) { |
|
3781 |
+ if(!(mctx->ctx->options&CL_SCAN_NOPHISHING)) { |
|
3782 |
+ if(phishingScan(mainMessage,mctx->dir,mctx->ctx,&hrefs) == CL_VIRUS) { |
|
3783 |
+ mainMessage->isInfected = TRUE; |
|
3784 |
+ *rc = 3; |
|
3785 |
+ cli_dbgmsg("PH:Phishing found\n"); |
|
3786 |
+ } |
|
3787 |
+ } |
|
3788 |
+ if(is_html && mctx->ctx->options&CL_SCAN_MAILURL) |
|
3789 |
+ do_checkURLs(mainMessage, mctx->dir,&hrefs); |
|
3790 |
+ } |
|
3791 |
+ hrefs_done(b,&hrefs); |
|
3792 |
+} |
|
3793 |
+ |
|
3794 |
+#if defined(FOLLOWURLS) && (FOLLOWURLS > 0) |
|
3795 |
+static void |
|
3796 |
+do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs) |
|
3797 |
+{ |
|
3798 |
+ table_t *t; |
|
3799 |
+ int i, n; |
|
3800 |
+#if defined(WITH_CURL) && defined(CL_THREAD_SAFE) |
|
3801 |
+ pthread_t tid[FOLLOWURLS]; |
|
3802 |
+ struct arg args[FOLLOWURLS]; |
|
3803 |
+#endif |
|
3804 |
+ |
|
3805 |
+ t = tableCreate(); |
|
3806 |
+ if(t == NULL) |
|
3807 |
+ return; |
|
3808 |
+ |
|
3809 |
+ n = 0; |
|
3810 |
+ |
|
3811 |
+ for(i = 0; i < hrefs->count; i++) { |
|
3812 |
+ const char *url = (const char *)hrefs->value[i]; |
|
3813 |
+ |
|
3814 |
+ /* |
|
3815 |
+ * TODO: If it's an image source, it'd be nice to note beacons |
|
3816 |
+ * where width="0" height="0", which needs support from |
|
3817 |
+ * the HTML normalise code |
|
3818 |
+ */ |
|
3819 |
+ if(strncasecmp("http://", url, 7) == 0) { |
|
3820 |
+ char *ptr; |
|
3821 |
+#ifdef WITH_CURL |
|
3822 |
+#ifndef CL_THREAD_SAFE |
|
3823 |
+ struct arg arg; |
|
3824 |
+#endif |
|
3825 |
+ |
|
3826 |
+#else /*!WITH_CURL*/ |
|
3827 |
+#ifdef CL_THREAD_SAFE |
|
3828 |
+ static pthread_mutex_t system_mutex = PTHREAD_MUTEX_INITIALIZER; |
|
3829 |
+#endif |
|
3830 |
+ struct stat statb; |
|
3831 |
+ char cmd[512]; |
|
3832 |
+#endif /*WITH_CURL*/ |
|
3833 |
+ char name[NAME_MAX + 1]; |
|
3834 |
+ |
|
3835 |
+ if(tableFind(t, url) == 1) { |
|
3836 |
+ cli_dbgmsg("URL %s already downloaded\n", url); |
|
3837 |
+ continue; |
|
3838 |
+ } |
|
3839 |
+ /* |
|
3840 |
+ * What about foreign character spoofing? |
|
3841 |
+ * It would be useful be able to check if url |
|
3842 |
+ * is the same as the text displayed, e.g. |
|
3843 |
+ * <a href="http://dodgy.biz">www.paypal.com</a> |
|
3844 |
+ * but that needs support from HTML normalise |
|
3845 |
+ */ |
|
3846 |
+ if(strchr(url, '%') && strchr(url, '@')) |
|
3847 |
+ cli_warnmsg("Possible URL spoofing attempt noticed, but not yet handled (%s)\n", url); |
|
3848 |
+ |
|
3849 |
+ if(n == FOLLOWURLS) { |
|
3850 |
+ cli_warnmsg("URL %s will not be scanned\n", url); |
|
3851 |
+ break; |
|
3852 |
+ } |
|
3853 |
+ |
|
3854 |
+ (void)tableInsert(t, url, 1); |
|
3855 |
+ cli_dbgmsg("Downloading URL %s to be scanned\n", url); |
|
3856 |
+ strncpy(name, url, sizeof(name) - 1); |
|
3857 |
+ name[sizeof(name) - 1] = '\0'; |
|
3858 |
+ for(ptr = name; *ptr; ptr++) |
|
3859 |
+ if(*ptr == '/') |
|
3860 |
+ *ptr = '_'; |
|
3861 |
+ |
|
3862 |
+#ifdef WITH_CURL |
|
3863 |
+#ifdef CL_THREAD_SAFE |
|
3864 |
+ args[n].dir = dir; |
|
3865 |
+ args[n].url = url; |
|
3866 |
+ args[n].filename = strdup(name); |
|
3867 |
+ pthread_create(&tid[n], NULL, getURL, &args[n]); |
|
3868 |
+#else |
|
3869 |
+ arg.url = url; |
|
3870 |
+ arg.dir = dir; |
|
3871 |
+ arg.filename = name; |
|
3872 |
+ getURL(&arg); |
|
3873 |
+#endif |
|
3874 |
+ |
|
3875 |
+#else /*!WITH_CURL*/ |
|
3876 |
+ cli_warnmsg("The use of mail-follow-urls without CURL being installed is deprecated\n"); |
|
3877 |
+ /* |
|
3878 |
+ * TODO: maximum size and timeouts |
|
3879 |
+ */ |
|
3880 |
+ len = sizeof(cmd) - 26 - strlen(dir) - strlen(name); |
|
3881 |
+#ifdef CL_DEBUG |
|
3882 |
+ snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s", len, url, dir, name); |
|
3883 |
+#else |
|
3884 |
+ snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s 2>/dev/null", len, url, dir, name); |
|
3885 |
+#endif |
|
3886 |
+ cmd[sizeof(cmd) - 1] = '\0'; |
|
3887 |
+ |
|
3888 |
+ cli_dbgmsg("%s\n", cmd); |
|
3889 |
+#ifdef CL_THREAD_SAFE |
|
3890 |
+ pthread_mutex_lock(&system_mutex); |
|
3891 |
+#endif |
|
3892 |
+ system(cmd); |
|
3893 |
+#ifdef CL_THREAD_SAFE |
|
3894 |
+ pthread_mutex_unlock(&system_mutex); |
|
3895 |
+#endif |
|
3896 |
+ snprintf(cmd, sizeof(cmd), "%s/%s", dir, name); |
|
3897 |
+ if(stat(cmd, &statb) >= 0) |
|
3898 |
+ if(statb.st_size == 0) { |
|
3899 |
+ cli_warnmsg("URL %s failed to download\n", url); |
|
3900 |
+ /* |
|
3901 |
+ * Don't bother scanning an empty file |
|
3902 |
+ */ |
|
3903 |
+ (void)unlink(cmd); |
|
3904 |
+ } |
|
3905 |
+#endif |
|
3906 |
+ ++n; |
|
3907 |
+ } |
|
3908 |
+ } |
|
3909 |
+ tableDestroy(t); |
|
3910 |
+ |
|
3911 |
+#if defined(WITH_CURL) && defined(CL_THREAD_SAFE) |
|
3912 |
+ assert(n <= FOLLOWURLS); |
|
3913 |
+ cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n); |
|
3914 |
+ while(--n >= 0) { |
|
3915 |
+ pthread_join(tid[n], NULL); |
|
3916 |
+ free(args[n].filename); |
|
3917 |
+ } |
|
3918 |
+#endif |
|
3919 |
+} |
|
3920 |
+#else |
|
3921 |
+static void |
|
3922 |
+do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs) |
|
3923 |
+{ |
|
3924 |
+} |
|
3925 |
+#endif |
|
3926 |
+ |
|
3927 |
+#else /*!CL_EXPERIMENTAL*/ |
|
3928 |
+ |
|
3704 | 3929 |
#if defined(FOLLOWURLS) && (FOLLOWURLS > 0) |
3705 | 3930 |
static void |
3706 |
-checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html) |
|
3931 |
+checkURLs(message *m, mbox_ctx *mctx, int* rc, int is_html) |
|
3707 | 3932 |
{ |
3708 | 3933 |
blob *b = messageToBlob(m, 0); |
3709 | 3934 |
size_t len; |
... | ... |
@@ -3866,6 +4128,16 @@ checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html) |
3866 | 3866 |
html_tag_arg_free(&hrefs); |
3867 | 3867 |
} |
3868 | 3868 |
|
3869 |
+#else |
|
3870 |
+ |
|
3871 |
+static void |
|
3872 |
+checkURLs(message *m, mbox_ctx *mctx, int* rc, int is_html) |
|
3873 |
+{ |
|
3874 |
+} |
|
3875 |
+#endif |
|
3876 |
+#endif /* ! CL_EXPERIMENTAL */ |
|
3877 |
+ |
|
3878 |
+#if defined(FOLLOWURLS) && (FOLLOWURLS>0) |
|
3869 | 3879 |
/* |
3870 | 3880 |
* Includes some Win32 patches by Gianluigi Tiesi <sherpya@netfarm.it> |
3871 | 3881 |
* |
... | ... |
@@ -4020,14 +4292,7 @@ getURL(struct arg *arg) |
4020 | 4020 |
return NULL; |
4021 | 4021 |
} |
4022 | 4022 |
#endif |
4023 |
- |
|
4024 |
-#else |
|
4025 |
-static void |
|
4026 |
-checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html) |
|
4027 |
-{ |
|
4028 |
-} |
|
4029 | 4023 |
#endif |
4030 |
- |
|
4031 | 4024 |
#ifdef HAVE_BACKTRACE |
4032 | 4025 |
static void |
4033 | 4026 |
sigsegv(int sig) |
... | ... |
@@ -4377,7 +4642,7 @@ do_multipart(message *mainMessage, message **messages, int i, int *rc, mbox_ctx |
4377 | 4377 |
} else { |
4378 | 4378 |
if(mctx->ctx->options&CL_SCAN_MAILURL) |
4379 | 4379 |
if(tableFind(mctx->subtypeTable, cptr) == HTML) |
4380 |
- checkURLs(aMessage, mctx, &rc, 1); |
|
4380 |
+ checkURLs(aMessage, mctx, rc, 1); |
|
4381 | 4381 |
messageAddArgument(aMessage, |
4382 | 4382 |
"filename=mixedtextportion"); |
4383 | 4383 |
} |