Browse code

Added Edvin's SOC code

git-svn: trunk@2253

Nigel Horne authored on 2006/09/14 06:40:03
Showing 2 changed files
... ...
@@ -1,3 +1,8 @@
1
+Wed Sep 13 22:38:22 BST 2006 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	Committed ACAB's merge of Edvin's Phish code,
4
+  			configure with --enable-experimental to use it.
5
+
1 6
 Wed Sep 13 19:41:20 CEST 2006 (acab)
2 7
 ------------------------------------
3 8
   * libclamav/mbox.c: checkURLs reprototyped to prepare the merge of the
... ...
@@ -16,7 +16,7 @@
16 16
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 17
  *  MA 02110-1301, USA.
18 18
  */
19
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.329 2006/09/13 17:43:57 acab Exp $";
19
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.330 2006/09/13 21:37:49 njh Exp $";
20 20
 
21 21
 #if HAVE_CONFIG_H
22 22
 #include "clamav-config.h"
... ...
@@ -129,9 +129,16 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
129 129
 				 */
130 130
 #endif
131 131
 
132
+#if defined(FOLLOWURLS) || defined(CL_EXPERIMENTAL)
133
+#include "htmlnorm.h"
134
+#endif
135
+
136
+#ifdef CL_EXPERIMENTAL
137
+#include "phishcheck.h"
138
+#endif
139
+
132 140
 #ifdef	FOLLOWURLS
133 141
 
134
-#include "htmlnorm.h"
135 142
 
136 143
 #ifdef	WITH_CURL	/* Set in configure */
137 144
 /*
... ...
@@ -228,7 +235,14 @@ static	message	*do_multipart(message *mainMessage, message **messages, int i, in
228 228
 static	int	count_quotes(const char *buf);
229 229
 static	bool	next_is_folded_header(const text *t);
230 230
 
231
-static	void	checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html);
231
+static	void	checkURLs(message *m, mbox_ctx *mctx,int *rc,int is_html);
232
+
233
+#ifdef CL_EXPERIMENTAL
234
+static	void	do_checkURLs(message *m, const char *dir,tag_arguments_t* hrefs);
235
+static	blob*	getHrefs(message* m,tag_arguments_t* hrefs);
236
+static	void	hrefs_done(blob *b,tag_arguments_t* hrefs);
237
+#endif
238
+
232 239
 #ifdef	WITH_CURL
233 240
 struct arg {
234 241
 	const char *url;
... ...
@@ -379,6 +393,8 @@ static	void	add_to_map(const char *offset, const char *word);
379 379
 static	const	char	*find_in_map(const char *offset, const char *word);
380 380
 static	void	free_map(void);
381 381
 
382
+
383
+
382 384
 /*
383 385
  * This could be the future. Instead of parsing and decoding it just decodes.
384 386
  *
... ...
@@ -1595,7 +1611,13 @@ parseEmailFile(FILE *fin, const table_t *rfc821, const char *firstLine, const ch
1595 1595
 					break;
1596 1596
 		} else {
1597 1597
 			if(line == NULL) {
1598
-				if(lastBodyLineWasBlank) {
1598
+				/*
1599
+				 * Although skipping blank lines would save time and RAM, some
1600
+				 * phish signatures have been built which need
1601
+				 * the blank lines
1602
+				 */
1603
+				if(lastBodyLineWasBlank &&
1604
+				  (messageGetMimeType(ret) != TEXT)) {
1599 1605
 					cli_dbgmsg("Ignoring consecutive blank lines in the body\n");
1600 1606
 					continue;
1601 1607
 				}
... ...
@@ -1916,7 +1938,9 @@ parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx)
1916 1916
 	message *mainMessage = messageIn;
1917 1917
 	fileblob *fb;
1918 1918
 	bool infected = FALSE;
1919
-
1919
+#ifdef CL_EXPERIMENTAL
1920
+	const int doPhishingScan = !(mctx->ctx->options&CL_SCAN_NOPHISHING); /* || (mctx->ctx->options&CL_SCAN_PHISHING_GA_TRAIN) || (mctx->ctx->options&CL_SCAN_PHISHING_GA);  kept here for the GA MERGE */
1921
+#endif
1920 1922
 	cli_dbgmsg("in parseEmailBody\n");
1921 1923
 
1922 1924
 	/* Anything left to be parsed? */
... ...
@@ -1964,16 +1988,29 @@ parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx)
1964 1964
 		case NOMIME:
1965 1965
 			cli_dbgmsg("Not a mime encoded message\n");
1966 1966
 			aText = textAddMessage(aText, mainMessage);
1967
+#ifdef CL_EXPERIMENTAL
1968
+			if(!doPhishingScan) /*else: fall-through: some phishing mails claim they are text/plain, when they are in fact html*/
1969
+#endif
1967 1970
 			break;
1968 1971
 		case TEXT:
1969 1972
 			/* text/plain has been preprocessed as no encoding */
1973
+#ifdef CL_EXPERIMENTAL
1974
+			if(subtype==HTML || doPhishingScan) {
1975
+#else
1970 1976
 			if((mctx->ctx->options&CL_SCAN_MAILURL) && (subtype == HTML))
1977
+#endif
1971 1978
 				/*
1972 1979
 				 * It would be better to save and scan the
1973 1980
 				 * file and only checkURLs if it's found to be
1974 1981
 				 * clean
1975 1982
 				 */
1976
-				checkURLs(mainMessage, mctx, &rc, 1);
1983
+				checkURLs(mainMessage, mctx, &rc,subtype==HTML);/* there might be html sent without subtype html too,
1984
+													so scan them for phishing too*/
1985
+#ifdef CL_EXPERIMENTAL
1986
+				if(rc==3)
1987
+				infected=TRUE;
1988
+			}
1989
+#endif
1977 1990
 			break;
1978 1991
 		case MULTIPART:
1979 1992
 			cli_dbgmsg("Content-type 'multipart' handler\n");
... ...
@@ -3701,9 +3738,234 @@ rfc1341(message *m, const char *dir)
3701 3701
 }
3702 3702
 #endif
3703 3703
 
3704
+#ifdef CL_EXPERIMENTAL
3705
+static void
3706
+hrefs_done(blob *b, tag_arguments_t *hrefs)
3707
+{
3708
+	if(b)
3709
+		blobDestroy(b);
3710
+	html_tag_arg_free(hrefs);
3711
+}
3712
+
3713
+/*
3714
+ * This used to be part of checkURLs, split out, because phishingScan needs it
3715
+ * too, and phishingScan might be used in situations where checkURLs is
3716
+ * disabled (see ifdef)
3717
+ */
3718
+static blob *
3719
+getHrefs(message *m, tag_arguments_t *hrefs)
3720
+{
3721
+	blob *b = messageToBlob(m,0);
3722
+	size_t len;
3723
+
3724
+	if(b == NULL)
3725
+		return NULL;
3726
+
3727
+	len = blobGetDataSize(b);
3728
+
3729
+	if(len == 0) {
3730
+		blobDestroy(b);
3731
+		return NULL;
3732
+	}
3733
+
3734
+	/* TODO: make this size customisable */
3735
+	if(len > 100*1024) {
3736
+		cli_warnmsg("Viruses pointed to by URL not scanned in large message\n");
3737
+		blobDestroy(b);
3738
+		return NULL;
3739
+	}
3740
+
3741
+	blobClose(b);
3742
+
3743
+	hrefs->count = 0;
3744
+	hrefs->tag = hrefs->value = NULL;
3745
+	hrefs->contents = NULL;
3746
+
3747
+	cli_dbgmsg("checkURLs: calling html_normalise_mem\n");
3748
+	if(!html_normalise_mem(blobGetData(b), len, NULL, hrefs)) {
3749
+		blobDestroy(b);
3750
+		return NULL;
3751
+	}
3752
+	cli_dbgmsg("checkURLs: html_normalise_mem returned\n");
3753
+
3754
+	/* TODO: Do we need to call remove_html_comments? */
3755
+	return b;
3756
+}
3757
+
3758
+static void
3759
+checkURLs(message *mainMessage, mbox_ctx *mctx, int *rc, int is_html)
3760
+{
3761
+       tag_arguments_t hrefs;
3762
+       blob *b;
3763
+
3764
+       hrefs.scanContents = (!(mctx->ctx->options&CL_SCAN_NOPHISHING)); /* aCaB: stripped GA related stuff */
3765
+
3766
+#if    (!defined(FOLLOWURLS)) || (FOLLOWURLS <= 0)
3767
+       if(!hrefs.scanContents)
3768
+	       /*
3769
+		* Don't waste time extracting hrefs (parsing html), nobody
3770
+		* will need it
3771
+		*/
3772
+		return;
3773
+#endif
3774
+
3775
+       hrefs.count = 0;
3776
+       hrefs.tag = hrefs.value = NULL;
3777
+       hrefs.contents = NULL;
3778
+
3779
+       b = getHrefs(mainMessage, &hrefs);
3780
+       if(b) {
3781
+	       if(!(mctx->ctx->options&CL_SCAN_NOPHISHING)) {
3782
+		       if(phishingScan(mainMessage,mctx->dir,mctx->ctx,&hrefs) == CL_VIRUS) {
3783
+			       mainMessage->isInfected = TRUE;
3784
+			       *rc = 3;
3785
+			       cli_dbgmsg("PH:Phishing found\n");
3786
+		       }
3787
+	       }
3788
+	       if(is_html && mctx->ctx->options&CL_SCAN_MAILURL)
3789
+		       do_checkURLs(mainMessage, mctx->dir,&hrefs);
3790
+       }
3791
+       hrefs_done(b,&hrefs);
3792
+}
3793
+
3794
+#if	defined(FOLLOWURLS) && (FOLLOWURLS > 0)
3795
+static void
3796
+do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs)
3797
+{
3798
+	table_t *t;
3799
+	int i, n;
3800
+#if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
3801
+	pthread_t tid[FOLLOWURLS];
3802
+	struct arg args[FOLLOWURLS];
3803
+#endif
3804
+
3805
+	t = tableCreate();
3806
+	if(t == NULL)
3807
+		return;
3808
+
3809
+	n = 0;
3810
+
3811
+	for(i = 0; i < hrefs->count; i++) {
3812
+		const char *url = (const char *)hrefs->value[i];
3813
+
3814
+		/*
3815
+		 * TODO: If it's an image source, it'd be nice to note beacons
3816
+		 *	where width="0" height="0", which needs support from
3817
+		 *	the HTML normalise code
3818
+		 */
3819
+		if(strncasecmp("http://", url, 7) == 0) {
3820
+			char *ptr;
3821
+#ifdef	WITH_CURL
3822
+#ifndef	CL_THREAD_SAFE
3823
+			struct arg arg;
3824
+#endif
3825
+
3826
+#else	/*!WITH_CURL*/
3827
+#ifdef	CL_THREAD_SAFE
3828
+			static pthread_mutex_t system_mutex = PTHREAD_MUTEX_INITIALIZER;
3829
+#endif
3830
+			struct stat statb;
3831
+			char cmd[512];
3832
+#endif	/*WITH_CURL*/
3833
+			char name[NAME_MAX + 1];
3834
+
3835
+			if(tableFind(t, url) == 1) {
3836
+				cli_dbgmsg("URL %s already downloaded\n", url);
3837
+				continue;
3838
+			}
3839
+			/*
3840
+			 * What about foreign character spoofing?
3841
+			 * It would be useful to be able to check if url
3842
+			 *	is the same as the text displayed, e.g.
3843
+			 *	<a href="http://dodgy.biz">www.paypal.com</a>
3844
+			 *	but that needs support from HTML normalise
3845
+			 */
3846
+			if(strchr(url, '%') && strchr(url, '@'))
3847
+				cli_warnmsg("Possible URL spoofing attempt noticed, but not yet handled (%s)\n", url);
3848
+
3849
+			if(n == FOLLOWURLS) {
3850
+				cli_warnmsg("URL %s will not be scanned\n", url);
3851
+				break;
3852
+			}
3853
+
3854
+			(void)tableInsert(t, url, 1);
3855
+			cli_dbgmsg("Downloading URL %s to be scanned\n", url);
3856
+			strncpy(name, url, sizeof(name) - 1);
3857
+			name[sizeof(name) - 1] = '\0';
3858
+			for(ptr = name; *ptr; ptr++)
3859
+				if(*ptr == '/')
3860
+					*ptr = '_';
3861
+
3862
+#ifdef	WITH_CURL
3863
+#ifdef	CL_THREAD_SAFE
3864
+			args[n].dir = dir;
3865
+			args[n].url = url;
3866
+			args[n].filename = strdup(name);
3867
+			pthread_create(&tid[n], NULL, getURL, &args[n]);
3868
+#else
3869
+			arg.url = url;
3870
+			arg.dir = dir;
3871
+			arg.filename = name;
3872
+			getURL(&arg);
3873
+#endif
3874
+
3875
+#else	/*!WITH_CURL*/
3876
+			cli_warnmsg("The use of mail-follow-urls without CURL being installed is deprecated\n");
3877
+			/*
3878
+			 * TODO: maximum size and timeouts
3879
+			 */
3880
+			len = sizeof(cmd) - 26 - strlen(dir) - strlen(name);
3881
+#ifdef	CL_DEBUG
3882
+			snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s", len, url, dir, name);
3883
+#else
3884
+			snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s 2>/dev/null", len, url, dir, name);
3885
+#endif
3886
+			cmd[sizeof(cmd) - 1] = '\0';
3887
+
3888
+			cli_dbgmsg("%s\n", cmd);
3889
+#ifdef	CL_THREAD_SAFE
3890
+			pthread_mutex_lock(&system_mutex);
3891
+#endif
3892
+			system(cmd);
3893
+#ifdef	CL_THREAD_SAFE
3894
+			pthread_mutex_unlock(&system_mutex);
3895
+#endif
3896
+			snprintf(cmd, sizeof(cmd), "%s/%s", dir, name);
3897
+			if(stat(cmd, &statb) >= 0)
3898
+				if(statb.st_size == 0) {
3899
+					cli_warnmsg("URL %s failed to download\n", url);
3900
+					/*
3901
+					 * Don't bother scanning an empty file
3902
+					 */
3903
+					(void)unlink(cmd);
3904
+				}
3905
+#endif
3906
+			++n;
3907
+		}
3908
+	}
3909
+	tableDestroy(t);
3910
+
3911
+#if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
3912
+	assert(n <= FOLLOWURLS);
3913
+	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);
3914
+	while(--n >= 0) {
3915
+		pthread_join(tid[n], NULL);
3916
+		free(args[n].filename);
3917
+	}
3918
+#endif
3919
+}
3920
+#else
3921
+static void
3922
+do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs)
3923
+{
3924
+}
3925
+#endif
3926
+
3927
+#else	/*!CL_EXPERIMENTAL*/
3928
+
3704 3929
 #if	defined(FOLLOWURLS) && (FOLLOWURLS > 0)
3705 3930
 static void
3706
-checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html)
3931
+checkURLs(message *m, mbox_ctx *mctx, int* rc, int is_html)
3707 3932
 {
3708 3933
 	blob *b = messageToBlob(m, 0);
3709 3934
 	size_t len;
... ...
@@ -3866,6 +4128,16 @@ checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html)
3866 3866
 	html_tag_arg_free(&hrefs);
3867 3867
 }
3868 3868
 
3869
+#else
3870
+
3871
+static void
3872
+checkURLs(message *m, mbox_ctx *mctx, int* rc, int is_html)
3873
+{
3874
+}
3875
+#endif
3876
+#endif /* ! CL_EXPERIMENTAL */
3877
+
3878
+#if defined(FOLLOWURLS) && (FOLLOWURLS>0)
3869 3879
 /*
3870 3880
  * Includes some Win32 patches by Gianluigi Tiesi <sherpya@netfarm.it>
3871 3881
  *
... ...
@@ -4020,14 +4292,7 @@ getURL(struct arg *arg)
4020 4020
 	return NULL;
4021 4021
 }
4022 4022
 #endif
4023
-
4024
-#else
4025
-static void
4026
-checkURLs(message *m, mbox_ctx* mctx,int *rc,int is_html)
4027
-{
4028
-}
4029 4023
 #endif
4030
-
4031 4024
 #ifdef HAVE_BACKTRACE
4032 4025
 static void
4033 4026
 sigsegv(int sig)
... ...
@@ -4377,7 +4642,7 @@ do_multipart(message *mainMessage, message **messages, int i, int *rc, mbox_ctx
4377 4377
 				} else {
4378 4378
 					if(mctx->ctx->options&CL_SCAN_MAILURL)
4379 4379
 						if(tableFind(mctx->subtypeTable, cptr) == HTML)
4380
-							checkURLs(aMessage, mctx, &rc, 1);
4380
+							checkURLs(aMessage, mctx, rc, 1);
4381 4381
 					messageAddArgument(aMessage,
4382 4382
 						"filename=mixedtextportion");
4383 4383
 				}