Browse code

extract URLs from mail body (bb #1482).

git-svn: trunk@5014

Török Edvin authored on 2009/04/03 05:36:22
Showing 5 changed files
... ...
@@ -1,3 +1,8 @@
1
+Thu Apr  2 22:59:30 EEST 2009 (edwin)
2
+-------------------------------------
3
+ * libclamav/htmlnorm.c, libclamav/htmlnorm.h, libclamav/mbox.c,
4
+ libclamav/phishcheck.c: extract URLs from mail body (bb #1482).
5
+
1 6
 Thu Apr  2 19:30:19 CEST 2009 (tk)
2 7
 ----------------------------------
3 8
  * libclamav/cab.c: fix compiler warnings (bb#1494)
... ...
@@ -346,7 +346,7 @@ static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char
346 346
 	}
347 347
 	return;
348 348
 }
349
-static void html_tag_arg_add(tag_arguments_t *tags,
349
+void html_tag_arg_add(tag_arguments_t *tags,
350 350
 		const char *tag, char *value)
351 351
 {
352 352
 	int len, i;
... ...
@@ -40,6 +40,7 @@ int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirnam
40 40
 int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
41 41
 void html_tag_arg_free(tag_arguments_t *tags);
42 42
 int html_screnc_decode(int fd, const char *dirname);
43
+void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);
43 44
  
44 45
 #endif
45 46
 
... ...
@@ -3962,6 +3962,36 @@ hrefs_done(blob *b, tag_arguments_t *hrefs)
3962 3962
 	html_tag_arg_free(hrefs);
3963 3963
 }
3964 3964
 
3965
+/* extract URLs from static text */
3966
+static void extract_text_urls(const unsigned char *mem, size_t len, tag_arguments_t *hrefs)
3967
+{
3968
+    char url[1024];
3969
+    size_t off;
3970
+    for (off=0;off + 10 < len;off++) {
3971
+	/* check whether this is the start of a URL */
3972
+	int32_t proto = cli_readint32(mem + off);
3973
+	/* convert to lowercase */
3974
+	proto |= 0x20202020;
3975
+	/* 'http:', 'https:', or 'ftp:' in little-endian */
3976
+	if ((proto == 0x70747468 &&
3977
+	     (mem[off+4] == ':' || (mem[off+5] == 's' && mem[off+6] == ':')))
3978
+	    || proto == 0x3a707466) {
3979
+	    size_t url_len;
3980
+	    for (url_len=4; off + url_len < len && url_len < (sizeof(url)-1); url_len++) {
3981
+		unsigned char c = mem[off + url_len];
3982
+		/* smart compilers will compile this if into
3983
+		 * a single bt + jb instruction */
3984
+		if (c == ' ' || c == '\n' || c == '\t')
3985
+		    break;
3986
+	    }
3987
+	    memcpy(url, mem + off, url_len);
3988
+	    url[url_len] = '\0';
3989
+	    html_tag_arg_add(hrefs, "href", url);
3990
+	    off += url_len;
3991
+	}
3992
+    }
3993
+}
3994
+
3965 3995
 /*
3966 3996
  * This used to be part of checkURLs, split out, because phishingScan needs it
3967 3997
  * too, and phishingScan might be used in situations where checkURLs is
... ...
@@ -3970,6 +4000,7 @@ hrefs_done(blob *b, tag_arguments_t *hrefs)
3970 3970
 static blob *
3971 3971
 getHrefs(message *m, tag_arguments_t *hrefs)
3972 3972
 {
3973
+	unsigned char *mem;
3973 3974
 	blob *b = messageToBlob(m, 0);
3974 3975
 	size_t len;
3975 3976
 
... ...
@@ -3995,11 +4026,15 @@ getHrefs(message *m, tag_arguments_t *hrefs)
3995 3995
 	hrefs->contents = NULL;
3996 3996
 
3997 3997
 	cli_dbgmsg("getHrefs: calling html_normalise_mem\n");
3998
-	if(!html_normalise_mem(blobGetData(b), (off_t)len, NULL, hrefs,m->ctx->dconf)) {
3998
+	mem = blobGetData(b);
3999
+	if(!html_normalise_mem(mem, (off_t)len, NULL, hrefs,m->ctx->dconf)) {
3999 4000
 		blobDestroy(b);
4000 4001
 		return NULL;
4001 4002
 	}
4002 4003
 	cli_dbgmsg("getHrefs: html_normalise_mem returned\n");
4004
+	if (!hrefs->count && hrefs->scanContents) {
4005
+	    extract_text_urls(mem, len, hrefs);
4006
+	}
4003 4007
 
4004 4008
 	/* TODO: Do we need to call remove_html_comments? */
4005 4009
 	return b;
... ...
@@ -146,9 +146,9 @@ static const char src_text[] = "src";
146 146
 static const char href_text[] = "href";
147 147
 static const char mailto[] = "mailto:";
148 148
 static const char mailto_proto[] = "mailto://";
149
-static const char https[]="https://";
150
-static const char http[]="http://";
151
-static const char ftp[] = "ftp://";
149
+static const char https[]="https:";
150
+static const char http[]="http:";
151
+static const char ftp[] = "ftp:";
152 152
 
153 153
 static const size_t href_text_len = sizeof(href_text);
154 154
 static const size_t src_text_len = sizeof(src_text);
... ...
@@ -774,8 +774,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs)
774 774
 	fclose(f);
775 775
 	return 0;
776 776
 #endif
777
-	for(i=0;i<hrefs->count;i++)
778
-		if(hrefs->contents[i]) {
777
+	for(i=0;i<hrefs->count;i++) {
779 778
 			struct url_check urls;
780 779
 			enum phish_status rc;
781 780
 			urls.flags	 = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS;
... ...
@@ -841,10 +840,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs)
841 841
 					break;
842 842
 			}
843 843
 			return cli_found_possibly_unwanted(ctx);
844
-		}
845
-		else
846
-			if(strcmp((char*)hrefs->tag[i],"href"))
847
-					cli_dbgmsg("Phishcheck: href with no contents?\n");
844
+	}
848 845
 	return CL_CLEAN;
849 846
 }
850 847
 
... ...
@@ -1015,33 +1011,34 @@ static int isURL(char* URL, int accept_anyproto)
1015 1015
 	switch (URL[0]) {
1016 1016
 		case 'h':
1017 1017
 			if (strncmp(URL, https, https_len) == 0)
1018
-				start = URL + https_len;
1018
+				start = URL + https_len - 1;
1019 1019
 			else if (strncmp(URL, http, http_len) == 0)
1020
-				start = URL + http_len;
1020
+				start = URL + http_len - 1;
1021 1021
 			break;
1022 1022
 		case 'f':
1023 1023
 		       if (strncmp(URL, ftp, ftp_len) == 0)
1024
-			       start = URL + ftp_len;
1024
+			       start = URL + ftp_len - 1;
1025 1025
 		       break;
1026 1026
 		case 'm':
1027 1027
 		       if (strncmp(URL, mailto_proto, mailto_proto_len) == 0)
1028
-			       start = URL + mailto_proto_len;
1028
+			       start = URL + mailto_proto_len - 1;
1029 1029
 		       break;
1030 1030
 	}
1031
-	if(start) {
1032
-		if(start[0] == '\0')
1033
-			return 0;/* empty URL */
1031
+	if(start && start[1] == '/' && start[2] == '/') {
1034 1032
 		/* has a valid protocol, it is a URL */
1035 1033
 		return 1;
1036 1034
 	}
1037
-	start = accept_anyproto ?  strchr(URL, ':') : NULL;
1035
+	start = accept_anyproto ?  strchr(URL, ':') : start;
1038 1036
 	if(start) {
1039 1037
 		/* validate URI scheme */
1040 1038
 		if(validate_uri_ialpha(URL, start)) {
1041
-			if(start[1] == '/' && start[2] == '/')
1042
-				start += 3; /* skip :// */
1043
-			else
1039
+			/* skip :// */
1040
+			if (start[1] == '/') {
1041
+			    start += 2;
1042
+			    if (*start == '/')
1044 1043
 				start++;
1044
+			} else
1045
+			    start++;
1045 1046
 		}
1046 1047
 		else
1047 1048
 			start = URL; /* scheme invalid */
... ...
@@ -1298,7 +1295,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1298 1298
 	/* determine end of hostname */
1299 1299
 	host_len = strcspn(host_begin, ":/?");
1300 1300
 	path_begin = host_begin + host_len;
1301
-	if(host_len < len) {
1301
+	if(host_len <= len) {
1302 1302
 		/* url without path, use a single / */
1303 1303
 		memmove(path_begin + 2, path_begin + 1, len - host_len);
1304 1304
 		*path_begin++ = '/';
... ...
@@ -1419,7 +1416,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1419 1419
 	int phishy=0, blacklisted=0;
1420 1420
 	const struct phishcheck* pchk = (const struct phishcheck*) engine->phishcheck;
1421 1421
 
1422
-	if(!urls->realLink.data || urls->displayLink.data[0]=='\0')
1422
+	if(!urls->realLink.data)
1423 1423
 		return CL_PHISH_CLEAN;
1424 1424
 
1425 1425
 	cli_dbgmsg("Phishcheck:Checking url %s->%s\n", urls->realLink.data,
... ...
@@ -1466,6 +1463,10 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1466 1466
 	if (blacklisted)
1467 1467
 	    return blacklisted;
1468 1468
 
1469
+	if (urls->displayLink.data[0] == '\0') {
1470
+	    return CL_PHISH_CLEAN;
1471
+	}
1472
+
1469 1473
 	url_check_init(&host_url);
1470 1474
 
1471 1475
 	if((rc = url_get_host(urls, &host_url, DOMAIN_DISPLAY, &phishy))) {