git-svn: trunk@5014
Török Edvin authored on 2009/04/03 05:36:22... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Thu Apr 2 22:59:30 EEST 2009 (edwin) |
|
2 |
+------------------------------------- |
|
3 |
+ * libclamav/htmlnorm.c, libclamav/htmlnorm.h, libclamav/mbox.c, |
|
4 |
+ libclamav/phishcheck.c: extract URLs from mail body (bb #1482). |
|
5 |
+ |
|
1 | 6 |
Thu Apr 2 19:30:19 CEST 2009 (tk) |
2 | 7 |
---------------------------------- |
3 | 8 |
* libclamav/cab.c: fix compiler warnings (bb#1494) |
... | ... |
@@ -346,7 +346,7 @@ static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char |
346 | 346 |
} |
347 | 347 |
return; |
348 | 348 |
} |
349 |
-static void html_tag_arg_add(tag_arguments_t *tags, |
|
349 |
+void html_tag_arg_add(tag_arguments_t *tags, |
|
350 | 350 |
const char *tag, char *value) |
351 | 351 |
{ |
352 | 352 |
int len, i; |
... | ... |
@@ -40,6 +40,7 @@ int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirnam |
40 | 40 |
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf); |
41 | 41 |
void html_tag_arg_free(tag_arguments_t *tags); |
42 | 42 |
int html_screnc_decode(int fd, const char *dirname); |
43 |
+void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value); |
|
43 | 44 |
|
44 | 45 |
#endif |
45 | 46 |
|
... | ... |
@@ -3962,6 +3962,36 @@ hrefs_done(blob *b, tag_arguments_t *hrefs) |
3962 | 3962 |
html_tag_arg_free(hrefs); |
3963 | 3963 |
} |
3964 | 3964 |
|
3965 |
+/* extract URLs from static text */ |
|
3966 |
+static void extract_text_urls(const unsigned char *mem, size_t len, tag_arguments_t *hrefs) |
|
3967 |
+{ |
|
3968 |
+ char url[1024]; |
|
3969 |
+ size_t off; |
|
3970 |
+ for (off=0;off + 10 < len;off++) { |
|
3971 |
+ /* check whether this is the start of a URL */ |
|
3972 |
+ int32_t proto = cli_readint32(mem + off); |
|
3973 |
+ /* convert to lowercase */ |
|
3974 |
+ proto |= 0x20202020; |
|
3975 |
+ /* 'http:', 'https:', or 'ftp:' in little-endian */ |
|
3976 |
+ if ((proto == 0x70747468 && |
|
3977 |
+ (mem[off+4] == ':' || (mem[off+5] == 's' && mem[off+6] == ':'))) |
|
3978 |
+ || proto == 0x3a707466) { |
|
3979 |
+ size_t url_len; |
|
3980 |
+ for (url_len=4; off + url_len < len && url_len < (sizeof(url)-1); url_len++) { |
|
3981 |
+ unsigned char c = mem[off + url_len]; |
|
3982 |
+ /* smart compilers will compile this if into |
|
3983 |
+ * a single bt + jb instruction */ |
|
3984 |
+ if (c == ' ' || c == '\n' || c == '\t') |
|
3985 |
+ break; |
|
3986 |
+ } |
|
3987 |
+ memcpy(url, mem + off, url_len); |
|
3988 |
+ url[url_len] = '\0'; |
|
3989 |
+ html_tag_arg_add(hrefs, "href", url); |
|
3990 |
+ off += url_len; |
|
3991 |
+ } |
|
3992 |
+ } |
|
3993 |
+} |
|
3994 |
+ |
|
3965 | 3995 |
/* |
3966 | 3996 |
* This used to be part of checkURLs, split out, because phishingScan needs it |
3967 | 3997 |
* too, and phishingScan might be used in situations where checkURLs is |
... | ... |
@@ -3970,6 +4000,7 @@ hrefs_done(blob *b, tag_arguments_t *hrefs) |
3970 | 3970 |
static blob * |
3971 | 3971 |
getHrefs(message *m, tag_arguments_t *hrefs) |
3972 | 3972 |
{ |
3973 |
+ unsigned char *mem; |
|
3973 | 3974 |
blob *b = messageToBlob(m, 0); |
3974 | 3975 |
size_t len; |
3975 | 3976 |
|
... | ... |
@@ -3995,11 +4026,15 @@ getHrefs(message *m, tag_arguments_t *hrefs) |
3995 | 3995 |
hrefs->contents = NULL; |
3996 | 3996 |
|
3997 | 3997 |
cli_dbgmsg("getHrefs: calling html_normalise_mem\n"); |
3998 |
- if(!html_normalise_mem(blobGetData(b), (off_t)len, NULL, hrefs,m->ctx->dconf)) { |
|
3998 |
+ mem = blobGetData(b); |
|
3999 |
+ if(!html_normalise_mem(mem, (off_t)len, NULL, hrefs,m->ctx->dconf)) { |
|
3999 | 4000 |
blobDestroy(b); |
4000 | 4001 |
return NULL; |
4001 | 4002 |
} |
4002 | 4003 |
cli_dbgmsg("getHrefs: html_normalise_mem returned\n"); |
4004 |
+ if (!hrefs->count && hrefs->scanContents) { |
|
4005 |
+ extract_text_urls(mem, len, hrefs); |
|
4006 |
+ } |
|
4003 | 4007 |
|
4004 | 4008 |
/* TODO: Do we need to call remove_html_comments? */ |
4005 | 4009 |
return b; |
... | ... |
@@ -146,9 +146,9 @@ static const char src_text[] = "src"; |
146 | 146 |
static const char href_text[] = "href"; |
147 | 147 |
static const char mailto[] = "mailto:"; |
148 | 148 |
static const char mailto_proto[] = "mailto://"; |
149 |
-static const char https[]="https://"; |
|
150 |
-static const char http[]="http://"; |
|
151 |
-static const char ftp[] = "ftp://"; |
|
149 |
+static const char https[]="https:"; |
|
150 |
+static const char http[]="http:"; |
|
151 |
+static const char ftp[] = "ftp:"; |
|
152 | 152 |
|
153 | 153 |
static const size_t href_text_len = sizeof(href_text); |
154 | 154 |
static const size_t src_text_len = sizeof(src_text); |
... | ... |
@@ -774,8 +774,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs) |
774 | 774 |
fclose(f); |
775 | 775 |
return 0; |
776 | 776 |
#endif |
777 |
- for(i=0;i<hrefs->count;i++) |
|
778 |
- if(hrefs->contents[i]) { |
|
777 |
+ for(i=0;i<hrefs->count;i++) { |
|
779 | 778 |
struct url_check urls; |
780 | 779 |
enum phish_status rc; |
781 | 780 |
urls.flags = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS; |
... | ... |
@@ -841,10 +840,7 @@ int phishingScan(cli_ctx* ctx,tag_arguments_t* hrefs) |
841 | 841 |
break; |
842 | 842 |
} |
843 | 843 |
return cli_found_possibly_unwanted(ctx); |
844 |
- } |
|
845 |
- else |
|
846 |
- if(strcmp((char*)hrefs->tag[i],"href")) |
|
847 |
- cli_dbgmsg("Phishcheck: href with no contents?\n"); |
|
844 |
+ } |
|
848 | 845 |
return CL_CLEAN; |
849 | 846 |
} |
850 | 847 |
|
... | ... |
@@ -1015,33 +1011,34 @@ static int isURL(char* URL, int accept_anyproto) |
1015 | 1015 |
switch (URL[0]) { |
1016 | 1016 |
case 'h': |
1017 | 1017 |
if (strncmp(URL, https, https_len) == 0) |
1018 |
- start = URL + https_len; |
|
1018 |
+ start = URL + https_len - 1; |
|
1019 | 1019 |
else if (strncmp(URL, http, http_len) == 0) |
1020 |
- start = URL + http_len; |
|
1020 |
+ start = URL + http_len - 1; |
|
1021 | 1021 |
break; |
1022 | 1022 |
case 'f': |
1023 | 1023 |
if (strncmp(URL, ftp, ftp_len) == 0) |
1024 |
- start = URL + ftp_len; |
|
1024 |
+ start = URL + ftp_len - 1; |
|
1025 | 1025 |
break; |
1026 | 1026 |
case 'm': |
1027 | 1027 |
if (strncmp(URL, mailto_proto, mailto_proto_len) == 0) |
1028 |
- start = URL + mailto_proto_len; |
|
1028 |
+ start = URL + mailto_proto_len - 1; |
|
1029 | 1029 |
break; |
1030 | 1030 |
} |
1031 |
- if(start) { |
|
1032 |
- if(start[0] == '\0') |
|
1033 |
- return 0;/* empty URL */ |
|
1031 |
+ if(start && start[1] == '/' && start[2] == '/') { |
|
1034 | 1032 |
/* has a valid protocol, it is a URL */ |
1035 | 1033 |
return 1; |
1036 | 1034 |
} |
1037 |
- start = accept_anyproto ? strchr(URL, ':') : NULL; |
|
1035 |
+ start = accept_anyproto ? strchr(URL, ':') : start; |
|
1038 | 1036 |
if(start) { |
1039 | 1037 |
/* validate URI scheme */ |
1040 | 1038 |
if(validate_uri_ialpha(URL, start)) { |
1041 |
- if(start[1] == '/' && start[2] == '/') |
|
1042 |
- start += 3; /* skip :// */ |
|
1043 |
- else |
|
1039 |
+ /* skip :// */ |
|
1040 |
+ if (start[1] == '/') { |
|
1041 |
+ start += 2; |
|
1042 |
+ if (*start == '/') |
|
1044 | 1043 |
start++; |
1044 |
+ } else |
|
1045 |
+ start++; |
|
1045 | 1046 |
} |
1046 | 1047 |
else |
1047 | 1048 |
start = URL; /* scheme invalid */ |
... | ... |
@@ -1298,7 +1295,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len, |
1298 | 1298 |
/* determine end of hostname */ |
1299 | 1299 |
host_len = strcspn(host_begin, ":/?"); |
1300 | 1300 |
path_begin = host_begin + host_len; |
1301 |
- if(host_len < len) { |
|
1301 |
+ if(host_len <= len) { |
|
1302 | 1302 |
/* url without path, use a single / */ |
1303 | 1303 |
memmove(path_begin + 2, path_begin + 1, len - host_len); |
1304 | 1304 |
*path_begin++ = '/'; |
... | ... |
@@ -1419,7 +1416,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url |
1419 | 1419 |
int phishy=0, blacklisted=0; |
1420 | 1420 |
const struct phishcheck* pchk = (const struct phishcheck*) engine->phishcheck; |
1421 | 1421 |
|
1422 |
- if(!urls->realLink.data || urls->displayLink.data[0]=='\0') |
|
1422 |
+ if(!urls->realLink.data) |
|
1423 | 1423 |
return CL_PHISH_CLEAN; |
1424 | 1424 |
|
1425 | 1425 |
cli_dbgmsg("Phishcheck:Checking url %s->%s\n", urls->realLink.data, |
... | ... |
@@ -1466,6 +1463,10 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url |
1466 | 1466 |
if (blacklisted) |
1467 | 1467 |
return blacklisted; |
1468 | 1468 |
|
1469 |
+ if (urls->displayLink.data[0] == '\0') { |
|
1470 |
+ return CL_PHISH_CLEAN; |
|
1471 |
+ } |
|
1472 |
+ |
|
1469 | 1473 |
url_check_init(&host_url); |
1470 | 1474 |
|
1471 | 1475 |
if((rc = url_get_host(urls, &host_url, DOMAIN_DISPLAY, &phishy))) { |