git-svn: trunk@2246
aCaB authored on 2006/09/13 07:06:03... | ... |
@@ -1,3 +1,9 @@ |
1 |
+Wed Sep 13 00:03:30 CEST 2006 (acab) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav: Merge of the html normaliser part of the phishing module from |
|
4 |
+ Torok Edvin <edwintorok*gmail.com> |
|
5 |
+ Part of the Google Summer of Code program |
|
6 |
+ |
|
1 | 7 |
Tue Sep 12 22:52:14 CEST 2006 (tk) |
2 | 8 |
---------------------------------- |
3 | 9 |
* clamd: apply w32 patches from NJH |
... | ... |
@@ -47,7 +47,12 @@ |
47 | 47 |
#include "others.h" |
48 | 48 |
#include "htmlnorm.h" |
49 | 49 |
|
50 |
+#ifdef CL_EXPERIMENTAL |
|
51 |
+#include "mbox.h" |
|
52 |
+#endif |
|
53 |
+ |
|
50 | 54 |
#define HTML_STR_LENGTH 1024 |
55 |
+#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH |
|
51 | 56 |
|
52 | 57 |
typedef enum { |
53 | 58 |
HTML_BAD_STATE, |
... | ... |
@@ -307,6 +312,16 @@ static void html_tag_arg_add(tag_arguments_t *tags, |
307 | 307 |
if (!tags->value) { |
308 | 308 |
goto abort; |
309 | 309 |
} |
310 |
+#ifdef CL_EXPERIMENTAL |
|
311 |
+ if(tags->scanContents) { |
|
312 |
+ tags->contents= (blob **) cli_realloc(tags->contents, |
|
313 |
+ tags->count*sizeof(*tags->contents)); |
|
314 |
+ if(!tags->contents) { |
|
315 |
+ goto abort; |
|
316 |
+ } |
|
317 |
+ tags->contents[tags->count-1]=NULL; |
|
318 |
+ } |
|
319 |
+#endif |
|
310 | 320 |
tags->tag[tags->count-1] = strdup(tag); |
311 | 321 |
if (value) { |
312 | 322 |
if (*value == '"') { |
... | ... |
@@ -333,6 +348,12 @@ abort: |
333 | 333 |
if (tags->value) { |
334 | 334 |
free(tags->value[i]); |
335 | 335 |
} |
336 |
+#ifdef CL_EXPERIMENTAL |
|
337 |
+ if(tags->contents) { |
|
338 |
+ if(tags->contents[i]) |
|
339 |
+ blobDestroy(tags->contents[i]); |
|
340 |
+ } |
|
341 |
+#endif |
|
336 | 342 |
} |
337 | 343 |
if (tags->tag) { |
338 | 344 |
free(tags->tag); |
... | ... |
@@ -340,6 +361,11 @@ abort: |
340 | 340 |
if (tags->value) { |
341 | 341 |
free(tags->value); |
342 | 342 |
} |
343 |
+#ifdef CL_EXPERIMENTAL |
|
344 |
+ if (tags->contents) |
|
345 |
+ free(tags->contents); |
|
346 |
+ tags->contents=NULL; |
|
347 |
+#endif |
|
343 | 348 |
tags->tag = tags->value = NULL; |
344 | 349 |
tags->count = 0; |
345 | 350 |
return; |
... | ... |
@@ -375,6 +401,11 @@ void html_tag_arg_free(tag_arguments_t *tags) |
375 | 375 |
if (tags->value[i]) { |
376 | 376 |
free(tags->value[i]); |
377 | 377 |
} |
378 |
+#ifdef CL_EXPERIMENTAL |
|
379 |
+ if(tags->contents) |
|
380 |
+ if (tags->contents[i]) |
|
381 |
+ blobDestroy(tags->contents[i]); |
|
382 |
+#endif |
|
378 | 383 |
} |
379 | 384 |
if (tags->tag) { |
380 | 385 |
free(tags->tag); |
... | ... |
@@ -382,14 +413,59 @@ void html_tag_arg_free(tag_arguments_t *tags) |
382 | 382 |
if (tags->value) { |
383 | 383 |
free(tags->value); |
384 | 384 |
} |
385 |
+#ifdef CL_EXPERIMENTAL |
|
386 |
+ if(tags->contents) |
|
387 |
+ free(tags->contents); |
|
388 |
+ tags->contents = NULL; |
|
389 |
+#endif |
|
385 | 390 |
tags->tag = tags->value = NULL; |
386 | 391 |
tags->count = 0; |
387 | 392 |
} |
388 | 393 |
|
394 |
+#ifdef CL_EXPERIMENTAL |
|
395 |
+/** |
|
396 |
+ * Used for img, iframe and area tags. If such a tag is inside an <a href> tag, the URL of that enclosing link (the real URL) is stored, NUL-terminated, as the tag's contents. |
|
397 |
+ */ |
|
398 |
+static inline void html_tag_set_inahref(tag_arguments_t *tags,int idx,int in_ahref) |
|
399 |
+{ |
|
400 |
+ char x[]=""; |
|
401 |
+ tags->contents[idx-1]=blobCreate(); |
|
402 |
+ blobAddData(tags->contents[idx-1],tags->value[in_ahref-1],strlen(tags->value[in_ahref-1])); |
|
403 |
+ blobAddData(tags->contents[idx-1],x,1); |
|
404 |
+ blobClose(tags->contents[idx-1]); |
|
405 |
+} |
|
406 |
+ |
|
407 |
+/** |
|
408 |
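+ * Append the bytes in [begin, end) to the contents of tag number idx (counted from 1), e.g. the displayed text of an <a href> tag; nothing is added when begin >= end. |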
|
409 |
+ */ |
|
410 |
+static inline void html_tag_contents_append(tag_arguments_t *tags,int idx,const unsigned char* begin,const unsigned char *end) |
|
411 |
+{ |
|
412 |
+ if(begin<end) { |
|
413 |
+ blobAddData(tags->contents[idx-1],begin,end-begin); |
|
414 |
+ } |
|
415 |
+} |
|
416 |
+ |
|
417 |
+ |
|
418 |
+static inline void html_tag_contents_done(tag_arguments_t *tags,int idx) |
|
419 |
+{ |
|
420 |
+ char x[]=""; |
|
421 |
+ blobAddData(tags->contents[idx-1],x,1);/*append NULL character*/ |
|
422 |
+ blobClose(tags->contents[idx-1]); |
|
423 |
+} |
|
424 |
+ |
|
425 |
+static inline void html_tag_contents_length_check(tag_arguments_t *tags,int* idx) |
|
426 |
+{ |
|
427 |
+ if (blobGetDataSize(tags->contents[*idx-1])>MAX_TAG_CONTENTS_LENGTH) { |
|
428 |
+ html_tag_contents_done(tags,*idx); |
|
429 |
+ *idx=0;/*in_ahref=0;*/ |
|
430 |
+ } |
|
431 |
+} |
|
432 |
+#endif |
|
433 |
+ |
|
389 | 434 |
static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs) |
390 | 435 |
{ |
391 | 436 |
int fd_tmp, tag_length, tag_arg_length, binary; |
392 |
- int retval=FALSE, escape, value, hex, tag_val_length, table_pos, in_script=FALSE; |
|
437 |
+ int retval=FALSE, escape, value, hex, tag_val_length=0, table_pos, in_script=FALSE; |
|
393 | 438 |
FILE *stream_in; |
394 | 439 |
html_state state=HTML_NORM, next_state=HTML_BAD_STATE; |
395 | 440 |
char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1]; |
... | ... |
@@ -400,6 +476,13 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
400 | 400 |
unsigned long length; |
401 | 401 |
file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script; |
402 | 402 |
file_buff_t *file_tmp_o1; |
403 |
+#ifdef CL_EXPERIMENTAL |
|
404 |
+ int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/ |
|
405 |
+ unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/ |
|
406 |
+ unsigned char* ptrend=NULL;/*end of <a> contents*/ |
|
407 |
+ unsigned char* in_form_action = NULL;/* the action URL of the current <form> tag, if any*/ |
|
408 |
+ tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/ |
|
409 |
+#endif |
|
403 | 410 |
|
404 | 411 |
if (!m_area) { |
405 | 412 |
if (fd < 0) { |
... | ... |
@@ -421,6 +504,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
421 | 421 |
tag_args.count = 0; |
422 | 422 |
tag_args.tag = NULL; |
423 | 423 |
tag_args.value = NULL; |
424 |
+#ifdef CL_EXPERIMENTAL |
|
425 |
+ tag_args.contents = NULL; |
|
426 |
+#endif |
|
424 | 427 |
|
425 | 428 |
if (dirname) { |
426 | 429 |
snprintf(filename, 1024, "%s/rfc2397", dirname); |
... | ... |
@@ -498,6 +584,10 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
498 | 498 |
|
499 | 499 |
ptr = line = cli_readline(stream_in, m_area, 8192); |
500 | 500 |
while (line) { |
501 |
+#ifdef CL_EXPERIMENTAL |
|
502 |
+ if(href_contents_begin) |
|
503 |
+ href_contents_begin=ptr;/*start of a new line, last line already appended to contents see below*/ |
|
504 |
+#endif |
|
501 | 505 |
while (*ptr && isspace(*ptr)) { |
502 | 506 |
ptr++; |
503 | 507 |
} |
... | ... |
@@ -542,10 +632,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
542 | 542 |
break; |
543 | 543 |
case HTML_NORM: |
544 | 544 |
if (*ptr == '<') { |
545 |
+#ifdef CL_EXPERIMENTAL |
|
546 |
+ ptrend=ptr-1; /* for use by scanContents */ |
|
547 |
+#endif |
|
545 | 548 |
html_output_c(file_buff_o1, file_buff_o2, '<'); |
546 | 549 |
if (in_script) { |
547 | 550 |
html_output_c(file_buff_script, NULL, '<'); |
548 | 551 |
} |
552 |
+#ifdef CL_EXPERIMENTAL |
|
553 |
+ if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) { |
|
554 |
+ /*append this text portion to the contents of <a>*/ |
|
555 |
+ html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr); |
|
556 |
+ html_tag_contents_length_check(hrefs,&in_ahref); |
|
557 |
+ href_contents_begin=NULL;/*We just encountered another tag inside <a>, so skip it*/ |
|
558 |
+ } |
|
559 |
+#endif |
|
549 | 560 |
ptr++; |
550 | 561 |
state = HTML_SKIP_WS; |
551 | 562 |
tag_length=0; |
... | ... |
@@ -815,6 +916,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
815 | 815 |
in_script=FALSE; |
816 | 816 |
html_output_c(file_buff_script, NULL, '\n'); |
817 | 817 |
} |
818 |
+#ifdef CL_EXPERIMENTAL |
|
819 |
+ if (hrefs && hrefs->scanContents && in_ahref) { |
|
820 |
+ if(strcmp(tag,"/a") == 0) { |
|
821 |
+ html_tag_contents_done(hrefs,in_ahref); |
|
822 |
+ in_ahref=0;/* we are no longer inside an <a href> |
|
823 |
+ nesting <a> tags not supported, and shouldn't be supported*/ |
|
824 |
+ } |
|
825 |
+ href_contents_begin=ptr; |
|
826 |
+ } |
|
827 |
+ if (strcmp(tag, "/form") == 0) { |
|
828 |
+ if (in_form_action) |
|
829 |
+ free(in_form_action); |
|
830 |
+ in_form_action = NULL; |
|
831 |
+ } |
|
832 |
+#endif |
|
818 | 833 |
} else if (strcmp(tag, "script") == 0) { |
819 | 834 |
arg_value = html_tag_arg_value(&tag_args, "language"); |
820 | 835 |
if (arg_value && (strcasecmp(arg_value, "jscript.encode") == 0)) { |
... | ... |
@@ -830,26 +946,130 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
830 | 830 |
} |
831 | 831 |
html_output_tag(file_buff_script, tag, &tag_args); |
832 | 832 |
} else if (hrefs) { |
833 |
+#ifdef CL_EXPERIMENTAL |
|
834 |
+ if(in_ahref && !href_contents_begin) |
|
835 |
+ href_contents_begin=ptr; |
|
836 |
+#endif |
|
833 | 837 |
if (strcmp(tag, "a") == 0) { |
834 | 838 |
arg_value = html_tag_arg_value(&tag_args, "href"); |
835 | 839 |
if (arg_value && strlen(arg_value) > 0) { |
840 |
+#ifdef CL_EXPERIMENTAL |
|
841 |
+ if (hrefs->scanContents) { |
|
842 |
+ const unsigned char* arg_value_title = html_tag_arg_value(&tag_args,"title"); |
|
843 |
+ /*beginning of an <a> tag*/ |
|
844 |
+ if (in_ahref) |
|
845 |
+ /*we encountered nested <a> tags, pretend previous closed*/ |
|
846 |
+ if (href_contents_begin) { |
|
847 |
+ html_tag_contents_append(hrefs,in_ahref, |
|
848 |
+ href_contents_begin,ptrend); |
|
849 |
+ /*add pending contents between tags*/ |
|
850 |
+ html_tag_contents_done(hrefs,in_ahref); |
|
851 |
+ in_ahref=0; |
|
852 |
+ } |
|
853 |
+ if (arg_value_title) { |
|
854 |
+ /* title is a 'displayed link'*/ |
|
855 |
+ html_tag_arg_add(hrefs,"href_title",arg_value_title); |
|
856 |
+ hrefs->contents[hrefs->count-1]=blobCreate(); |
|
857 |
+ html_tag_contents_append(hrefs,hrefs->count,arg_value, |
|
858 |
+ arg_value+strlen(arg_value)); |
|
859 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
860 |
+ } |
|
861 |
+ if (in_form_action) { |
|
862 |
+ /* the form action is the real URL, and the href is the 'displayed' one */ |
|
863 |
+ html_tag_arg_add(hrefs,"form",arg_value); |
|
864 |
+ hrefs->contents[hrefs->count-1] = blobCreate(); |
|
865 |
+ html_tag_contents_append(hrefs, hrefs->count, in_form_action, |
|
866 |
+ in_form_action + strlen(in_form_action)); |
|
867 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
868 |
+ } |
|
869 |
+ } |
|
870 |
+#endif |
|
836 | 871 |
html_tag_arg_add(hrefs, "href", arg_value); |
872 |
+#ifdef CL_EXPERIMENTAL |
|
873 |
+ if (hrefs->scanContents) { |
|
874 |
+ in_ahref=hrefs->count; /* index of this tag (counted from 1) */ |
|
875 |
+ href_contents_begin=ptr;/* contents begin after <a ..> ends */ |
|
876 |
+ hrefs->contents[hrefs->count-1]=blobCreate(); |
|
877 |
+ } |
|
878 |
+#endif |
|
837 | 879 |
} |
880 |
+#ifdef CL_EXPERIMENTAL |
|
881 |
+ } else if (strcmp(tag,"form") == 0 && hrefs->scanContents) { |
|
882 |
+ const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action"); |
|
883 |
+ if (arg_action_value) |
|
884 |
+ in_form_action = strdup(arg_action_value); |
|
885 |
+#endif |
|
838 | 886 |
} else if (strcmp(tag, "img") == 0) { |
839 | 887 |
arg_value = html_tag_arg_value(&tag_args, "src"); |
840 | 888 |
if (arg_value && strlen(arg_value) > 0) { |
841 | 889 |
html_tag_arg_add(hrefs, "src", arg_value); |
890 |
+#ifdef CL_EXPERIMENTAL |
|
891 |
+ if(hrefs->scanContents && in_ahref) |
|
892 |
+ /* the "contents" of an img tag is the URL of its parent <a> tag */ |
|
893 |
+ html_tag_set_inahref(hrefs,hrefs->count,in_ahref); |
|
894 |
+ if (in_form_action) { |
|
895 |
+ /* the form action is the real URL; this img's src is the 'displayed' one */ |
|
896 |
+ html_tag_arg_add(hrefs,"form",arg_value); |
|
897 |
+ hrefs->contents[hrefs->count-1] = blobCreate(); |
|
898 |
+ html_tag_contents_append(hrefs, hrefs->count, in_form_action, |
|
899 |
+ in_form_action + strlen(in_form_action)); |
|
900 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
901 |
+ } |
|
902 |
+#endif |
|
842 | 903 |
} |
843 | 904 |
arg_value = html_tag_arg_value(&tag_args, "dynsrc"); |
844 | 905 |
if (arg_value && strlen(arg_value) > 0) { |
845 | 906 |
html_tag_arg_add(hrefs, "dynsrc", arg_value); |
907 |
+#ifdef CL_EXPERIMENTAL |
|
908 |
+ if(hrefs->scanContents && in_ahref) |
|
909 |
+ /* see above */ |
|
910 |
+ html_tag_set_inahref(hrefs,hrefs->count,in_ahref); |
|
911 |
+ if (in_form_action) { |
|
912 |
+ /* the form action is the real URL; this img's dynsrc is the 'displayed' one */ |
|
913 |
+ html_tag_arg_add(hrefs,"form",arg_value); |
|
914 |
+ hrefs->contents[hrefs->count-1] = blobCreate(); |
|
915 |
+ html_tag_contents_append(hrefs, hrefs->count, in_form_action, |
|
916 |
+ in_form_action + strlen(in_form_action)); |
|
917 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
918 |
+ } |
|
919 |
+#endif |
|
846 | 920 |
} |
847 | 921 |
} else if (strcmp(tag, "iframe") == 0) { |
848 | 922 |
arg_value = html_tag_arg_value(&tag_args, "src"); |
849 | 923 |
if (arg_value && strlen(arg_value) > 0) { |
850 | 924 |
html_tag_arg_add(hrefs, "iframe", arg_value); |
925 |
+#ifdef CL_EXPERIMENTAL |
|
926 |
+ if(hrefs->scanContents && in_ahref) |
|
927 |
+ /* see above */ |
|
928 |
+ html_tag_set_inahref(hrefs,hrefs->count,in_ahref); |
|
929 |
+ if (in_form_action) { |
|
930 |
+ /* the form action is the real URL; this iframe's src is the 'displayed' one */ |
|
931 |
+ html_tag_arg_add(hrefs,"form",arg_value); |
|
932 |
+ hrefs->contents[hrefs->count-1] = blobCreate(); |
|
933 |
+ html_tag_contents_append(hrefs, hrefs->count, in_form_action, |
|
934 |
+ in_form_action + strlen(in_form_action)); |
|
935 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
936 |
+ } |
|
851 | 937 |
} |
852 |
- } |
|
938 |
+ } else if (strcmp(tag,"area") == 0) { |
|
939 |
+ arg_value = html_tag_arg_value(&tag_args,"href"); |
|
940 |
+ if (arg_value && strlen(arg_value) > 0) { |
|
941 |
+ html_tag_arg_add(hrefs, "area", arg_value); |
|
942 |
+ if(hrefs->scanContents && in_ahref) |
|
943 |
+ /* see above */ |
|
944 |
+ html_tag_set_inahref(hrefs,hrefs->count,in_ahref); |
|
945 |
+ if (in_form_action) { |
|
946 |
+ /* the form action is the real URL; this area's href is the 'displayed' one */ |
|
947 |
+ html_tag_arg_add(hrefs,"form",arg_value); |
|
948 |
+ hrefs->contents[hrefs->count-1] = blobCreate(); |
|
949 |
+ html_tag_contents_append(hrefs, hrefs->count, in_form_action, |
|
950 |
+ in_form_action + strlen(in_form_action)); |
|
951 |
+ html_tag_contents_done(hrefs,hrefs->count); |
|
952 |
+ } |
|
953 |
+#endif |
|
954 |
+ } |
|
955 |
+ } |
|
956 |
+ /* TODO: image maps can have URLs too */ |
|
853 | 957 |
} |
854 | 958 |
html_tag_arg_free(&tag_args); |
855 | 959 |
break; |
... | ... |
@@ -871,6 +1091,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
871 | 871 |
ptr++; |
872 | 872 |
} else if (*ptr == ';') { |
873 | 873 |
html_output_c(file_buff_o1, file_buff_o2, value); |
874 |
+#ifdef CL_EXPERIMENTAL |
|
875 |
+ if (tag_val_length < HTML_STR_LENGTH) { |
|
876 |
+ tag_val[tag_val_length++] = value; /* store encoded values too */ |
|
877 |
+ } |
|
878 |
+#endif |
|
874 | 879 |
state = next_state; |
875 | 880 |
next_state = HTML_BAD_STATE; |
876 | 881 |
ptr++; |
... | ... |
@@ -1155,12 +1380,24 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1155 | 1155 |
break; |
1156 | 1156 |
} |
1157 | 1157 |
} |
1158 |
+#ifdef CL_EXPERIMENTAL |
|
1159 |
+ if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) |
|
1160 |
+ /* end of line, append contents now, resume on next line */ |
|
1161 |
+ html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr-1); |
|
1162 |
+ ptrend = NULL; |
|
1163 |
+#endif |
|
1158 | 1164 |
free(line); |
1159 | 1165 |
ptr = line = cli_readline(stream_in, m_area, 8192); |
1160 | 1166 |
} |
1161 | 1167 |
|
1162 | 1168 |
retval = TRUE; |
1163 | 1169 |
abort: |
1170 |
+#ifdef CL_EXPERIMENTAL |
|
1171 |
+ if (in_form_action) |
|
1172 |
+ free(in_form_action); |
|
1173 |
+ if (in_ahref) /* tag not closed, force closing */ |
|
1174 |
+ html_tag_contents_done(hrefs,in_ahref); |
|
1175 |
+#endif |
|
1164 | 1176 |
html_tag_arg_free(&tag_args); |
1165 | 1177 |
if (!m_area) { |
1166 | 1178 |
fclose(stream_in); |