Browse code

Phishing merge - htmlnorm mbox.c pending...

git-svn: trunk@2246

aCaB authored on 2006/09/13 07:06:03
Showing 2 changed files
... ...
@@ -1,3 +1,9 @@
1
+Wed Sep 13 00:03:30 CEST 2006 (acab)
2
+------------------------------------
3
+  * libclamav: Merge of the html normaliser part of the phishing module from
4
+               Torok Edvin <edwintorok*gmail.com>
5
+               Part of the Google Summer of Code program
6
+
1 7
 Tue Sep 12 22:52:14 CEST 2006 (tk)
2 8
 ----------------------------------
3 9
   * clamd: apply w32 patches from NJH
... ...
@@ -47,7 +47,12 @@
47 47
 #include "others.h"
48 48
 #include "htmlnorm.h"
49 49
 
50
+#ifdef CL_EXPERIMENTAL
51
+#include "mbox.h"
52
+#endif
53
+
50 54
 #define HTML_STR_LENGTH 1024
55
+#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
51 56
 
52 57
 typedef enum {
53 58
     HTML_BAD_STATE,
... ...
@@ -307,6 +312,16 @@ static void html_tag_arg_add(tag_arguments_t *tags,
307 307
 	if (!tags->value) {
308 308
 		goto abort;
309 309
 	}
310
+#ifdef CL_EXPERIMENTAL
311
+	if(tags->scanContents) {
312
+		tags->contents= (blob **) cli_realloc(tags->contents,
313
+				tags->count*sizeof(*tags->contents));
314
+		if(!tags->contents) {
315
+			goto abort;
316
+		}
317
+		tags->contents[tags->count-1]=NULL;
318
+	}
319
+#endif
310 320
 	tags->tag[tags->count-1] = strdup(tag);
311 321
 	if (value) {
312 322
 		if (*value == '"') {
... ...
@@ -333,6 +348,12 @@ abort:
333 333
 		if (tags->value) {
334 334
 			free(tags->value[i]);
335 335
 		}
336
+#ifdef CL_EXPERIMENTAL
337
+		if(tags->contents) {
338
+			if(tags->contents[i])
339
+				blobDestroy(tags->contents[i]);
340
+		}
341
+#endif
336 342
 	}
337 343
 	if (tags->tag) {
338 344
 		free(tags->tag);
... ...
@@ -340,6 +361,11 @@ abort:
340 340
 	if (tags->value) {
341 341
 		free(tags->value);
342 342
 	}
343
+#ifdef CL_EXPERIMENTAL
344
+	if (tags->contents)
345
+		free(tags->contents);
346
+	tags->contents=NULL;
347
+#endif
343 348
 	tags->tag = tags->value = NULL;
344 349
 	tags->count = 0;	
345 350
 	return;
... ...
@@ -375,6 +401,11 @@ void html_tag_arg_free(tag_arguments_t *tags)
375 375
 		if (tags->value[i]) {
376 376
 			free(tags->value[i]);
377 377
 		}
378
+#ifdef CL_EXPERIMENTAL
379
+		if(tags->contents)
380
+			if (tags->contents[i])
381
+				blobDestroy(tags->contents[i]);
382
+#endif
378 383
 	}
379 384
 	if (tags->tag) {
380 385
 		free(tags->tag);
... ...
@@ -382,14 +413,59 @@ void html_tag_arg_free(tag_arguments_t *tags)
382 382
 	if (tags->value) {
383 383
 		free(tags->value);
384 384
 	}
385
+#ifdef CL_EXPERIMENTAL
386
+	if(tags->contents)
387
+		free(tags->contents);
388
+	tags->contents = NULL;
389
+#endif
385 390
 	tags->tag = tags->value = NULL;
386 391
 	tags->count = 0;
387 392
 }
388 393
 
394
+#ifdef CL_EXPERIMENTAL
395
+/**
396
+ * Used for img, iframe and area entries that occur inside an <a href> tag: creates the contents blob of entry idx and fills it with the enclosing link's URL (tags->value[in_ahref-1]) followed by a NUL byte, then closes the blob. idx and in_ahref are 1-based; the blobCreate() result is used unchecked.
397
+ */
398
+static inline void html_tag_set_inahref(tag_arguments_t *tags,int idx,int in_ahref)
399
+{
400
+	char x[]="";
401
+	tags->contents[idx-1]=blobCreate();
402
+	blobAddData(tags->contents[idx-1],tags->value[in_ahref-1],strlen(tags->value[in_ahref-1]));
403
+	blobAddData(tags->contents[idx-1],x,1);
404
+	blobClose(tags->contents[idx-1]);
405
+}
406
+
407
+/**
408
+ * Appends the bytes in [begin, end) to the contents blob of entry idx (1-based), e.g. the displayed text of an <a href> tag; a no-op when begin >= end.
409
+ */
410
+static inline void html_tag_contents_append(tag_arguments_t *tags,int idx,const unsigned char* begin,const unsigned char *end)
411
+{
412
+	if(begin<end) {
413
+		blobAddData(tags->contents[idx-1],begin,end-begin);
414
+	}
415
+}
416
+
417
+
418
+static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
419
+{
420
+	char x[]="";
421
+	blobAddData(tags->contents[idx-1],x,1);/* append a terminating NUL byte (the single byte of the empty string x) to entry idx (1-based) before closing its blob */
422
+	blobClose(tags->contents[idx-1]);
423
+}
424
+
425
+static inline void html_tag_contents_length_check(tag_arguments_t *tags,int* idx)
426
+{
427
+	if (blobGetDataSize(tags->contents[*idx-1])>MAX_TAG_CONTENTS_LENGTH) {
428
+		html_tag_contents_done(tags,*idx);
429
+		*idx=0;/* contents exceeded MAX_TAG_CONTENTS_LENGTH: the blob was finalised above; zero the caller's index (in_ahref at the call site, where 0 means outside of <a>) */
430
+	}
431
+}
432
+#endif
433
+
389 434
 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs)
390 435
 {
391 436
 	int fd_tmp, tag_length, tag_arg_length, binary;
392
-	int retval=FALSE, escape, value, hex, tag_val_length, table_pos, in_script=FALSE;
437
+	int retval=FALSE, escape, value, hex, tag_val_length=0, table_pos, in_script=FALSE;
393 438
 	FILE *stream_in;
394 439
 	html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
395 440
 	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
... ...
@@ -400,6 +476,13 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
400 400
 	unsigned long length;
401 401
 	file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script;
402 402
 	file_buff_t *file_tmp_o1;
403
+#ifdef CL_EXPERIMENTAL
404
+	int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
405
+	unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/
406
+	unsigned char* ptrend=NULL;/*end of <a> contents*/
407
+	unsigned char* in_form_action = NULL;/* the action URL of the current <form> tag, if any*/
408
+	tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
409
+#endif
403 410
 
404 411
 	if (!m_area) {
405 412
 		if (fd < 0) {
... ...
@@ -421,6 +504,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
421 421
 	tag_args.count = 0;
422 422
 	tag_args.tag = NULL;
423 423
 	tag_args.value = NULL;
424
+#ifdef CL_EXPERIMENTAL
425
+	tag_args.contents = NULL;
426
+#endif
424 427
 	
425 428
 	if (dirname) {
426 429
 		snprintf(filename, 1024, "%s/rfc2397", dirname);
... ...
@@ -498,6 +584,10 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
498 498
 		
499 499
 	ptr = line = cli_readline(stream_in, m_area, 8192);
500 500
 	while (line) {
501
+#ifdef CL_EXPERIMENTAL
502
+		if(href_contents_begin)
503
+			href_contents_begin=ptr;/*start of a new line, last line already appended to contents see below*/
504
+#endif
501 505
 		while (*ptr && isspace(*ptr)) {
502 506
 			ptr++;
503 507
 		}
... ...
@@ -542,10 +632,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
542 542
 				break;
543 543
 			case HTML_NORM:
544 544
 				if (*ptr == '<') {
545
+#ifdef CL_EXPERIMENTAL
546
+					ptrend=ptr-1; /* for use by scanContents */
547
+#endif
545 548
 					html_output_c(file_buff_o1, file_buff_o2, '<');
546 549
 					if (in_script) {
547 550
 						html_output_c(file_buff_script, NULL, '<');
548 551
 					}
552
+#ifdef CL_EXPERIMENTAL
553
+					if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
554
+						/*append this text portion to the contents of <a>*/
555
+						html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
556
+						html_tag_contents_length_check(hrefs,&in_ahref);
557
+						href_contents_begin=NULL;/*We just encountered another tag inside <a>, so skip it*/
558
+					}
559
+#endif
549 560
 					ptr++;
550 561
 					state = HTML_SKIP_WS;
551 562
 					tag_length=0;
... ...
@@ -815,6 +916,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
815 815
 						in_script=FALSE;
816 816
 						html_output_c(file_buff_script, NULL, '\n');
817 817
 					}
818
+#ifdef CL_EXPERIMENTAL
819
+					if (hrefs && hrefs->scanContents && in_ahref) {
820
+						if(strcmp(tag,"/a") == 0) {
821
+							html_tag_contents_done(hrefs,in_ahref);
822
+							in_ahref=0;/* we are no longer inside an <a href>
823
+							nesting <a> tags not supported, and shouldn't be supported*/
824
+						}
825
+						href_contents_begin=ptr;
826
+					}
827
+					if (strcmp(tag, "/form") == 0)  {
828
+					if (in_form_action)
829
+						free(in_form_action);
830
+						in_form_action = NULL;
831
+					}
832
+#endif
818 833
 				} else if (strcmp(tag, "script") == 0) {
819 834
 					arg_value = html_tag_arg_value(&tag_args, "language");
820 835
 					if (arg_value && (strcasecmp(arg_value, "jscript.encode") == 0)) {
... ...
@@ -830,26 +946,130 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
830 830
 					}
831 831
 					html_output_tag(file_buff_script, tag, &tag_args);
832 832
 				} else if (hrefs) {
833
+#ifdef CL_EXPERIMENTAL
834
+					if(in_ahref && !href_contents_begin)
835
+						href_contents_begin=ptr;
836
+#endif
833 837
 					if (strcmp(tag, "a") == 0) {
834 838
 						arg_value = html_tag_arg_value(&tag_args, "href");
835 839
 						if (arg_value && strlen(arg_value) > 0) {
840
+#ifdef CL_EXPERIMENTAL
841
+							if (hrefs->scanContents) {
842
+								const unsigned char* arg_value_title = html_tag_arg_value(&tag_args,"title");
843
+								/*beginning of an <a> tag*/
844
+								if (in_ahref)
845
+									/*we encountered nested <a> tags, pretend previous closed*/
846
+									if (href_contents_begin) {
847
+										html_tag_contents_append(hrefs,in_ahref,
848
+											href_contents_begin,ptrend);
849
+										/*add pending contents between tags*/
850
+										html_tag_contents_done(hrefs,in_ahref);
851
+										in_ahref=0;
852
+										}
853
+								if (arg_value_title) {
854
+									/* title is a 'displayed link'*/
855
+									html_tag_arg_add(hrefs,"href_title",arg_value_title);
856
+									hrefs->contents[hrefs->count-1]=blobCreate();
857
+									html_tag_contents_append(hrefs,hrefs->count,arg_value,
858
+										arg_value+strlen(arg_value));
859
+									html_tag_contents_done(hrefs,hrefs->count);
860
+								}
861
+								if (in_form_action) {
862
+									/* form action is the real URL, and href is the 'displayed' */
863
+									html_tag_arg_add(hrefs,"form",arg_value);
864
+									hrefs->contents[hrefs->count-1] =  blobCreate();
865
+									html_tag_contents_append(hrefs, hrefs->count, in_form_action,
866
+											in_form_action + strlen(in_form_action));
867
+									html_tag_contents_done(hrefs,hrefs->count);
868
+								}
869
+							}
870
+#endif
836 871
 							html_tag_arg_add(hrefs, "href", arg_value);
872
+#ifdef CL_EXPERIMENTAL
873
+							if (hrefs->scanContents) {
874
+								in_ahref=hrefs->count; /* index of this tag (counted from 1) */
875
+								href_contents_begin=ptr;/* contents begin after <a ..> ends */
876
+								hrefs->contents[hrefs->count-1]=blobCreate();
877
+							}
878
+#endif
837 879
 						}
880
+#ifdef CL_EXPERIMENTAL
881
+					} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
882
+						const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action");
883
+						if (arg_action_value)
884
+							in_form_action = strdup(arg_action_value);
885
+#endif
838 886
 					} else if (strcmp(tag, "img") == 0) {
839 887
 						arg_value = html_tag_arg_value(&tag_args, "src");
840 888
 						if (arg_value && strlen(arg_value) > 0) {
841 889
 							html_tag_arg_add(hrefs, "src", arg_value);
890
+#ifdef CL_EXPERIMENTAL
891
+							if(hrefs->scanContents && in_ahref)
892
+								/* "contents" of an img tag, is the URL of its parent <a> tag */
893
+								html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
894
+							if (in_form_action) {
895
+								/* form action is the real URL, and href is the 'displayed' */
896
+								html_tag_arg_add(hrefs,"form",arg_value);
897
+								hrefs->contents[hrefs->count-1] =  blobCreate();
898
+								html_tag_contents_append(hrefs, hrefs->count, in_form_action,
899
+										in_form_action + strlen(in_form_action));
900
+								html_tag_contents_done(hrefs,hrefs->count);
901
+							}
902
+#endif
842 903
 						}
843 904
 						arg_value = html_tag_arg_value(&tag_args, "dynsrc");
844 905
 						if (arg_value && strlen(arg_value) > 0) {
845 906
 							html_tag_arg_add(hrefs, "dynsrc", arg_value);
907
+#ifdef CL_EXPERIMENTAL
908
+							if(hrefs->scanContents && in_ahref)
909
+								/* see above */
910
+								html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
911
+							if (in_form_action) {
912
+								/* form action is the real URL, and href is the 'displayed' */
913
+								html_tag_arg_add(hrefs,"form",arg_value);
914
+								hrefs->contents[hrefs->count-1] =  blobCreate();
915
+								html_tag_contents_append(hrefs, hrefs->count, in_form_action,
916
+										in_form_action + strlen(in_form_action));
917
+								html_tag_contents_done(hrefs,hrefs->count);
918
+							}
919
+#endif
846 920
 						}
847 921
 					} else if (strcmp(tag, "iframe") == 0) {
848 922
 						arg_value = html_tag_arg_value(&tag_args, "src");
849 923
 						if (arg_value && strlen(arg_value) > 0) {
850 924
 							html_tag_arg_add(hrefs, "iframe", arg_value);
925
+#ifdef CL_EXPERIMENTAL
926
+							if(hrefs->scanContents && in_ahref)
927
+								/* see above */
928
+								html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
929
+							if (in_form_action) {
930
+								/* form action is the real URL, and href is the 'displayed' */
931
+								html_tag_arg_add(hrefs,"form",arg_value);
932
+								hrefs->contents[hrefs->count-1] =  blobCreate();
933
+								html_tag_contents_append(hrefs, hrefs->count, in_form_action,
934
+										in_form_action + strlen(in_form_action));
935
+								html_tag_contents_done(hrefs,hrefs->count);
936
+							}
851 937
 						}
852
-					}						
938
+					} else if (strcmp(tag,"area") == 0) {
939
+						arg_value = html_tag_arg_value(&tag_args,"href");
940
+						if (arg_value && strlen(arg_value) > 0) {
941
+							html_tag_arg_add(hrefs, "area", arg_value);
942
+							if(hrefs->scanContents && in_ahref)
943
+								/* see above */
944
+								html_tag_set_inahref(hrefs,hrefs->count,in_ahref);
945
+							if (in_form_action) {
946
+								/* form action is the real URL, and href is the 'displayed' */
947
+								html_tag_arg_add(hrefs,"form",arg_value);
948
+								hrefs->contents[hrefs->count-1] =  blobCreate();
949
+								html_tag_contents_append(hrefs, hrefs->count, in_form_action,
950
+									in_form_action + strlen(in_form_action));
951
+								html_tag_contents_done(hrefs,hrefs->count);
952
+							}
953
+#endif
954
+						}						
955
+					}
956
+					/* TODO:imagemaps can have urls too */
853 957
 				}
854 958
 				html_tag_arg_free(&tag_args);
855 959
 				break;
... ...
@@ -871,6 +1091,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
871 871
 					ptr++;
872 872
 				} else if (*ptr == ';') {
873 873
 					html_output_c(file_buff_o1, file_buff_o2, value);
874
+#ifdef CL_EXPERIMENTAL
875
+					if (tag_val_length < HTML_STR_LENGTH) {
876
+					tag_val[tag_val_length++] = value; /* store encoded values too */
877
+					}
878
+#endif
874 879
 					state = next_state;
875 880
 					next_state = HTML_BAD_STATE;
876 881
 					ptr++;
... ...
@@ -1155,12 +1380,24 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1155 1155
 				break;	
1156 1156
 			}
1157 1157
 		}
1158
+#ifdef CL_EXPERIMENTAL
1159
+		if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
1160
+			/* end of line, append contents now, resume on next line */
1161
+			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr-1);
1162
+		ptrend = NULL;
1163
+#endif
1158 1164
 		free(line);
1159 1165
 		ptr = line = cli_readline(stream_in, m_area, 8192);
1160 1166
 	}
1161 1167
 	
1162 1168
 	retval = TRUE;
1163 1169
 abort:
1170
+#ifdef CL_EXPERIMENTAL
1171
+	if (in_form_action)
1172
+		free(in_form_action);
1173
+	if (in_ahref) /* tag not closed, force closing */
1174
+		html_tag_contents_done(hrefs,in_ahref);
1175
+#endif
1164 1176
 	html_tag_arg_free(&tag_args);
1165 1177
 	if (!m_area) {
1166 1178
 		fclose(stream_in);