Browse code

tagless version of HTML file (bb #162); fix compiler warning

git-svn: trunk@3619

Török Edvin authored on 2008/02/12 06:41:58
Showing 5 changed files
... ...
@@ -1,3 +1,8 @@
1
+Mon Feb 11 23:27:47 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/scanners.c, htmlnorm.c: tagless version of HTML file (bb #162)
4
+  * libclamav/scanners.c, textnorm.[ch]: fix compiler warning
5
+
1 6
 Mon Feb 11 22:16:10 CET 2008 (acab)
2 7
 -----------------------------------
3 8
   * libclamav: account for scanned data in cli_scanbuff too
... ...
@@ -500,7 +500,7 @@ static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
500 500
 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
501 501
 {
502 502
 	int fd_tmp, tag_length, tag_arg_length, binary;
503
-	int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE;
503
+	int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE, text_space_written=FALSE;
504 504
 	FILE *stream_in = NULL;
505 505
 	html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
506 506
 	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
... ...
@@ -509,7 +509,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
509 509
 	tag_arguments_t tag_args;
510 510
 	quoted_state quoted;
511 511
 	unsigned long length;
512
-	file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script;
512
+	file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script, *file_buff_text;
513 513
 	file_buff_t *file_tmp_o1;
514 514
 	int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
515 515
 	unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/
... ...
@@ -550,19 +550,19 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
550 550
 	if (dirname) {
551 551
 		snprintf(filename, 1024, "%s/rfc2397", dirname);
552 552
 		if (mkdir(filename, 0700) && errno != EEXIST) {
553
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
553
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
554 554
 			goto abort;
555 555
 		}
556 556
 		file_buff_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
557 557
 		if (!file_buff_o1) {
558
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
558
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
559 559
 			goto abort;
560 560
 		}
561 561
 
562 562
 		file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
563 563
 		if (!file_buff_o2) {
564 564
 			free(file_buff_o1);
565
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
565
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
566 566
 			goto abort;
567 567
 		}
568 568
 
... ...
@@ -570,7 +570,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
570 570
 		if (!file_buff_script) {
571 571
 			free(file_buff_o1);
572 572
 			free(file_buff_o2);
573
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
573
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
574 574
 			goto abort;
575 575
 		}
576 576
 
... ...
@@ -581,7 +581,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
581 581
 			free(file_buff_o1);
582 582
 			free(file_buff_o2);
583 583
 			free(file_buff_script);
584
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
584
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
585 585
 			goto abort;
586 586
 		}
587 587
 
... ...
@@ -593,7 +593,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
593 593
 			free(file_buff_o1);
594 594
 			free(file_buff_o2);
595 595
 			free(file_buff_script);
596
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
596
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
597 597
 			goto abort;
598 598
 		}
599 599
 
... ...
@@ -606,17 +606,39 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
606 606
 			free(file_buff_o1);
607 607
 			free(file_buff_o2);
608 608
 			free(file_buff_script);
609
-			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
609
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
610 610
 			goto abort;
611 611
 		}
612 612
 
613
+		file_buff_text = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
614
+		if(!file_buff_text) {
615
+			free(file_buff_o1);
616
+			free(file_buff_o2);
617
+			free(file_buff_script);
618
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
619
+			goto abort;
620
+		}
621
+		snprintf(filename, 1024, "%s/notags.html", dirname);
622
+		file_buff_text->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
623
+		if(!file_buff_text->fd) {
624
+			cli_dbgmsg("open failed: %s\n", filename);
625
+			close(file_buff_o1->fd);
626
+			close(file_buff_o2->fd);
627
+			free(file_buff_o1);
628
+			free(file_buff_o2);
629
+			free(file_buff_script);
630
+			free(file_buff_text);
631
+			file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL;
632
+		}
613 633
 		file_buff_o1->length = 0;
614 634
 		file_buff_o2->length = 0;
615 635
 		file_buff_script->length = 0;
636
+		file_buff_text->length = 0;
616 637
 	} else {
617 638
 		file_buff_o1 = NULL;
618 639
 		file_buff_o2 = NULL;
619 640
 		file_buff_script = NULL;
641
+		file_buff_text = NULL;
620 642
 	}
621 643
 
622 644
 	binary = FALSE;
... ...
@@ -677,6 +699,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
677 677
 					html_output_c(file_buff_o1, file_buff_o2, '<');
678 678
 					if (in_script) {
679 679
 						html_output_c(file_buff_script, NULL, '<');
680
+					} else if(!text_space_written) {
681
+						html_output_c(file_buff_text, NULL, ' ');
682
+						text_space_written = TRUE;
680 683
 					}
681 684
 					if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
682 685
 						/*append this text portion to the contents of <a>*/
... ...
@@ -688,9 +713,17 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
688 688
 					tag_length=0;
689 689
 					next_state = HTML_TAG;
690 690
 				} else if (isspace(*ptr)) {
691
+					if(!text_space_written && !in_script) {
692
+						html_output_c(file_buff_text, NULL, ' ');
693
+						text_space_written = TRUE;
694
+					}
691 695
 					state = HTML_TRIM_WS;
692 696
 					next_state = HTML_NORM;
693 697
 				} else if (*ptr == '&') {
698
+					if(!text_space_written && !in_script) {
699
+						html_output_c(file_buff_text, NULL, ' ');
700
+						text_space_written = TRUE;
701
+					}
694 702
 					state = HTML_CHAR_REF;
695 703
 					next_state = HTML_NORM;
696 704
 					ptr++;
... ...
@@ -698,6 +731,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
698 698
 					html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr));
699 699
 					if (in_script) {
700 700
 						html_output_c(file_buff_script, NULL, tolower(*ptr));
701
+					} else {
702
+						html_output_c(file_buff_text, NULL, tolower(*ptr));
703
+						text_space_written = FALSE;
701 704
 					}
702 705
 					ptr++;
703 706
 				}
... ...
@@ -979,6 +1015,18 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
979 979
 						in_script = TRUE;
980 980
 					}
981 981
 					html_output_tag(file_buff_script, tag, &tag_args);
982
+				} else if (strcmp(tag, "a") == 0) {
983
+					arg_value = html_tag_arg_value(&tag_args, "href");
984
+					if(arg_value && arg_value[0]) {
985
+						html_output_str(file_buff_text, arg_value, strlen(arg_value));
986
+						html_output_c(file_buff_text, NULL, ' ');
987
+					}
988
+				} else if (strcmp(tag, "img") == 0) {
989
+					arg_value = html_tag_arg_value(&tag_args, "src");
990
+					if(arg_value && arg_value[0]) {
991
+						html_output_str(file_buff_text, arg_value, strlen(arg_value));
992
+						html_output_c(file_buff_text, NULL, ' ');
993
+					}
982 994
 				} else if (hrefs) {
983 995
 					if(in_ahref && !href_contents_begin)
984 996
 						href_contents_begin=ptr;
... ...
@@ -1540,6 +1588,11 @@ abort:
1540 1540
 		close(file_buff_script->fd);
1541 1541
 		free(file_buff_script);
1542 1542
 	}
1543
+	if(file_buff_text) {
1544
+		html_output_flush(file_buff_text);
1545
+		close(file_buff_text->fd);
1546
+		free(file_buff_text);
1547
+	}
1543 1548
 	return retval;
1544 1549
 }
1545 1550
 
... ...
@@ -885,6 +885,7 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
885 885
 	char *tempname, fullname[1024];
886 886
 	int ret=CL_CLEAN, fd;
887 887
 	struct stat sb;
888
+	struct stat first_stat;
888 889
 
889 890
 
890 891
     cli_dbgmsg("in cli_scanhtml()\n");
... ...
@@ -910,36 +911,40 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
910 910
     }
911 911
 
912 912
     html_normalise_fd(desc, tempname, NULL, ctx->dconf);
913
-    snprintf(fullname, 1024, "%s/comment.html", tempname);
913
+    snprintf(fullname, 1024, "%s/nocomment.html", tempname);
914 914
     fd = open(fullname, O_RDONLY|O_BINARY);
915 915
     if (fd >= 0) {
916
-        ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL);
917
-	close(fd);
918
-    }
919
-
920
-    if(ret < 0 || ret == CL_VIRUS) {
921
-	if(!cli_leavetemps_flag)
922
-	    cli_rmdirs(tempname);
923
-	free(tempname);
924
-	return ret;
916
+	if(fstat(fd, &first_stat) == -1) {
917
+		cli_errmsg("cli_scanhtml: fstat() failed for %s: %d\n", fullname, fd);
918
+		close(fd);
919
+		ret = CL_EIO;
920
+	} else {
921
+		ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL);
922
+		close(fd);
923
+	}
925 924
     }
926 925
 
927 926
     if (ret == CL_CLEAN) {
928
-	snprintf(fullname, 1024, "%s/nocomment.html", tempname);
927
+	snprintf(fullname, 1024, "%s/comment.html", tempname);
929 928
 	fd = open(fullname, O_RDONLY|O_BINARY);
930 929
 	if (fd >= 0) {
931
-	    ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL);
932
-	    close(fd);
930
+	    if(fstat(fd, &sb) == -1) {
931
+		cli_errmsg("cli_scanhtml: fstat() failed for %s: %d\n", fullname, fd);
932
+		close(fd);
933
+		ret = CL_EIO;
934
+	    } else {
935
+		    if(sb.st_size != first_stat.st_size) {
936
+			    /* scan only if HTML contained comments, otherwise we already scanned it
937
+			     * above */
938
+			    ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL);
939
+		    } else {
940
+			    cli_dbgmsg("Skipping comment.html because it is identical to nocomment.html\n");
941
+		    }
942
+		    close(fd);
943
+	    }
933 944
 	}
934 945
     }
935 946
 
936
-    if(ret < 0 || ret == CL_VIRUS) {
937
-	if(!cli_leavetemps_flag)
938
-	    cli_rmdirs(tempname);
939
-	free(tempname);
940
-	return ret;
941
-    }
942
-
943 947
     if (ret == CL_CLEAN) {
944 948
 	snprintf(fullname, 1024, "%s/script.html", tempname);
945 949
 	fd = open(fullname, O_RDONLY|O_BINARY);
... ...
@@ -949,16 +954,18 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
949 949
 	}
950 950
     }
951 951
 
952
-    if(ret < 0 || ret == CL_VIRUS) {
953
-	if(!cli_leavetemps_flag)
954
-	    cli_rmdirs(tempname);
955
-	free(tempname);
956
-	return ret;
952
+    if(ret == CL_CLEAN) {
953
+	    snprintf(fullname, 1024, "%s/notags.html", tempname);
954
+	    fd = open(fullname, O_RDONLY|O_BINARY);
955
+	    if(fd >= 0) {
956
+		    ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL);
957
+		    close(fd);
958
+	    }
957 959
     }
958 960
 
959 961
     if (ret == CL_CLEAN) {
960
-    	snprintf(fullname, 1024, "%s/rfc2397", tempname);
961
-    	ret = cli_scandir(fullname, ctx, 0);
962
+	snprintf(fullname, 1024, "%s/rfc2397", tempname);
963
+	ret = cli_scandir(fullname, ctx, 0);
962 964
     }
963 965
 
964 966
     if(!cli_leavetemps_flag)
... ...
@@ -1025,7 +1032,7 @@ static int cli_scanscript(int desc, cli_ctx *ctx)
1025 1025
 			}
1026 1026
 			text_normalize_reset(&state);
1027 1027
 		}
1028
-		if(nread > 0 && (text_normalize_buffer(&state, buff, nread)) != nread) {
1028
+		if(nread > 0 && (text_normalize_buffer(&state, buff, nread) != nread)) {
1029 1029
 			cli_dbgmsg("cli_scanscript: short read during normalizing\n");
1030 1030
 		}
1031 1031
 		/* used a do {}while() here, since we need to flush our buffers at the end,
... ...
@@ -84,7 +84,7 @@ static const enum normalize_action char_action[256] = {
84 84
 /* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
85 85
  * Stores the normalized text in @state's buffer. 
86 86
  * Returns how many bytes it consumed of the input. */
87
-size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
87
+ssize_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
88 88
 {
89 89
 	size_t i;
90 90
 	const unsigned char *out_end = state->out + state->out_len;
... ...
@@ -27,4 +27,4 @@ struct text_norm_state {
27 27
 
28 28
 int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len);
29 29
 void text_normalize_reset(struct text_norm_state* state);
30
-size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len);
30
+ssize_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len);