git-svn: trunk@3619
Török Edvin authored on 2008/02/12 06:41:58... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Mon Feb 11 23:27:47 EET 2008 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav/scanners.c, htmlnorm.c: also generate and scan a tagless version of the HTML file (bb #162) |
|
4 |
+ * libclamav/scanners.c, textnorm.[ch]: fix compiler warning |
|
5 |
+ |
|
1 | 6 |
Mon Feb 11 22:16:10 CET 2008 (acab) |
2 | 7 |
----------------------------------- |
3 | 8 |
* libclamav: account for scanned data in cli_scanbuff too |
... | ... |
@@ -500,7 +500,7 @@ static inline void html_tag_contents_done(tag_arguments_t *tags,int idx) |
500 | 500 |
static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf) |
501 | 501 |
{ |
502 | 502 |
int fd_tmp, tag_length, tag_arg_length, binary; |
503 |
- int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE; |
|
503 |
+ int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE, text_space_written=FALSE; |
|
504 | 504 |
FILE *stream_in = NULL; |
505 | 505 |
html_state state=HTML_NORM, next_state=HTML_BAD_STATE; |
506 | 506 |
char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1]; |
... | ... |
@@ -509,7 +509,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
509 | 509 |
tag_arguments_t tag_args; |
510 | 510 |
quoted_state quoted; |
511 | 511 |
unsigned long length; |
512 |
- file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script; |
|
512 |
+ file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script, *file_buff_text; |
|
513 | 513 |
file_buff_t *file_tmp_o1; |
514 | 514 |
int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/ |
515 | 515 |
unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/ |
... | ... |
@@ -550,19 +550,19 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
550 | 550 |
if (dirname) { |
551 | 551 |
snprintf(filename, 1024, "%s/rfc2397", dirname); |
552 | 552 |
if (mkdir(filename, 0700) && errno != EEXIST) { |
553 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
553 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
554 | 554 |
goto abort; |
555 | 555 |
} |
556 | 556 |
file_buff_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t)); |
557 | 557 |
if (!file_buff_o1) { |
558 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
558 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
559 | 559 |
goto abort; |
560 | 560 |
} |
561 | 561 |
|
562 | 562 |
file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t)); |
563 | 563 |
if (!file_buff_o2) { |
564 | 564 |
free(file_buff_o1); |
565 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
565 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
566 | 566 |
goto abort; |
567 | 567 |
} |
568 | 568 |
|
... | ... |
@@ -570,7 +570,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
570 | 570 |
if (!file_buff_script) { |
571 | 571 |
free(file_buff_o1); |
572 | 572 |
free(file_buff_o2); |
573 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
573 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
574 | 574 |
goto abort; |
575 | 575 |
} |
576 | 576 |
|
... | ... |
@@ -581,7 +581,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
581 | 581 |
free(file_buff_o1); |
582 | 582 |
free(file_buff_o2); |
583 | 583 |
free(file_buff_script); |
584 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
584 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
585 | 585 |
goto abort; |
586 | 586 |
} |
587 | 587 |
|
... | ... |
@@ -593,7 +593,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
593 | 593 |
free(file_buff_o1); |
594 | 594 |
free(file_buff_o2); |
595 | 595 |
free(file_buff_script); |
596 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
596 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
597 | 597 |
goto abort; |
598 | 598 |
} |
599 | 599 |
|
... | ... |
@@ -606,17 +606,39 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
606 | 606 |
free(file_buff_o1); |
607 | 607 |
free(file_buff_o2); |
608 | 608 |
free(file_buff_script); |
609 |
- file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
|
609 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
610 | 610 |
goto abort; |
611 | 611 |
} |
612 | 612 |
|
613 |
+ file_buff_text = (file_buff_t *) cli_malloc(sizeof(file_buff_t)); |
|
614 |
+ if(!file_buff_text) { |
|
615 |
+ free(file_buff_o1); |
|
616 |
+ free(file_buff_o2); |
|
617 |
+ free(file_buff_script); |
|
618 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
619 |
+ goto abort; |
|
620 |
+ } |
|
621 |
+ snprintf(filename, 1024, "%s/notags.html", dirname); |
|
622 |
+ file_buff_text->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR); |
|
623 |
+ if(!file_buff_text->fd) { |
|
624 |
+ cli_dbgmsg("open failed: %s\n", filename); |
|
625 |
+ close(file_buff_o1->fd); |
|
626 |
+ close(file_buff_o2->fd); |
|
627 |
+ free(file_buff_o1); |
|
628 |
+ free(file_buff_o2); |
|
629 |
+ free(file_buff_script); |
|
630 |
+ free(file_buff_text); |
|
631 |
+ file_buff_o1 = file_buff_o2 = file_buff_script = file_buff_text = NULL; |
|
632 |
+ } |
|
613 | 633 |
file_buff_o1->length = 0; |
614 | 634 |
file_buff_o2->length = 0; |
615 | 635 |
file_buff_script->length = 0; |
636 |
+ file_buff_text->length = 0; |
|
616 | 637 |
} else { |
617 | 638 |
file_buff_o1 = NULL; |
618 | 639 |
file_buff_o2 = NULL; |
619 | 640 |
file_buff_script = NULL; |
641 |
+ file_buff_text = NULL; |
|
620 | 642 |
} |
621 | 643 |
|
622 | 644 |
binary = FALSE; |
... | ... |
@@ -677,6 +699,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
677 | 677 |
html_output_c(file_buff_o1, file_buff_o2, '<'); |
678 | 678 |
if (in_script) { |
679 | 679 |
html_output_c(file_buff_script, NULL, '<'); |
680 |
+ } else if(!text_space_written) { |
|
681 |
+ html_output_c(file_buff_text, NULL, ' '); |
|
682 |
+ text_space_written = TRUE; |
|
680 | 683 |
} |
681 | 684 |
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) { |
682 | 685 |
/*append this text portion to the contents of <a>*/ |
... | ... |
@@ -688,9 +713,17 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
688 | 688 |
tag_length=0; |
689 | 689 |
next_state = HTML_TAG; |
690 | 690 |
} else if (isspace(*ptr)) { |
691 |
+ if(!text_space_written && !in_script) { |
|
692 |
+ html_output_c(file_buff_text, NULL, ' '); |
|
693 |
+ text_space_written = TRUE; |
|
694 |
+ } |
|
691 | 695 |
state = HTML_TRIM_WS; |
692 | 696 |
next_state = HTML_NORM; |
693 | 697 |
} else if (*ptr == '&') { |
698 |
+ if(!text_space_written && !in_script) { |
|
699 |
+ html_output_c(file_buff_text, NULL, ' '); |
|
700 |
+ text_space_written = TRUE; |
|
701 |
+ } |
|
694 | 702 |
state = HTML_CHAR_REF; |
695 | 703 |
next_state = HTML_NORM; |
696 | 704 |
ptr++; |
... | ... |
@@ -698,6 +731,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
698 | 698 |
html_output_c(file_buff_o1, file_buff_o2, tolower(*ptr)); |
699 | 699 |
if (in_script) { |
700 | 700 |
html_output_c(file_buff_script, NULL, tolower(*ptr)); |
701 |
+ } else { |
|
702 |
+ html_output_c(file_buff_text, NULL, tolower(*ptr)); |
|
703 |
+ text_space_written = FALSE; |
|
701 | 704 |
} |
702 | 705 |
ptr++; |
703 | 706 |
} |
... | ... |
@@ -979,6 +1015,18 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
979 | 979 |
in_script = TRUE; |
980 | 980 |
} |
981 | 981 |
html_output_tag(file_buff_script, tag, &tag_args); |
982 |
+ } else if (strcmp(tag, "a") == 0) { |
|
983 |
+ arg_value = html_tag_arg_value(&tag_args, "href"); |
|
984 |
+ if(arg_value && arg_value[0]) { |
|
985 |
+ html_output_str(file_buff_text, arg_value, strlen(arg_value)); |
|
986 |
+ html_output_c(file_buff_text, NULL, ' '); |
|
987 |
+ } |
|
988 |
+ } else if (strcmp(tag, "img") == 0) { |
|
989 |
+ arg_value = html_tag_arg_value(&tag_args, "src"); |
|
990 |
+ if(arg_value && arg_value[0]) { |
|
991 |
+ html_output_str(file_buff_text, arg_value, strlen(arg_value)); |
|
992 |
+ html_output_c(file_buff_text, NULL, ' '); |
|
993 |
+ } |
|
982 | 994 |
} else if (hrefs) { |
983 | 995 |
if(in_ahref && !href_contents_begin) |
984 | 996 |
href_contents_begin=ptr; |
... | ... |
@@ -1540,6 +1588,11 @@ abort: |
1540 | 1540 |
close(file_buff_script->fd); |
1541 | 1541 |
free(file_buff_script); |
1542 | 1542 |
} |
1543 |
+ if(file_buff_text) { |
|
1544 |
+ html_output_flush(file_buff_text); |
|
1545 |
+ close(file_buff_text->fd); |
|
1546 |
+ free(file_buff_text); |
|
1547 |
+ } |
|
1543 | 1548 |
return retval; |
1544 | 1549 |
} |
1545 | 1550 |
|
... | ... |
@@ -885,6 +885,7 @@ static int cli_scanhtml(int desc, cli_ctx *ctx) |
885 | 885 |
char *tempname, fullname[1024]; |
886 | 886 |
int ret=CL_CLEAN, fd; |
887 | 887 |
struct stat sb; |
888 |
+ struct stat first_stat; |
|
888 | 889 |
|
889 | 890 |
|
890 | 891 |
cli_dbgmsg("in cli_scanhtml()\n"); |
... | ... |
@@ -910,36 +911,40 @@ static int cli_scanhtml(int desc, cli_ctx *ctx) |
910 | 910 |
} |
911 | 911 |
|
912 | 912 |
html_normalise_fd(desc, tempname, NULL, ctx->dconf); |
913 |
- snprintf(fullname, 1024, "%s/comment.html", tempname); |
|
913 |
+ snprintf(fullname, 1024, "%s/nocomment.html", tempname); |
|
914 | 914 |
fd = open(fullname, O_RDONLY|O_BINARY); |
915 | 915 |
if (fd >= 0) { |
916 |
- ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL); |
|
917 |
- close(fd); |
|
918 |
- } |
|
919 |
- |
|
920 |
- if(ret < 0 || ret == CL_VIRUS) { |
|
921 |
- if(!cli_leavetemps_flag) |
|
922 |
- cli_rmdirs(tempname); |
|
923 |
- free(tempname); |
|
924 |
- return ret; |
|
916 |
+ if(fstat(fd, &first_stat) == -1) { |
|
917 |
+ cli_errmsg("cli_scanhtml: fstat() failed for %s: %d\n", fullname, fd); |
|
918 |
+ close(fd); |
|
919 |
+ ret = CL_EIO; |
|
920 |
+ } else { |
|
921 |
+ ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL); |
|
922 |
+ close(fd); |
|
923 |
+ } |
|
925 | 924 |
} |
926 | 925 |
|
927 | 926 |
if (ret == CL_CLEAN) { |
928 |
- snprintf(fullname, 1024, "%s/nocomment.html", tempname); |
|
927 |
+ snprintf(fullname, 1024, "%s/comment.html", tempname); |
|
929 | 928 |
fd = open(fullname, O_RDONLY|O_BINARY); |
930 | 929 |
if (fd >= 0) { |
931 |
- ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL); |
|
932 |
- close(fd); |
|
930 |
+ if(fstat(fd, &sb) == -1) { |
|
931 |
+ cli_errmsg("cli_scanhtml: fstat() failed for %s: %d\n", fullname, fd); |
|
932 |
+ close(fd); |
|
933 |
+ ret = CL_EIO; |
|
934 |
+ } else { |
|
935 |
+ if(sb.st_size != first_stat.st_size) { |
|
936 |
+ /* scan only if HTML contained comments, otherwise we already scanned it |
|
937 |
+ * above */ |
|
938 |
+ ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL); |
|
939 |
+ } else { |
|
940 |
+ cli_dbgmsg("Skipping comment.html because it is identical to nocomment.html\n"); |
|
941 |
+ } |
|
942 |
+ close(fd); |
|
943 |
+ } |
|
933 | 944 |
} |
934 | 945 |
} |
935 | 946 |
|
936 |
- if(ret < 0 || ret == CL_VIRUS) { |
|
937 |
- if(!cli_leavetemps_flag) |
|
938 |
- cli_rmdirs(tempname); |
|
939 |
- free(tempname); |
|
940 |
- return ret; |
|
941 |
- } |
|
942 |
- |
|
943 | 947 |
if (ret == CL_CLEAN) { |
944 | 948 |
snprintf(fullname, 1024, "%s/script.html", tempname); |
945 | 949 |
fd = open(fullname, O_RDONLY|O_BINARY); |
... | ... |
@@ -949,16 +954,18 @@ static int cli_scanhtml(int desc, cli_ctx *ctx) |
949 | 949 |
} |
950 | 950 |
} |
951 | 951 |
|
952 |
- if(ret < 0 || ret == CL_VIRUS) { |
|
953 |
- if(!cli_leavetemps_flag) |
|
954 |
- cli_rmdirs(tempname); |
|
955 |
- free(tempname); |
|
956 |
- return ret; |
|
952 |
+ if(ret == CL_CLEAN) { |
|
953 |
+ snprintf(fullname, 1024, "%s/notags.html", tempname); |
|
954 |
+ fd = open(fullname, O_RDONLY|O_BINARY); |
|
955 |
+ if(fd >= 0) { |
|
956 |
+ ret = cli_scandesc(fd, ctx, 0, CL_TYPE_HTML, 0, NULL); |
|
957 |
+ close(fd); |
|
958 |
+ } |
|
957 | 959 |
} |
958 | 960 |
|
959 | 961 |
if (ret == CL_CLEAN) { |
960 |
- snprintf(fullname, 1024, "%s/rfc2397", tempname); |
|
961 |
- ret = cli_scandir(fullname, ctx, 0); |
|
962 |
+ snprintf(fullname, 1024, "%s/rfc2397", tempname); |
|
963 |
+ ret = cli_scandir(fullname, ctx, 0); |
|
962 | 964 |
} |
963 | 965 |
|
964 | 966 |
if(!cli_leavetemps_flag) |
... | ... |
@@ -1025,7 +1032,7 @@ static int cli_scanscript(int desc, cli_ctx *ctx) |
1025 | 1025 |
} |
1026 | 1026 |
text_normalize_reset(&state); |
1027 | 1027 |
} |
1028 |
- if(nread > 0 && (text_normalize_buffer(&state, buff, nread)) != nread) { |
|
1028 |
+ if(nread > 0 && (text_normalize_buffer(&state, buff, nread) != nread)) { |
|
1029 | 1029 |
cli_dbgmsg("cli_scanscript: short read during normalizing\n"); |
1030 | 1030 |
} |
1031 | 1031 |
/* used a do {}while() here, since we need to flush our buffers at the end, |
... | ... |
@@ -84,7 +84,7 @@ static const enum normalize_action char_action[256] = { |
84 | 84 |
/* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters. |
85 | 85 |
* Stores the normalized text in @state's buffer. |
86 | 86 |
* Returns how many bytes it consumed of the input. */ |
87 |
-size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len) |
|
87 |
+ssize_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len) |
|
88 | 88 |
{ |
89 | 89 |
size_t i; |
90 | 90 |
const unsigned char *out_end = state->out + state->out_len; |
... | ... |
@@ -27,4 +27,4 @@ struct text_norm_state { |
27 | 27 |
|
28 | 28 |
int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len); |
29 | 29 |
void text_normalize_reset(struct text_norm_state* state); |
30 |
-size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len); |
|
30 |
+ssize_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len); |