Browse code

* libclamav/htmlnorm.c: RFC2397 ("data" URL scheme) support. * libclamav/scanner.c: scan RFC2397 data.

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@1244 77e5149b-7576-45b1-b177-96237e5ba77b

Trog authored on 2005/01/14 23:56:09
Showing 3 changed files
... ...
@@ -1,3 +1,8 @@
1
+Fri Jan 14 14:53:59 GMT 2005 (trog)
2
+-----------------------------------
3
+  * libclamav/htmlnorm.c: RFC2397 ("data" URL scheme) support.
4
+  * libclamav/scanner.c: scan RFC2397 data.
5
+
1 6
 Wed Jan 12 08:58:29 GMT 2005 (njh)
2 7
 ----------------------------------
3 8
   * clamav-milter:	Fixed DNS resolution error messages which could print
... ...
@@ -68,6 +68,12 @@ typedef enum {
68 68
     HTML_JSDECODE_LENGTH,
69 69
     HTML_JSDECODE_DECRYPT,
70 70
     HTML_SPECIAL_CHAR,
71
+    HTML_RFC2397_TYPE,
72
+    HTML_RFC2397_INIT,
73
+    HTML_RFC2397_DATA,
74
+    HTML_RFC2397_FINISH,
75
+    HTML_RFC2397_ESC,
76
+    HTML_ESCAPE_CHAR,
71 77
 } html_state;
72 78
 
73 79
 typedef enum {
... ...
@@ -383,18 +389,19 @@ void html_tag_arg_free(tag_arguments_t *tags)
383 383
 
384 384
 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs)
385 385
 {
386
-	int fd_tmp, tag_length, tag_arg_length;
386
+	int fd_tmp, tag_length, tag_arg_length, binary;
387 387
 	int retval=FALSE, escape, value, hex, tag_val_length, table_pos, in_script=FALSE;
388 388
 	FILE *stream_in;
389 389
 	html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
390 390
 	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
391
-	char tag_val[HTML_STR_LENGTH+1];
391
+	char tag_val[HTML_STR_LENGTH+1], *tmp_file;
392 392
 	unsigned char *line, *ptr, *arg_value;
393 393
 	tag_arguments_t tag_args;
394 394
 	quoted_state quoted;
395 395
 	unsigned long length;
396 396
 	file_buff_t *file_buff_o1, *file_buff_o2, *file_buff_script;
397
-	
397
+	file_buff_t *file_tmp_o1;
398
+
398 399
 	if (!m_area) {
399 400
 		if (fd < 0) {
400 401
 			cli_dbgmsg("Invalid HTML fd\n");
... ...
@@ -417,6 +424,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
417 417
 	tag_args.value = NULL;
418 418
 	
419 419
 	if (dirname) {
420
+		snprintf(filename, 1024, "%s/rfc2397", dirname);
421
+		if (mkdir(filename, 0700)) {
422
+			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
423
+			goto abort;
424
+		}
420 425
 		file_buff_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
421 426
 		if (!file_buff_o1) {
422 427
 			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
... ...
@@ -482,19 +494,21 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
482 482
 		file_buff_o2 = NULL;
483 483
 		file_buff_script = NULL;
484 484
 	}
485
-			
485
+	
486
+	binary = FALSE;
487
+		
486 488
 	ptr = line = cli_readline(stream_in, m_area, 8192);
487 489
 	while (line) {
488 490
 		while (*ptr && isspace(*ptr)) {
489 491
 			ptr++;
490 492
 		}
491 493
 		while (*ptr) {
492
-			if (*ptr == '\n') {
494
+			if (!binary && *ptr == '\n') {
493 495
 				/* Convert it to a space and re-process */
494 496
 				*ptr = ' ';
495 497
 				continue;
496 498
 			}
497
-			if (*ptr == '\r') {
499
+			if (!binary && *ptr == '\r') {
498 500
 				ptr++;
499 501
 				continue;
500 502
 			}
... ...
@@ -647,7 +661,42 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
647 647
 				}
648 648
 				break;
649 649
 			case HTML_TAG_ARG_VAL:
650
-				if (*ptr == '&') {
650
+				if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
651
+					/* RFC2397 inline data */
652
+
653
+					/* Rewind one byte so we don't recurse */
654
+					if (file_buff_o1 && (file_buff_o1->length > 0)) {
655
+						file_buff_o1->length--;
656
+					}
657
+					if (file_buff_o2 && (file_buff_o2->length > 0)) {
658
+						file_buff_o2->length--;
659
+					}
660
+					
661
+					if (quoted != NOT_QUOTED) {
662
+						html_output_c(file_buff_o1, file_buff_o2, '"');
663
+					}
664
+					tag_val_length = 0;
665
+					state = HTML_RFC2397_TYPE;
666
+					next_state = HTML_TAG_ARG;
667
+				} else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
668
+					/* RFC2397 inline data */
669
+
670
+					/* Rewind one byte so we don't recurse */
671
+					if (file_buff_o1 && (file_buff_o1->length > 0)) {
672
+						file_buff_o1->length--;
673
+					}
674
+					if (file_buff_o2 && (file_buff_o2->length > 0)) {
675
+						file_buff_o2->length--;
676
+					}
677
+					
678
+					if (quoted != NOT_QUOTED) {
679
+						html_output_c(file_buff_o1, file_buff_o2, '"');
680
+					}
681
+
682
+					tag_val_length = 0;
683
+					state = HTML_RFC2397_TYPE;
684
+					next_state = HTML_TAG_ARG;
685
+				} else if (*ptr == '&') {
651 686
 					state = HTML_CHAR_REF;
652 687
 					next_state = HTML_TAG_ARG_VAL;
653 688
 					ptr++;
... ...
@@ -923,6 +972,188 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
923 923
 				ptr++;
924 924
 				length--;
925 925
 				break;
926
+				
927
+			case HTML_RFC2397_TYPE:
928
+				if (*ptr == '\'') {
929
+					if (!escape && (quoted==SINGLE_QUOTED)) {
930
+						/* Early end of data detected. Error */
931
+						ptr++;
932
+						state = HTML_SKIP_WS;
933
+						tag_arg_length=0;
934
+						next_state = HTML_TAG_ARG;
935
+					} else {
936
+						if (tag_val_length < HTML_STR_LENGTH) {
937
+							tag_val[tag_val_length++] = '"';
938
+						}
939
+						ptr++;
940
+					}
941
+				} else if (*ptr == '"') {
942
+					if (!escape && (quoted==DOUBLE_QUOTED)) {
943
+						/* Early end of data detected. Error */
944
+						ptr++;
945
+						state = HTML_SKIP_WS;
946
+						tag_arg_length=0;
947
+						next_state = HTML_TAG_ARG;
948
+					} else {
949
+						if (tag_val_length < HTML_STR_LENGTH) {
950
+							tag_val[tag_val_length++] = '"';
951
+						}
952
+						ptr++;
953
+					}
954
+				} else if (isspace(*ptr) || (*ptr == '>')) {
955
+					if (quoted == NOT_QUOTED) {
956
+						/* Early end of data detected. Error */
957
+						state = HTML_SKIP_WS;
958
+						tag_arg_length=0;
959
+						next_state = HTML_TAG_ARG;
960
+					} else {
961
+						if (tag_val_length < HTML_STR_LENGTH) {
962
+							if (isspace(*ptr)) {
963
+								tag_val[tag_val_length++] = ' ';
964
+							} else {
965
+								tag_val[tag_val_length++] = '>';
966
+							}
967
+						}
968
+						state = HTML_SKIP_WS;
969
+						escape = FALSE;
970
+						quoted = NOT_QUOTED;
971
+						next_state = HTML_RFC2397_TYPE;
972
+						ptr++;
973
+					}
974
+				} else if (*ptr == ',') {
975
+					/* Beginning of data */
976
+					tag_val[tag_val_length] = '\0';
977
+					state = HTML_RFC2397_INIT;
978
+					escape = FALSE;
979
+					next_state = HTML_BAD_STATE;
980
+					ptr++;
981
+				
982
+				} else {
983
+					if (tag_val_length < HTML_STR_LENGTH) {
984
+						tag_val[tag_val_length++] = tolower(*ptr);
985
+					}
986
+					ptr++;
987
+				}
988
+				if (*ptr == '\\') {
989
+					escape = TRUE;
990
+				} else {
991
+					escape = FALSE;
992
+				}
993
+				break;
994
+			case HTML_RFC2397_INIT:
995
+				file_tmp_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
996
+				if (!file_tmp_o1) {
997
+					goto abort;
998
+				}
999
+				snprintf(filename, 1024, "%s/rfc2397", dirname);
1000
+				tmp_file = cli_gentemp(filename);
1001
+				cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
1002
+				file_tmp_o1->fd = open(tmp_file, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
1003
+				free(tmp_file);
1004
+				if (!file_tmp_o1->fd) {
1005
+					cli_dbgmsg("open failed: %s\n", filename);
1006
+					free(file_tmp_o1);
1007
+					goto abort;
1008
+				}
1009
+				file_tmp_o1->length = 0;
1010
+				
1011
+				html_output_str(file_tmp_o1, "From html-normalise\n", 20);
1012
+				html_output_str(file_tmp_o1, "Content-type: ", 14);
1013
+				if ((tag_val_length == 0) && (*tag_val == ';')) {
1014
+						html_output_str(file_tmp_o1, "text/plain\n", 11);
1015
+				}
1016
+				html_output_str(file_tmp_o1, tag_val, tag_val_length);
1017
+				html_output_c(file_tmp_o1, NULL, '\n');
1018
+				if (strstr(tag_val, ";base64") != NULL) {
1019
+					html_output_str(file_tmp_o1, "Content-transfer-encoding: base64\n", 34);
1020
+				}
1021
+				html_output_c(file_tmp_o1, NULL, '\n');
1022
+				state = HTML_RFC2397_DATA;
1023
+				binary = TRUE;
1024
+				break;
1025
+			case HTML_RFC2397_DATA:
1026
+				if (*ptr == '&') {
1027
+					state = HTML_CHAR_REF;
1028
+					next_state = HTML_RFC2397_DATA;
1029
+					ptr++;
1030
+				} else if (*ptr == '%') {
1031
+					length = 0;
1032
+					value = 0;
1033
+					state = HTML_ESCAPE_CHAR;
1034
+					next_state = HTML_RFC2397_ESC;
1035
+					ptr++;
1036
+				} else if (*ptr == '\'') {
1037
+					if (!escape && (quoted==SINGLE_QUOTED)) {
1038
+						state = HTML_RFC2397_FINISH;
1039
+						ptr++;
1040
+					} else {
1041
+						html_output_c(file_tmp_o1, NULL, *ptr);
1042
+						ptr++;
1043
+					}
1044
+				} else if (*ptr == '\"') {
1045
+					if (!escape && (quoted=DOUBLE_QUOTED)) {
1046
+						state = HTML_RFC2397_FINISH;
1047
+						ptr++;
1048
+					} else {
1049
+						html_output_c(file_tmp_o1, NULL, *ptr);
1050
+						ptr++;
1051
+					}
1052
+				} else if (isspace(*ptr) || (*ptr == '>')) {
1053
+					if (quoted == NOT_QUOTED) {
1054
+						state = HTML_RFC2397_FINISH;
1055
+						ptr++;
1056
+					} else {
1057
+						html_output_c(file_tmp_o1, NULL, *ptr);
1058
+						ptr++;
1059
+					}
1060
+				} else {
1061
+					html_output_c(file_tmp_o1, NULL, *ptr);
1062
+					ptr++;
1063
+				}
1064
+				if (*ptr == '\\') {
1065
+					escape = TRUE;
1066
+				} else {
1067
+					escape = FALSE;
1068
+				}
1069
+				break;
1070
+			case HTML_RFC2397_FINISH:
1071
+				html_output_flush(file_tmp_o1);
1072
+				close(file_tmp_o1->fd);
1073
+				free(file_tmp_o1);
1074
+				state = HTML_SKIP_WS;
1075
+				escape = FALSE;
1076
+				quoted = NOT_QUOTED;
1077
+				next_state = HTML_TAG_ARG;
1078
+				binary = FALSE;
1079
+				break;
1080
+			case HTML_RFC2397_ESC:
1081
+				if (length == 2) {
1082
+					html_output_c(file_tmp_o1, NULL, value);
1083
+				} else if (length == 1) {
1084
+					html_output_c(file_tmp_o1, NULL, '%');
1085
+					html_output_c(file_tmp_o1, NULL, value+'0');
1086
+				} else {
1087
+					html_output_c(file_tmp_o1, NULL, '%');
1088
+				}
1089
+				state = HTML_RFC2397_DATA;
1090
+				break;		
1091
+			case HTML_ESCAPE_CHAR:
1092
+				value *= 16;
1093
+				length++;
1094
+				if (isxdigit(*ptr)) {
1095
+					if (isdigit(*ptr)) {
1096
+						value += (*ptr - '0');
1097
+					} else {
1098
+						value += (tolower(*ptr) - 'a' + 10);
1099
+					}
1100
+				} else {
1101
+					state = next_state;
1102
+				}
1103
+				if (length == 2) {
1104
+					state = next_state;
1105
+				}
1106
+				ptr++;
1107
+				break;	
926 1108
 			}
927 1109
 		}
928 1110
 		free(line);
... ...
@@ -719,67 +719,6 @@ static int cli_scanmscab(int desc, const char **virname, long int *scanned, cons
719 719
     return ret;
720 720
 }
721 721
 
722
-static int cli_scanhtml(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, unsigned int options, int *arec, int *mrec)
723
-{
724
-	char *tempname, fullname[1024];
725
-	int ret=CL_CLEAN, fd;
726
-
727
-
728
-    cli_dbgmsg("in cli_scanhtml()\n");
729
-
730
-    tempname = cli_gentemp(NULL);
731
-    if(mkdir(tempname, 0700)) {
732
-        cli_dbgmsg("ScanHTML -> Can't create temporary directory %s\n", tempname);
733
-        return CL_ETMPDIR;
734
-    }
735
-
736
-    html_normalise_fd(desc, tempname, NULL);
737
-    snprintf(fullname, 1024, "%s/comment.html", tempname);
738
-    fd = open(fullname, O_RDONLY);
739
-    if (fd >= 0) {
740
-        ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
741
-	close(fd);
742
-    }
743
-
744
-    if(ret < 0 || ret == CL_VIRUS) {
745
-	if(!cli_leavetemps_flag)
746
-	    cli_rmdirs(tempname);
747
-	free(tempname);
748
-	return ret;
749
-    }
750
-
751
-    if (ret == CL_CLEAN) {
752
-	snprintf(fullname, 1024, "%s/nocomment.html", tempname);
753
-	fd = open(fullname, O_RDONLY);
754
-	if (fd >= 0) {
755
-	    ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
756
-	    close(fd);
757
-	}
758
-    }
759
-
760
-    if(ret < 0 || ret == CL_VIRUS) {
761
-	if(!cli_leavetemps_flag)
762
-	    cli_rmdirs(tempname);
763
-	free(tempname);
764
-	return ret;
765
-    }
766
-
767
-    if (ret == CL_CLEAN) {
768
-	snprintf(fullname, 1024, "%s/script.html", tempname);
769
-	fd = open(fullname, O_RDONLY);
770
-	if (fd >= 0) {
771
-	    ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
772
-	    close(fd);
773
-	}
774
-    }
775
-
776
-    if(!cli_leavetemps_flag)
777
-        cli_rmdirs(tempname);
778
-
779
-    free(tempname);
780
-    return ret;
781
-}
782
-
783 722
 static int cli_scandir(const char *dirname, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, unsigned int options, int *arec, int *mrec)
784 723
 {
785 724
 	DIR *dd;
... ...
@@ -986,6 +925,79 @@ static int cli_vba_scandir(const char *dirname, const char **virname, long int *
986 986
     return ret;
987 987
 }
988 988
 
989
+static int cli_scanhtml(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, unsigned int options, int *arec, int *mrec)
990
+{
991
+	char *tempname, fullname[1024];
992
+	int ret=CL_CLEAN, fd;
993
+
994
+
995
+    cli_dbgmsg("in cli_scanhtml()\n");
996
+
997
+    tempname = cli_gentemp(NULL);
998
+    if(mkdir(tempname, 0700)) {
999
+        cli_dbgmsg("ScanHTML -> Can't create temporary directory %s\n", tempname);
1000
+        return CL_ETMPDIR;
1001
+    }
1002
+
1003
+    html_normalise_fd(desc, tempname, NULL);
1004
+    snprintf(fullname, 1024, "%s/comment.html", tempname);
1005
+    fd = open(fullname, O_RDONLY);
1006
+    if (fd >= 0) {
1007
+        ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
1008
+	close(fd);
1009
+    }
1010
+
1011
+    if(ret < 0 || ret == CL_VIRUS) {
1012
+	if(!cli_leavetemps_flag)
1013
+	    cli_rmdirs(tempname);
1014
+	free(tempname);
1015
+	return ret;
1016
+    }
1017
+
1018
+    if (ret == CL_CLEAN) {
1019
+	snprintf(fullname, 1024, "%s/nocomment.html", tempname);
1020
+	fd = open(fullname, O_RDONLY);
1021
+	if (fd >= 0) {
1022
+	    ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
1023
+	    close(fd);
1024
+	}
1025
+    }
1026
+
1027
+    if(ret < 0 || ret == CL_VIRUS) {
1028
+	if(!cli_leavetemps_flag)
1029
+	    cli_rmdirs(tempname);
1030
+	free(tempname);
1031
+	return ret;
1032
+    }
1033
+
1034
+    if (ret == CL_CLEAN) {
1035
+	snprintf(fullname, 1024, "%s/script.html", tempname);
1036
+	fd = open(fullname, O_RDONLY);
1037
+	if (fd >= 0) {
1038
+	    ret = cli_scandesc(fd, virname, scanned, root, 0, CL_TYPE_HTML);
1039
+	    close(fd);
1040
+	}
1041
+    }
1042
+
1043
+    if(ret < 0 || ret == CL_VIRUS) {
1044
+	if(!cli_leavetemps_flag)
1045
+	    cli_rmdirs(tempname);
1046
+	free(tempname);
1047
+	return ret;
1048
+    }
1049
+
1050
+    if (ret == CL_CLEAN) {
1051
+    	snprintf(fullname, 1024, "%s/rfc2397", tempname);
1052
+    	ret = cli_scandir(fullname, virname, scanned, root, limits, options, arec, mrec);
1053
+    }
1054
+
1055
+    if(!cli_leavetemps_flag)
1056
+        cli_rmdirs(tempname);
1057
+
1058
+    free(tempname);
1059
+    return ret;
1060
+}
1061
+
989 1062
 static int cli_scanole2(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, unsigned int options, int *arec, int *mrec)
990 1063
 {
991 1064
 	char *dir;