Browse code

strip spaces from scripts, normalize screnc

git-svn: trunk@3675

Török Edvin authored on 2008/02/26 20:05:11
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Tue Feb 26 12:06:48 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/htmlnorm.c: strip spaces from scripts, normalize screnc
4
+
1 5
 Mon Feb 25 13:01:46 EET 2008 (edwin)
2 6
 ------------------------------------
3 7
   * revert the Makefile changes in r3671.
... ...
@@ -633,7 +633,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
633 633
 				if (isspace(*ptr)) {
634 634
 					ptr++;
635 635
 				} else {
636
-					html_output_c(file_buff_o2, ' ');
636
+					if(!in_script)
637
+						html_output_c(file_buff_o2, ' ');
637 638
 					state = next_state;
638 639
 					next_state = HTML_BAD_STATE;
639 640
 				}
... ...
@@ -673,8 +674,15 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
673 673
 				} else {
674 674
 					html_output_c(file_buff_o2, tolower(*ptr));
675 675
 					if (!in_script) {
676
-						html_output_c(file_buff_text, tolower(*ptr));
677
-						text_space_written = FALSE;
676
+						if(*ptr < 0x20) {
677
+							if(!text_space_written) {
678
+								html_output_c(file_buff_text, ' ');
679
+								text_space_written = TRUE;
680
+							}
681
+						} else {
682
+							html_output_c(file_buff_text, tolower(*ptr));
683
+							text_space_written = FALSE;
684
+						}
678 685
 					}
679 686
 					ptr++;
680 687
 				}
... ...
@@ -710,7 +718,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
710 710
 					tag[tag_length] = '\0';
711 711
 					state = HTML_SKIP_WS;
712 712
 					tag_arg_length = 0;
713
-					next_state = HTML_TAG_ARG;
713
+					/* if we're inside a script we only care about </script>.
714
+					 * If we went to HTML_TAG_ARG, whitespace would be inconsistently normalized. */
715
+					next_state = !in_script ? HTML_TAG_ARG : HTML_NORM;
714 716
 				}
715 717
 				break;
716 718
 			case HTML_TAG_ARG:
... ...
@@ -895,7 +905,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
895 895
 				}
896 896
 				break;
897 897
 			case HTML_COMMENT:
898
-				if (in_script) {
898
+				if (in_script && !isspace(*ptr)) {
899 899
 					/* dump script to nocomment.html, since we no longer have
900 900
 					 * comment.html/script.html */
901 901
 					html_output_c(file_buff_o2, tolower(*ptr));
... ...
@@ -1070,12 +1080,14 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1070 1070
 					if(arg_value && arg_value[0]) {
1071 1071
 						html_output_str(file_buff_text, arg_value, strlen(arg_value));
1072 1072
 						html_output_c(file_buff_text, ' ');
1073
+						text_space_written = TRUE;
1073 1074
 					}
1074 1075
 				} else if (strcmp(tag, "img") == 0) {
1075 1076
 					arg_value = html_tag_arg_value(&tag_args, "src");
1076 1077
 					if(arg_value && arg_value[0]) {
1077 1078
 						html_output_str(file_buff_text, arg_value, strlen(arg_value));
1078 1079
 						html_output_c(file_buff_text, ' ');
1080
+						text_space_written = TRUE;
1079 1081
 					}
1080 1082
 				}
1081 1083
 				html_tag_arg_free(&tag_args);
... ...
@@ -1253,21 +1265,27 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1253 1253
 						case 0x21:
1254 1254
 							html_output_c(file_buff_o2, 0x3c);
1255 1255
 							break;
1256
+							/*
1256 1257
 						case 0x23:
1257 1258
 							html_output_c(file_buff_o2, 0x0d);
1258 1259
 							break;
1260
+							we strip whitespace
1261
+							*/
1259 1262
 						case 0x24:
1260 1263
 							html_output_c(file_buff_o2, 0x40);
1261 1264
 							break;
1265
+							/*
1262 1266
 						case 0x26:
1263 1267
 							html_output_c(file_buff_o2, 0x0a);
1264 1268
 							break;
1269
+							we strip whitespace 
1270
+							*/
1265 1271
 						case 0x2a:
1266 1272
 							html_output_c(file_buff_o2, 0x3e);
1267 1273
 							break;
1268 1274
 						}
1269
-					} else {
1270
-						html_output_c(file_buff_o2, value);
1275
+					} else if(!isspace(value&0xff)) {
1276
+						html_output_c(file_buff_o2, tolower(value&0xff));
1271 1277
 					}
1272 1278
 				}
1273 1279
 				table_pos = (table_pos + 1) % 64;
... ...
@@ -1642,15 +1660,19 @@ int html_screnc_decode(int fd, const char *dirname)
1642 1642
 				case 0x21:
1643 1643
 					html_output_c(&file_buff, 0x3c);
1644 1644
 					break;
1645
-				case 0x23:
1645
+				/*case 0x23:
1646 1646
 					html_output_c(&file_buff, 0x0d);
1647 1647
 					break;
1648
+					we strip whitespace
1649
+					*/
1648 1650
 				case 0x24:
1649 1651
 					html_output_c(&file_buff, 0x40);
1650
-					break;				
1651
-				case 0x26:
1652
+					break;
1653
+				/*case 0x26:
1652 1654
 					html_output_c(&file_buff, 0x0a);
1653 1655
 					break;
1656
+					we strip whitespace
1657
+					*/
1654 1658
 				case 0x2a:
1655 1659
 					html_output_c(&file_buff, 0x3e);
1656 1660
 					break;
... ...
@@ -1664,8 +1686,8 @@ int html_screnc_decode(int fd, const char *dirname)
1664 1664
 					result = decrypt_tables[table_order[table_pos]][*ptr];
1665 1665
 					if (result == 0xFF) { /* special character */
1666 1666
 						state = HTML_SPECIAL_CHAR;
1667
-					} else {
1668
-						html_output_c(&file_buff, (char)result);
1667
+					} else if(!isspace(result&0xff)) {
1668
+						html_output_c(&file_buff, tolower(result&0xff));
1669 1669
 					}
1670 1670
 				}
1671 1671
 				ptr++;