Browse code

improve normalization of screnc encoded files (bb #1022); extract duplicated code into its own function

git-svn: trunk@3945

Török Edvin authored on 2008/07/14 23:50:52
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Mon Jul 14 17:26:03 EEST 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/htmlnorm.c: improve normalization of screnc encoded files(bb #1022)
4
+
1 5
 Mon Jul 14 13:57:15 CEST 2008 (tk)
2 6
 ----------------------------------
3 7
   * freshclam/manager.c: fix rename() problem on w32 (bb#1084)
... ...
@@ -73,7 +73,7 @@ typedef enum {
73 73
     HTML_TAG_ARG_EQUAL,
74 74
     HTML_PROCESS_TAG,
75 75
     HTML_CHAR_REF_DECODE,
76
-    HTML_SKIP_LENGTH,
76
+    HTML_LOOKFOR_SCRENC,
77 77
     HTML_JSDECODE,
78 78
     HTML_JSDECODE_LENGTH,
79 79
     HTML_JSDECODE_DECRYPT,
... ...
@@ -276,7 +276,7 @@ static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int
276 276
 			if(count < chunk_len) {
277 277
 				chunk[count] = '\0';
278 278
 				/* seek-back to space */
279
-				fseek(stream, (long)(count - chunk_len), SEEK_CUR);
279
+				fseek(stream, -(long)(chunk_len - count), SEEK_CUR);
280 280
 			}
281 281
 		}
282 282
 	}
... ...
@@ -491,18 +491,108 @@ static inline void html_tag_contents_done(tag_arguments_t *tags,int idx)
491 491
 	blobClose(tags->contents[idx-1]);
492 492
 }
493 493
 
494
+struct screnc_state {
495
+	uint32_t length;
496
+	uint32_t sum;
497
+	uint8_t  table_pos;
498
+};
499
+
500
+/* inplace decoding, so that we can normalize it later */
501
+static void *screnc_decode(unsigned char *ptr, struct screnc_state *s)
502
+{
503
+	uint8_t  value;
504
+	unsigned char *dst = ptr;
505
+
506
+	if(!ptr || !s)
507
+		return;
508
+	while(s->length > 0 && *ptr) {
509
+		if ((*ptr == '\n') || (*ptr == '\r')) {
510
+			ptr++;
511
+			continue;
512
+		}
513
+		if (*ptr < 0x80) {
514
+			value = decrypt_tables[table_order[s->table_pos]][*ptr];
515
+			if (value == 0xFF) { /* special character */
516
+				ptr++;
517
+				s->length--;
518
+				switch (*ptr) {
519
+					case '\0':
520
+						/* Fixup for end of line */
521
+						ptr--;
522
+						break;
523
+					case 0x21:
524
+						value = 0x3c;
525
+						break;
526
+					case 0x23:
527
+						value = 0x0d;
528
+						break;
529
+					case 0x24:
530
+						value = 0x40;
531
+						break;
532
+					case 0x26:
533
+						value = 0x0a;
534
+						break;
535
+					case 0x2a:
536
+						value = 0x3e;
537
+						break;
538
+				}
539
+			}
540
+			s->sum += value;
541
+			*dst++ = value;
542
+			s->table_pos = (s->table_pos + 1) % 64;
543
+		} else {
544
+			*dst++ = *ptr++;
545
+			*dst++ = *ptr;
546
+		}
547
+		ptr++;
548
+		s->length--;
549
+	}
550
+	if(!s->length) {
551
+		size_t remaining;
552
+		if(strlen(ptr) >= 12) {
553
+			uint32_t expected;
554
+			expected = base64_chars[ptr[0]] << 2;
555
+			expected += base64_chars[ptr[1]] >> 4;
556
+			expected += (base64_chars[ptr[1]] & 0x0f) << 12;
557
+			expected += (base64_chars[ptr[2]] >> 2) << 8;
558
+			expected += (base64_chars[ptr[2]] & 0x03) << 22;
559
+			expected += base64_chars[ptr[3]] << 16;
560
+			expected += (base64_chars[ptr[4]] << 2) << 24;
561
+			expected += (base64_chars[ptr[5]] >> 4) << 24;
562
+			ptr += 8;
563
+			if(s->sum != expected) {
564
+				cli_dbgmsg("screnc_decode: checksum mismatch: %lu != %lu\n", s->sum, expected);
565
+			} else {
566
+				if(strncmp(ptr, "^#~@", 4) != 0) {
567
+					cli_dbgmsg("screnc_decode: terminator not found\n");
568
+				} else {
569
+					cli_dbgmsg("screnc_decode: OK\n");
570
+				}
571
+			}
572
+			ptr += 4;
573
+		}
574
+		/* copy remaining */
575
+		remaining = strlen(ptr) + 1;
576
+		memmove(dst, ptr, remaining);
577
+	} else {
578
+		*dst = '\0';
579
+	}
580
+}
581
+
494 582
 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
495 583
 {
496 584
 	int fd_tmp, tag_length, tag_arg_length, binary;
497
-	int retval=FALSE, escape, value = 0, hex, tag_val_length=0, table_pos, in_script=FALSE, text_space_written=FALSE;
585
+	int retval=FALSE, escape, value = 0, hex, tag_val_length=0;
586
+	int look_for_screnc=FALSE, in_screnc=FALSE,in_script=FALSE, text_space_written=FALSE, spacew=FALSE;
498 587
 	FILE *stream_in = NULL;
499
-	html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
588
+	html_state state=HTML_NORM, next_state=HTML_BAD_STATE, saved_next_state=HTML_BAD_STATE;
500 589
 	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
501 590
 	char tag_val[HTML_STR_LENGTH+1], *tmp_file;
502
-	unsigned char *line, *ptr, *arg_value;
591
+	unsigned char *line, *ptr, *arg_value, *ptr_screnc;
503 592
 	tag_arguments_t tag_args;
504 593
 	quoted_state quoted;
505 594
 	unsigned long length;
595
+	struct screnc_state screnc_state;
506 596
 	file_buff_t *file_buff_o2, *file_buff_text;
507 597
 	file_buff_t *file_tmp_o1;
508 598
 	int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
... ...
@@ -618,13 +708,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
618 618
 				/* An engine error has occurred */
619 619
 				cli_dbgmsg("HTML Engine Error\n");
620 620
 				goto abort;
621
-			case HTML_SKIP_LENGTH:
622
-				length--;
623
-				ptr++;
624
-				if (!length) {
625
-					state = next_state;
626
-				}
627
-				break;
628 621
 			case HTML_SKIP_WS:
629 622
 				if (isspace(*ptr)) {
630 623
 					ptr++;
... ...
@@ -971,7 +1054,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
971 971
 						next_state = HTML_JSDECODE;
972 972
 						/* we already output the old tag, output the new tag now */
973 973
 						html_output_tag(file_buff_o2, tag, &tag_args);
974
-					} else {
974
+					} else if(strcmp(tag, "script") == 0) {
975 975
 						in_script = TRUE;
976 976
 						if(dconf_js && !js_state) {
977 977
 							js_state = cli_js_init();
... ...
@@ -981,6 +1064,15 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
981 981
 							js_begin = ptr;
982 982
 						}
983 983
 					}
984
+				} else if(strcmp(tag, "%@") == 0) {
985
+					arg_value = html_tag_arg_value(&tag_args, "language");
986
+					if(arg_value && strcasecmp(arg_value,"jscript.encode") == 0||
987
+							strcasecmp(arg_value, "vbscript.encode") == 0) {
988
+
989
+						saved_next_state = next_state;
990
+						next_state = state;
991
+						state = HTML_LOOKFOR_SCRENC;
992
+					}
984 993
 				} else if (hrefs) {
985 994
 					if(in_ahref && !href_contents_begin)
986 995
 						href_contents_begin=ptr;
... ...
@@ -1235,6 +1327,16 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1235 1235
 					next_state = HTML_BAD_STATE;
1236 1236
 				}
1237 1237
 				break;
1238
+			case HTML_LOOKFOR_SCRENC:
1239
+				look_for_screnc = TRUE;
1240
+				ptr_screnc = strstr(ptr, "#@~^");
1241
+				if(ptr_screnc) {
1242
+					*ptr_screnc = '\0';
1243
+					ptr_screnc += 4;
1244
+				}
1245
+				state = next_state;
1246
+				next_state = saved_next_state;
1247
+				break;
1238 1248
 			case HTML_JSDECODE:
1239 1249
 				/* Check for start marker */
1240 1250
 				if (strncmp(ptr, "#@~^", 4) == 0) {
... ...
@@ -1252,68 +1354,33 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1252 1252
 					next_state = HTML_BAD_STATE;
1253 1253
 					break;
1254 1254
 				}
1255
-				length = base64_chars[ptr[0]] << 2;
1256
-				length += base64_chars[ptr[1]] >> 4;
1257
-				length += (base64_chars[ptr[1]] & 0x0f) << 12;
1258
-				length += (base64_chars[ptr[2]] >> 2) << 8;
1259
-				length += (base64_chars[ptr[2]] & 0x03) << 22;
1260
-				length += base64_chars[ptr[3]] << 16;
1261
-				length += (base64_chars[ptr[4]] << 2) << 24;
1262
-				length += (base64_chars[ptr[5]] >> 4) << 24;
1263
-				table_pos = 0;
1255
+				memset(&screnc_state, 0, sizeof(screnc_state));
1256
+				screnc_state.length = base64_chars[ptr[0]] << 2;
1257
+				screnc_state.length += base64_chars[ptr[1]] >> 4;
1258
+				screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;
1259
+				screnc_state.length += (base64_chars[ptr[2]] >> 2) << 8;
1260
+				screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;
1261
+				screnc_state.length += base64_chars[ptr[3]] << 16;
1262
+				screnc_state.length += (base64_chars[ptr[4]] << 2) << 24;
1263
+				screnc_state.length += (base64_chars[ptr[5]] >> 4) << 24;
1264 1264
 				state = HTML_JSDECODE_DECRYPT;
1265
+				in_screnc = TRUE;
1265 1266
 				next_state = HTML_BAD_STATE;
1266 1267
 				ptr += 8;
1267 1268
 				break;
1268 1269
 			case HTML_JSDECODE_DECRYPT:
1269
-				if (length == 0) {
1270
+				screnc_decode(ptr, &screnc_state);
1271
+				if(!screnc_state.length) {
1270 1272
 					html_output_str(file_buff_o2, "</script>\n", 10);
1271
-					length = 12;
1272
-					state = HTML_SKIP_LENGTH;
1273
-					next_state = HTML_NORM;
1273
+					state = HTML_NORM;
1274
+					next_state = HTML_BAD_STATE;
1275
+					in_screnc = FALSE;
1274 1276
 					break;
1277
+				} else {
1278
+					state = HTML_NORM;
1279
+					next_state = HTML_BAD_STATE;
1275 1280
 				}
1276
-				if (*ptr < 0x80) {
1277
-					value = decrypt_tables[table_order[table_pos]][*ptr];
1278
-					if (value == 0xFF) { /* special character */
1279
-						ptr++;
1280
-						length--;
1281
-						switch (*ptr) {
1282
-						case '\0':
1283
-							/* Fixup for end of line */
1284
-							ptr--;
1285
-							break;
1286
-						case 0x21:
1287
-							html_output_c(file_buff_o2, 0x3c);
1288
-							break;
1289
-							/*
1290
-						case 0x23:
1291
-							html_output_c(file_buff_o2, 0x0d);
1292
-							break;
1293
-							we strip whitespace
1294
-							*/
1295
-						case 0x24:
1296
-							html_output_c(file_buff_o2, 0x40);
1297
-							break;
1298
-							/*
1299
-						case 0x26:
1300
-							html_output_c(file_buff_o2, 0x0a);
1301
-							break;
1302
-							we strip whitespace 
1303
-							*/
1304
-						case 0x2a:
1305
-							html_output_c(file_buff_o2, 0x3e);
1306
-							break;
1307
-						}
1308
-					} else if(!isspace(value&0xff)) {
1309
-						html_output_c(file_buff_o2, tolower(value&0xff));
1310
-					}
1311
-				}
1312
-				table_pos = (table_pos + 1) % 64;
1313
-				ptr++;
1314
-				length--;
1315 1281
 				break;
1316
-
1317 1282
 			case HTML_RFC2397_TYPE:
1318 1283
 				if (*ptr == '\'') {
1319 1284
 					if (!escape && (quoted==SINGLE_QUOTED)) {
... ...
@@ -1528,8 +1595,24 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1528 1528
 				js_state = NULL;
1529 1529
 			}
1530 1530
 		}
1531
+		if(look_for_screnc && ptr_screnc) {
1532
+			/* start found, and stuff before it already processed */
1533
+			ptr = ptr_screnc;
1534
+			ptr_screnc = NULL;
1535
+			state = HTML_JSDECODE_LENGTH;
1536
+			next_state = HTML_BAD_STATE;
1537
+			continue;
1538
+		}
1531 1539
 		free(line);
1532 1540
 		ptr = line = cli_readchunk(stream_in, m_area, 8192);
1541
+		if (in_screnc) {
1542
+			state = HTML_JSDECODE_DECRYPT;
1543
+			next_state = HTML_BAD_STATE;
1544
+		} else if(look_for_screnc && !ptr_screnc) {
1545
+			saved_next_state = next_state;
1546
+			next_state = state;
1547
+			state = HTML_LOOKFOR_SCRENC;
1548
+		}
1533 1549
 	}
1534 1550
 
1535 1551
 	if(dconf_entconv) {
... ...
@@ -1620,14 +1703,14 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const
1620 1620
 
1621 1621
 int html_screnc_decode(int fd, const char *dirname)
1622 1622
 {
1623
-	int fd_tmp, table_pos=0, result, count, state, retval=FALSE;
1623
+	int fd_tmp, result, count, retval=FALSE;
1624 1624
 	unsigned char *line, tmpstr[6];
1625
-	unsigned long length;
1626 1625
 	unsigned char *ptr, filename[1024];
1627 1626
 	FILE *stream_in;
1628
-	file_buff_t file_buff;
1629
-	
1630
-	lseek(fd, 0, SEEK_SET);	
1627
+	int ofd;
1628
+	struct screnc_state screnc_state;
1629
+
1630
+	lseek(fd, 0, SEEK_SET);
1631 1631
 	fd_tmp = dup(fd);
1632 1632
 	if (fd_tmp < 0) {
1633 1633
 		return FALSE;
... ...
@@ -1637,17 +1720,16 @@ int html_screnc_decode(int fd, const char *dirname)
1637 1637
 		close(fd_tmp);
1638 1638
 		return FALSE;
1639 1639
 	}
1640
-	
1640
+
1641 1641
 	snprintf(filename, 1024, "%s/screnc.html", dirname);
1642
-	file_buff.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
1643
-	file_buff.length = 0;
1644
-	
1645
-	if (!file_buff.fd) {
1642
+	ofd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
1643
+
1644
+	if (!ofd) {
1646 1645
 		cli_dbgmsg("open failed: %s\n", filename);
1647 1646
 		fclose(stream_in);
1648 1647
 		return FALSE;
1649 1648
 	}
1650
-	
1649
+
1651 1650
 	while ((line = cli_readchunk(stream_in, NULL, 8192)) != NULL) {
1652 1651
 		ptr = strstr(line, "#@~^");
1653 1652
 		if (ptr) {
... ...
@@ -1658,7 +1740,7 @@ int html_screnc_decode(int fd, const char *dirname)
1658 1658
 	if (!line) {
1659 1659
 		goto abort;
1660 1660
 	}
1661
-	
1661
+
1662 1662
 	/* Calculate the length of the encoded string */
1663 1663
 	ptr += 4;
1664 1664
 	count = 0;
... ...
@@ -1670,88 +1752,36 @@ int html_screnc_decode(int fd, const char *dirname)
1670 1670
 				goto abort;
1671 1671
 			}
1672 1672
 		}
1673
-		tmpstr[count++] = *ptr;
1673
+		if(count < 6)
1674
+			tmpstr[count] = *ptr;
1675
+		count++;
1674 1676
 		ptr++;
1675
-	} while (count < 6);
1676
-	
1677
-	length = base64_chars[tmpstr[0]] << 2;
1678
-	length += base64_chars[tmpstr[1]] >> 4;
1679
-	length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
1680
-	length += (base64_chars[tmpstr[2]] >> 2) << 8;
1681
-	length += (base64_chars[tmpstr[2]] & 0x03) << 22;
1682
-	length += base64_chars[tmpstr[3]] << 16;
1683
-	length += (base64_chars[tmpstr[4]] << 2) << 24;
1684
-	length += (base64_chars[tmpstr[5]] >> 4) << 24;
1685
-
1686
-	/* Move forward 2 bytes */
1687
-	count = 2;
1688
-	state = HTML_SKIP_LENGTH;
1689
-
1690
-	while (length && line) {
1691
-		while (length && *ptr) {
1692
-			if ((*ptr == '\n') || (*ptr == '\r')) {
1693
-				ptr++;
1694
-				continue;
1695
-			}
1696
-			switch (state) {
1697
-			case HTML_SKIP_LENGTH:
1698
-				ptr++;
1699
-				count--;
1700
-				if (count == 0) {
1701
-					state = HTML_NORM;
1702
-				}
1703
-				break;
1704
-			case HTML_SPECIAL_CHAR:
1705
-				switch (*ptr) {
1706
-				case 0x21:
1707
-					html_output_c(&file_buff, 0x3c);
1708
-					break;
1709
-				/*case 0x23:
1710
-					html_output_c(&file_buff, 0x0d);
1711
-					break;
1712
-					we strip whitespace
1713
-					*/
1714
-				case 0x24:
1715
-					html_output_c(&file_buff, 0x40);
1716
-					break;
1717
-				/*case 0x26:
1718
-					html_output_c(&file_buff, 0x0a);
1719
-					break;
1720
-					we strip whitespace
1721
-					*/
1722
-				case 0x2a:
1723
-					html_output_c(&file_buff, 0x3e);
1724
-					break;
1725
-				}
1726
-				ptr++;
1727
-				length--;
1728
-				state = HTML_NORM;
1729
-				break;
1730
-			case HTML_NORM:	
1731
-				if (*ptr < 0x80) {
1732
-					result = decrypt_tables[table_order[table_pos]][*ptr];
1733
-					if (result == 0xFF) { /* special character */
1734
-						state = HTML_SPECIAL_CHAR;
1735
-					} else if(!isspace(result&0xff)) {
1736
-						html_output_c(&file_buff, tolower(result&0xff));
1737
-					}
1738
-				}
1739
-				ptr++;
1740
-				length--;
1741
-				table_pos = (table_pos + 1) % 64;
1742
-				break;
1743
-			}
1744
-		}
1677
+	} while (count < 8);
1678
+
1679
+	memset(&screnc_state, 0, sizeof(screnc_state));
1680
+	screnc_state.length = base64_chars[tmpstr[0]] << 2;
1681
+	screnc_state.length += base64_chars[tmpstr[1]] >> 4;
1682
+	screnc_state.length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
1683
+	screnc_state.length += (base64_chars[tmpstr[2]] >> 2) << 8;
1684
+	screnc_state.length += (base64_chars[tmpstr[2]] & 0x03) << 22;
1685
+	screnc_state.length += base64_chars[tmpstr[3]] << 16;
1686
+	screnc_state.length += (base64_chars[tmpstr[4]] << 2) << 24;
1687
+	screnc_state.length += (base64_chars[tmpstr[5]] >> 4) << 24;
1688
+
1689
+	while (screnc_state.length && line) {
1690
+		screnc_decode(ptr, &screnc_state);
1691
+		write(ofd, ptr, strlen(ptr));
1745 1692
 		free(line);
1746
-		if (length) {
1693
+		if (screnc_state.length) {
1747 1694
 			ptr = line = cli_readchunk(stream_in, NULL, 8192);
1748 1695
 		}
1749 1696
 	}
1697
+	if(screnc_state.length)
1698
+		cli_dbgmsg("html_screnc_decode: missing %lu bytes\n",screnc_state.length);
1750 1699
 	retval = TRUE;
1751
-						
1700
+
1752 1701
 abort:
1753 1702
 	fclose(stream_in);
1754
-	html_output_flush(&file_buff);
1755
-	close(file_buff.fd);
1703
+	close(ofd);
1756 1704
 	return retval;
1757 1705
 }