Browse code

re-write HTML code

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@849 77e5149b-7576-45b1-b177-96237e5ba77b

Trog authored on 2004/09/13 19:30:14
Showing 6 changed files
... ...
@@ -1,3 +1,11 @@
1
+Mon Sep 13 11:23:21 BST 2004 (trog)
2
+-----------------------------------
3
+  * libclamav: re-write HTML code:
4
+	- decode MS Script Encoder code
5
+	- doesn't require mmap(), uses it if available
6
+	- extract href tag values
7
+	- single pass parser
8
+
1 9
 Mon Sep 13 03:31:58 CEST 2004 (tk)
2 10
 ----------------------------------
3 11
   * libclamav: CL_BLOCKMAX: allow blocking (i.e. marking as viruses) of
... ...
@@ -62,6 +62,7 @@ static const struct cli_magic_s cli_magic[] = {
62 62
     {0,	    "MSCF",			4,  "MS CAB",		  CL_MSCABFILE},
63 63
     {0,	    "ITSF",			4,  "MS CHM",             CL_MSCHMFILE},
64 64
     {257,   "ustar",			5,  "POSIX tar",	  CL_TARFILE},
65
+    {0,     "#@~^",			4,  "SCRENC",		  CL_SCRENC},
65 66
 
66 67
     /* Mail */
67 68
 
... ...
@@ -37,6 +37,7 @@ typedef enum {
37 37
     CL_OLE2FILE,
38 38
     CL_MSCABFILE,
39 39
     CL_MSCHMFILE,
40
+    CL_SCRENC,
40 41
 
41 42
     /* bigger numbers have higher priority (in o-t-f detection) */
42 43
     CL_HTMLFILE, /* on the fly */
... ...
@@ -1,5 +1,10 @@
1 1
 /*
2
- *  Copyright (C) 2004 Trog <trog@clamav.net>
2
+ *  Normalise HTML text.
3
+ *  Decode MS Script Encoder protection. 
4
+ *
5
+ *  Copyright (C) 2004 trog@uncon.org
6
+ *
7
+ *  The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
3 8
  *
4 9
  *  This program is free software; you can redistribute it and/or modify
5 10
  *  it under the terms of the GNU General Public License as published by
... ...
@@ -16,284 +21,1026 @@
16 16
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 17
  */
18 18
 
19
-#if HAVE_CONFIG_H
20
-#include "clamav-config.h"
21
-#endif
22
-
23 19
 #include <stdio.h>
24
-#include <string.h>
25
-#include <ctype.h>
26 20
 #include <unistd.h>
27 21
 #include <sys/types.h>
28 22
 #include <sys/stat.h>
29 23
 #include <fcntl.h>
24
+#include <string.h>
25
+#include <errno.h>
26
+#include <stdio.h>
27
+
28
+#if HAVE_CONFIG_H
29
+#include "clamav-config.h"
30
+#endif
31
+
32
+#if HAVE_MMAP
33
+#if HAVE_SYS_MMAN_H
34
+#include <sys/mman.h>
35
+#else /* HAVE_SYS_MMAN_H */
36
+#undef HAVE_MMAP
37
+#endif
38
+#endif
30 39
 
31 40
 #include "others.h"
41
+#include "htmlnorm.h"
32 42
 
43
+#define HTML_STR_LENGTH 1024
33 44
 #define FALSE (0)
34 45
 #define TRUE (1)
35 46
 
36
-/* Normalize an HTML buffer using the following rules:
37
-	o Remove multiple contiguous spaces
38
-	o Remove spaces around '<' and '>' in tags
39
-	o Remove spaces around '=' in tags
40
-	o Replace single quote with double quote in tags
41
-	o Convert to lowercase
42
-	o Convert all white space to a space character
43
-*/
47
/*
 * States of the HTML normaliser's state machine; cli_html_normalise()
 * dispatches on one of these for every input character.
 *
 * HTML_BAD_STATE is the initial value of next_state and is what next_state
 * is reset to once it has been consumed; actually entering it is reported
 * as an engine error.  HTML_SKIP_WS, HTML_TRIM_WS and HTML_SKIP_LENGTH are
 * transient states that hand over to the saved next_state when done.
 *
 * NOTE(review): HTML_JS_DECODE (4) and HTML_JSDECODE (14) are distinct
 * values with near-identical names; only HTML_JSDECODE is used in the part
 * of the parser reviewed here - confirm whether HTML_JS_DECODE is needed.
 */
+typedef enum {
48
+    HTML_BAD_STATE=0,
49
+    HTML_NORM=1,
50
+    HTML_COMMENT=2,
51
+    HTML_CHAR_REF=3,
52
+    HTML_JS_DECODE=4,
53
+    HTML_SKIP_WS=5,
54
+    HTML_TRIM_WS=6,
55
+    HTML_TAG=7,
56
+    HTML_TAG_ARG=8,
57
+    HTML_TAG_ARG_VAL=9,
58
+    HTML_TAG_ARG_EQUAL=10,
59
+    HTML_PROCESS_TAG=11,
60
+    HTML_CHAR_REF_DECODE=12,
61
+    HTML_SKIP_LENGTH=13,
62
+    HTML_JSDECODE=14,
63
+    HTML_JSDECODE_LENGTH=15,
64
+    HTML_JSDECODE_DECRYPT=16,
65
+    HTML_SPECIAL_CHAR=17,
66
+} html_state;
67
+
68
/*
 * Quoting style of the attribute value currently being parsed.  The parser
 * starts every value as NOT_QUOTED and switches to SINGLE_QUOTED or
 * DOUBLE_QUOTED when the first character of the value is ' or ".
 */
+typedef enum {
69
+    SINGLE_QUOTED,
70
+    DOUBLE_QUOTED,
71
+    NOT_QUOTED,
72
+} quoted_state;
73
+
74
/*
 * In-memory input region read by cli_readline():
 *   buffer - start of the data
 *   length - total number of bytes available at buffer
 *   offset - read position; cli_readline() advances it by the number of
 *            bytes it returns
 */
+typedef struct m_area_tag {
75
+	unsigned char *buffer;
76
+	off_t length;
77
+	off_t offset;
78
+} m_area_t;
79
+
80
+#define HTML_FILE_BUFF_LEN 8192
81
+
82
/*
 * Buffered output file used by the html_output_*() helpers:
 *   fd     - descriptor the pending bytes are written to on flush
 *   buffer - pending output, HTML_FILE_BUFF_LEN bytes of storage
 *   length - number of pending bytes currently held in buffer
 */
+typedef struct file_buff_tag {
83
+	int fd;
84
+	unsigned char buffer[HTML_FILE_BUFF_LEN];
85
+	int length;
86
+} file_buff_t;
87
+
88
/*
 * Base64 decoding table indexed by byte value: 'A'-'Z' -> 0-25,
 * 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, '/' -> 63.  Every other
 * byte maps to -1.  Used to decode the length field that follows the
 * "#@~^" Script Encoder start marker.
 * NOTE(review): the length decoder reviewed here does not test for -1
 * before shifting the looked-up value - confirm how invalid bytes are
 * meant to be handled.
 */
+static const int base64_chars[256] = {
89
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
90
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
91
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
92
+    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
93
+    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
94
+    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
95
+    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
96
+    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
97
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
98
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
99
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
100
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
101
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
102
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
103
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
104
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
105
+};
106
+
107
/*
 * Script Encoder table selector: 64 entries, each 0, 1 or 2 (written with
 * a leading zero, i.e. as octal literals).  The decrypt state looks up
 * table_order[table_pos] to choose which row of decrypt_tables decodes the
 * current byte.
 */
+int table_order[] = {
108
+       00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
109
+       00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
110
+       00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
111
+       00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02
112
+};
44 113
 
45
-unsigned char *html_normalize(unsigned char *in_buff, off_t in_size)
114
/*
 * Script Encoder substitution tables: three tables of 128 entries, indexed
 * first by the table number taken from table_order and then by the encoded
 * byte (only bytes below 0x80 are looked up).  An entry of 0xFF - present
 * at index 0x40 ('@') in every table - marks an escape: the decoder then
 * interprets the following byte as a special character instead.
 */
+int decrypt_tables[3][128] = {
115
+      {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
116
+       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
117
+       0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
118
+       0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
119
+       0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
120
+       0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
121
+       0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
122
+       0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},
123
+
124
+      {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
125
+       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
126
+       0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
127
+       0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
128
+       0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
129
+       0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
130
+       0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
131
+       0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},
132
+
133
+      {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
134
+       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
135
+       0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
136
+       0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
137
+       0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
138
+       0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
139
+       0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
140
+       0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}
141
+};
142
+
143
+/* TODO: mmap support */
144
+static unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len)
46 145
 {
47
-	unsigned char *out_buff;
48
-	off_t out_size=0, i;
49
-	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
146
+	unsigned char *line, *ptr, *start, *end;
147
+	unsigned int line_len, count;
148
+	int fddup;
50 149
 
51
-	out_buff = (unsigned char *) cli_malloc(in_size+1);
52
-	if (!out_buff) {
53
-		cli_dbgmsg("html_normalize(): malloc failed\n");
150
+	line = (unsigned char *) malloc(max_len);
151
+	if (!line) {
54 152
 		return NULL;
55 153
 	}
56 154
 
57
-	for (i=0 ; i < in_size ; i++) {
58
-		if (in_buff[i] == '<') {
59
-			out_buff[out_size++] = '<';
60
-			tag_depth++;
61
-			if (tag_depth == 1) {
62
-				had_space=TRUE; /* consume spaces */
63
-			}
64
-		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
65
-			/* Remove preceeding spaces */
66
-			while ((out_size > 0) &&
67
-				(out_buff[out_size-1] == ' ')) {
68
-				out_size--;
155
+	/* Try and use the memory buffer first */
156
+	if (m_area) {
157
+		start = ptr = m_area->buffer + m_area->offset;
158
+		end = m_area->buffer + m_area->length;
159
+		if (start >= end) {
160
+			free(line);
161
+			return NULL;
162
+		}
163
+		line_len = 1;
164
+		while ((ptr < end) && (*ptr != '\n') && (line_len < (max_len-1))) {
165
+			ptr++;
166
+			line_len++;
167
+		}
168
+		if (ptr == end) {
169
+			line_len--;
170
+			memcpy(line, start, line_len);
171
+			line[line_len] = '\0';
172
+		} else if (*ptr == '\n') {
173
+			memcpy(line, start, line_len);
174
+			line[line_len] = '\0';
175
+		} else {
176
+			/* Hit max_len */
177
+			/* Store the current line end and length*/
178
+			count = line_len;
179
+			while (!isspace(*ptr) && (line_len > 1)) {
180
+				ptr--;
181
+				line_len--;
69 182
 			}
70
-			out_buff[out_size++] = '=';
71
-			had_space=TRUE;
72
-		} else if (isspace(in_buff[i])) {
73
-			if (!had_space) {
74
-				out_buff[out_size++] = ' ';
75
-				had_space=TRUE;
183
+			if (line_len == 1) {
184
+				line_len=count;
76 185
 			}
77
-		} else if (in_buff[i] == '>') {
78
-			/* Remove preceeding spaces */
79
-			if (tag_depth == 1) {
80
-				while ((out_size > 0) &&
81
-					(out_buff[out_size-1] == ' ')) {
82
-					out_size--;
186
+			memcpy(line, start, line_len);
187
+			line[line_len] = '\0';
188
+		}
189
+		m_area->offset += line_len;
190
+	} else {
191
+		if (!stream) {
192
+			cli_dbgmsg("No HTML stream\n");
193
+			free(line);
194
+			return NULL;
195
+		}
196
+		if (fgets(line, max_len, stream) == NULL) {
197
+			free(line);
198
+			return NULL;
199
+		}
200
+
201
+		line_len=strlen(line);
202
+		if (line_len == 0) {
203
+			free(line);
204
+			return NULL;
205
+		}
206
+		if (line_len == max_len-1) {
207
+			/* didn't find a whole line - rewind to a space*/
208
+			count = 0;
209
+			while (!isspace(line[--line_len])) {
210
+				count--;
211
+				if (line_len == 0) {
212
+					return line;
83 213
 				}
84 214
 			}
85
-			out_buff[out_size++] = '>';
86
-			tag_depth--;	
87
-		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
88
-			/* Convert single quotes to double quotes */
89
-			if (in_quote || out_buff[out_size-1] == '=') {
90
-				out_buff[out_size++] = '\"';
91
-				in_quote = !in_quote;
92
-			} else {
93
-				out_buff[out_size++] = '\'';
94
-			}
95
-		} else {
96
-			out_buff[out_size++] = tolower(in_buff[i]);
97
-			had_space=FALSE;
215
+			fseek(stream, count, SEEK_CUR);
216
+			line[line_len+1] = '\0';
98 217
 		}
99 218
 	}
100
-	out_buff[out_size] = '\0';
101
-	return out_buff;
219
+	return line;
102 220
 }
103 221
 
104
-/* Remove HTML style comments from buffer */
105
-unsigned char *remove_html_comments(unsigned char *line)
222
+static void html_output_flush(file_buff_t *fbuff)
106 223
 {
107
-	unsigned char *newline, *newcurrent;
108
-	int in_comment=FALSE;
109
-	
110
-	if (!line) {
111
-		return NULL;
224
+	if (fbuff && (fbuff->length > 0)) {
225
+		cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
226
+		fbuff->length = 0;
112 227
 	}
113
-	
114
-	newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);
115
-	if (!newline) {
116
-		return NULL;
228
+}
229
+
230
+static void html_output_c(file_buff_t *fbuff1, file_buff_t *fbuff2, unsigned char c)
231
+{
232
+	if (fbuff1) {
233
+		if (fbuff1->length == HTML_FILE_BUFF_LEN) {
234
+			html_output_flush(fbuff1);
235
+		}
236
+		fbuff1->buffer[fbuff1->length++] = c;
117 237
 	}
118
-	
119
-	while(line) {
120
-		if (!(in_comment)) {
121
-			while (*line && *line != '<') {
122
-				*newcurrent = *line;
123
-				newcurrent++;
124
-				line++;
125
-			}
126
-			if (! *line) {
127
-				break;
128
-			}
129
-			if (!line[1]) {
130
-				*newcurrent = *line;
131
-				newcurrent++;
132
-				line++;
133
-				continue;
134
-			}
135
-			if (line[1] == '!') {
136
-				in_comment = TRUE;
137
-				line += 1;
138
-			} else {
139
-				*newcurrent = *line;
140
-				newcurrent++;
141
-				line++;
142
-			}
143
-		} else {
144
-			while (*line && *line != '>') {
145
-				line++;
146
-			}
147
-			if (! *line) {
148
-				break;
149
-			}
150
-			in_comment = FALSE;
151
-			line++;
238
+	if (fbuff2) {
239
+		if (fbuff2->length == HTML_FILE_BUFF_LEN) {
240
+			html_output_flush(fbuff2);
152 241
 		}
242
+		fbuff2->buffer[fbuff2->length++] = c;
153 243
 	}
154
-	*newcurrent = '\0';
155
-	return newline;
156 244
 }
157 245
 
158
-/* Decode an HTML escape character into it's character value */
159
-unsigned int decode_html_char_ref(unsigned char *cref,
160
-                                    unsigned char *dest)
246
+static html_output_str(file_buff_t *fbuff, unsigned char *str, int len)
161 247
 {
162
-
163
-	unsigned int hex=FALSE, value=0, count=0;
164
-	
165
-	if (!cref[0] || !cref[1]) {
166
-		return 0;
248
+	if (fbuff) {
249
+		if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
250
+			html_output_flush(fbuff);
251
+		}
252
+		memcpy(fbuff->buffer + fbuff->length, str, len);
167 253
 	}
254
+}
255
+
256
+static char *html_tag_arg_value(tag_arguments_t *tags, char *tag)
257
+{
258
+	int i;
168 259
 	
169
-	if (((*cref == 'x') || (*cref == 'X')) && isxdigit(cref[1])) {
170
-		hex=TRUE;
171
-		cref++;
172
-		count++;
260
+	for (i=0; i < tags->count; i++) {
261
+		if (strcmp(tags->tag[i], tag) == 0) {
262
+			return tags->value[i];
263
+		}
173 264
 	}
265
+	return NULL;
266
+}
267
+
268
+static void html_tag_arg_set(tag_arguments_t *tags, char *tag, char *value)
269
+{
270
+	int i;
174 271
 	
175
-	while (isdigit(*cref) || (hex && isxdigit(*cref))) {
176
-		if (hex) {
177
-			value *= 16;
178
-		} else {
179
-			value *= 10;
272
+	for (i=0; i < tags->count; i++) {
273
+		if (strcmp(tags->tag[i], tag) == 0) {
274
+			free(tags->value[i]);
275
+			tags->value[i] = strdup(value);
276
+			return;
180 277
 		}
181
-		if (isdigit(*cref)) {
182
-			value += (*cref - '0');
278
+	}
279
+	return;
280
+}
281
+static void html_tag_arg_add(tag_arguments_t *tags,
282
+		unsigned char *tag, unsigned char *value)
283
+{
284
+	int len;
285
+	tags->count++;
286
+	tags->tag = (unsigned char **) realloc(tags->tag,
287
+				tags->count * sizeof(char *));
288
+	tags->value = (unsigned char **) realloc(tags->value,
289
+				tags->count * sizeof(char *));
290
+	if (!tags->tag || !tags->value) {
291
+		tags->count--;
292
+		return;
293
+	}
294
+	tags->tag[tags->count-1] = strdup(tag);
295
+	if (value) {
296
+		if (*value == '"') {
297
+			tags->value[tags->count-1] = strdup(value+1);
298
+			len = strlen(value+1);
299
+			if (len > 0) {
300
+				tags->value[tags->count-1][len-1] = '\0';
301
+			}
183 302
 		} else {
184
-			value += (tolower(*cref) - 'a' + 10);
303
+			tags->value[tags->count-1] = strdup(value);
185 304
 		}
186
-		cref++;
187
-		count++;
305
+	} else {
306
+		tags->value[tags->count-1] = NULL;
188 307
 	}
189
-	if (*cref == ';') {
190
-		cref++;
191
-		count++;
308
+}
309
+
310
+static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
311
+{
312
+	int i;
313
+
314
+	html_output_c(fbuff, NULL, '<');
315
+	html_output_str(fbuff, tag, strlen(tag));
316
+	for (i=0; i < tags->count; i++) {
317
+		html_output_c(fbuff, NULL, ' ');
318
+		html_output_str(fbuff, tags->tag[i], strlen(tags->tag[i]));
319
+		if (tags->value[i]) {
320
+			html_output_str(fbuff, "=\"", 2);
321
+			html_output_str(fbuff, tags->value[i], strlen(tags->value[i]));
322
+			html_output_c(fbuff, NULL, '"');
323
+		}
192 324
 	}
325
+	html_output_c(fbuff, NULL, '>');
326
+}
327
+
328
+void html_tag_arg_free(tag_arguments_t *tags)
329
+{
330
+	int i;
193 331
 	
194
-	*dest = value;
195
-	
196
-	return count;
332
+	for (i=0; i < tags->count; i++) {
333
+		free(tags->tag[i]);
334
+		if (tags->value[i]) {
335
+			free(tags->value[i]);
336
+		}
337
+	}
338
+	if (tags->tag) {
339
+		free(tags->tag);
340
+	}
341
+	if (tags->value) {
342
+		free(tags->value);
343
+	}
344
+	tags->tag = tags->value = NULL;
345
+	tags->count = 0;
197 346
 }
198 347
 
199
-/* Remove HTML character escape sequences from buffer */
200
-unsigned char *remove_html_char_ref(unsigned char *line)
348
+static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs)
201 349
 {
202
-	unsigned char *newline, *newcurrent;
203
-	unsigned char *linepos, count;
350
+	int fd_tmp, tag_length, tag_arg_length;
351
+	int retval=FALSE, escape, value, hex, tag_val_length, table_pos, in_script=FALSE;
352
+	FILE *stream_in;
353
+	html_state state=HTML_NORM, next_state=HTML_BAD_STATE;
354
+	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
355
+	char tag_val[HTML_STR_LENGTH+1];
356
+	unsigned char *line, *ptr, *arg_value;
357
+	tag_arguments_t tag_args;
358
+	quoted_state quoted;
359
+	unsigned long length;
360
+	file_buff_t file_buff_o1, file_buff_o2,  file_buff_script;
204 361
 	
205
-	if (!line) {
206
-		return NULL;
362
+	if (!m_area) {
363
+		if (fd < 0) {
364
+			cli_dbgmsg("Invalid HTML fd\n");
365
+			return FALSE;
366
+		}
367
+		lseek(fd, 0, SEEK_SET);	
368
+		fd_tmp = dup(fd);
369
+		if (fd_tmp < 0) {
370
+			return FALSE;
371
+		}
372
+		stream_in = fdopen(fd_tmp, "r");
373
+		if (!stream_in) {
374
+			close(fd_tmp);
375
+			return FALSE;
376
+		}
207 377
 	}
208 378
 	
209
-	newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);
210
-	if (!newline) {
211
-		return NULL;
379
+	if (dirname) {
380
+		snprintf(filename, 1024, "%s/comment.html", dirname);
381
+		file_buff_o1.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRWXU);
382
+		if (!file_buff_o1.fd) {
383
+			cli_dbgmsg("open failed: %s\n", filename);
384
+			fclose(stream_in);
385
+			return FALSE;
386
+		}
387
+
388
+		snprintf(filename, 1024, "%s/nocomment.html", dirname);
389
+		file_buff_o2.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRWXU);
390
+		if (!file_buff_o2.fd) {
391
+			cli_dbgmsg("open failed: %s\n", filename);
392
+			close(file_buff_o1.fd);
393
+			fclose(stream_in);
394
+			return FALSE;
395
+		}
396
+
397
+		snprintf(filename, 1024, "%s/script.html", dirname);
398
+		file_buff_script.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRWXU);
399
+		if (!file_buff_script.fd) {
400
+			cli_dbgmsg("open failed: %s\n", filename);
401
+			close(file_buff_o1.fd);
402
+			close(file_buff_o2.fd);
403
+			fclose(stream_in);
404
+			return FALSE;
405
+		}
406
+
407
+		file_buff_o1.length = 0;
408
+		file_buff_o2.length = 0;
409
+		file_buff_script.length = 0;
410
+	} else {
411
+		file_buff_o1 = NULL;
412
+		file_buff_o2 = NULL;
413
+		file_buff_script = NULL;
212 414
 	}
415
+	
416
+	tag_args.count = 0;
417
+	tag_args.tag = NULL;
418
+	tag_args.value = NULL;
419
+		
420
+	ptr = line = cli_readline(stream_in, m_area, 8192);
213 421
 	while (line) {
214
-		linepos = strchr(line, '&');
215
-		if (!linepos) {
216
-			strcpy(newcurrent, line);
217
-			return newline;
422
+		while (*ptr && isspace(*ptr)) {
423
+			ptr++;
218 424
 		}
219
-		strncpy(newcurrent, line, linepos-line);
220
-		newcurrent += linepos-line;
221
-
222
-		if (!linepos[1] || !linepos[2]) {
223
-			*newcurrent = '&';
224
-			newcurrent++;
225
-			line = linepos+1;
226
-			continue;
227
-		}
228
-		switch (linepos[1]) {
229
-		case '#':
230
-			count = decode_html_char_ref(linepos+2,
231
-					newcurrent);
232
-			if (count > 0) {
233
-				newcurrent++;
234
-				linepos += count+2;
235
-			} else {
236
-				*newcurrent = '&';
237
-				newcurrent++;
238
-				linepos++;
425
+		while (*ptr) {
426
+			if (*ptr == '\n') {
427
+				if (state == HTML_COMMENT) {
428
+					html_output_c(&file_buff_o1, NULL, ' ');
429
+				} else if ((state != HTML_SKIP_WS) && 
430
+						(state != HTML_TRIM_WS) &&
431
+						(state != HTML_PROCESS_TAG)) {
432
+					html_output_c(&file_buff_o1, &file_buff_o2, ' ');
433
+				}
434
+				ptr++;
435
+				continue;
436
+			}
437
+			if (*ptr == '\r') {
438
+				ptr++;
439
+				continue;
440
+			}
441
+			switch (state) {
442
+			case HTML_BAD_STATE:
443
+				/* An engine error has occurred */
444
+				cli_dbgmsg("HTML Engine Error\n");
445
+				goto abort;
446
+			case HTML_SKIP_LENGTH:
447
+				length--;
448
+				ptr++;
449
+				if (!length) {
450
+					state = next_state;
451
+				}
452
+				break;
453
+			case HTML_SKIP_WS:
454
+				if (isspace(*ptr)) {
455
+					ptr++;
456
+				} else {
457
+					state = next_state;
458
+					next_state = HTML_BAD_STATE;
459
+				}
460
+				break;
461
+			case HTML_TRIM_WS:
462
+				if (isspace(*ptr)) {
463
+					ptr++;
464
+				} else {
465
+					html_output_c(&file_buff_o1, &file_buff_o2, ' ');
466
+					state = next_state;
467
+					next_state = HTML_BAD_STATE;
468
+				}
469
+				break;
470
+			case HTML_NORM:
471
+				if (*ptr == '<') {
472
+					html_output_c(&file_buff_o1, &file_buff_o2, '<');
473
+					if (in_script) {
474
+						html_output_c(&file_buff_script, NULL, '<');
475
+					}
476
+					ptr++;
477
+					state = HTML_SKIP_WS;
478
+					tag_length=0;
479
+					next_state = HTML_TAG;
480
+				} else if (isspace(*ptr)) {
481
+					state = HTML_TRIM_WS;
482
+					next_state = HTML_NORM;
483
+				} else if (*ptr == '&') {
484
+					state = HTML_CHAR_REF;
485
+					next_state = HTML_NORM;
486
+					ptr++;
487
+				} else {
488
+					html_output_c(&file_buff_o1, &file_buff_o2, tolower(*ptr));
489
+					if (in_script) {
490
+						html_output_c(&file_buff_script, NULL, tolower(*ptr));
491
+					}
492
+					ptr++;
493
+				}
494
+				break;
495
+			case HTML_TAG:
496
+				if ((tag_length == 0) && (*ptr == '!')) {
497
+					/* Comment */
498
+					html_output_c(&file_buff_o1, NULL, '!');
499
+					if (in_script) {
500
+						html_output_c(&file_buff_script, NULL, '!');
501
+					}
502
+					/* Need to rewind in the no-comment output stream */
503
+					if (file_buff_o2.length > 0) {
504
+						file_buff_o2.length--;
505
+					}
506
+					state = HTML_COMMENT;
507
+					next_state = HTML_BAD_STATE;
508
+					ptr++;
509
+				} else if (*ptr == '>') {
510
+					html_output_c(&file_buff_o1, &file_buff_o2, '>');
511
+					if (in_script) {
512
+						html_output_c(&file_buff_script, NULL, '>');
513
+					}
514
+					ptr++;
515
+					tag[tag_length] = '\0';
516
+					state = HTML_SKIP_WS;
517
+					next_state = HTML_PROCESS_TAG;
518
+				} else if (!isspace(*ptr)) {
519
+					html_output_c(&file_buff_o1, &file_buff_o2, tolower(*ptr));
520
+					if (in_script) {
521
+						html_output_c(&file_buff_script, NULL, tolower(*ptr));
522
+					}
523
+					if (tag_length < HTML_STR_LENGTH) {
524
+						tag[tag_length++] = tolower(*ptr);
525
+					}
526
+					ptr++;
527
+				}  else {
528
+					tag[tag_length] = '\0';
529
+					state = HTML_SKIP_WS;
530
+					tag_arg_length = 0;
531
+					next_state = HTML_TAG_ARG;
532
+				}
533
+				break;
534
+			case HTML_TAG_ARG:
535
+				if (*ptr == '=') {
536
+					html_output_c(&file_buff_o1, &file_buff_o2, '=');
537
+					tag_arg[tag_arg_length] = '\0';
538
+					ptr++;
539
+					state = HTML_SKIP_WS;
540
+					escape = FALSE;
541
+					quoted = NOT_QUOTED;
542
+					tag_val_length = 0;
543
+					next_state = HTML_TAG_ARG_VAL;
544
+				} else if (isspace(*ptr)) {
545
+					ptr++;
546
+					tag_arg[tag_arg_length] = '\0';
547
+					state = HTML_SKIP_WS;
548
+					next_state = HTML_TAG_ARG_EQUAL;
549
+				} else if (*ptr == '>') {
550
+					html_output_c(&file_buff_o1, &file_buff_o2, '>');
551
+					if (tag_arg_length > 0) {
552
+						tag_arg[tag_arg_length] = '\0';
553
+						html_tag_arg_add(&tag_args, tag_arg, NULL);
554
+					}
555
+					ptr++;
556
+					state = HTML_PROCESS_TAG;
557
+					next_state = HTML_BAD_STATE;
558
+				} else {
559
+					if (tag_arg_length == 0) {
560
+						/* Start of new tag - add space */
561
+						html_output_c(&file_buff_o1, &file_buff_o2,' ');
562
+					}
563
+					html_output_c(&file_buff_o1, &file_buff_o2, tolower(*ptr));
564
+					if (tag_arg_length < HTML_STR_LENGTH) {
565
+						tag_arg[tag_arg_length++] = tolower(*ptr);
566
+					}
567
+					ptr++;
568
+				}
569
+				break;
570
+			case HTML_TAG_ARG_EQUAL:
571
+				if (*ptr == '=') {
572
+					html_output_c(&file_buff_o1, &file_buff_o2, '=');
573
+					ptr++;
574
+					state = HTML_SKIP_WS;
575
+					escape = FALSE;
576
+					quoted = NOT_QUOTED;
577
+					tag_val_length = 0;
578
+					next_state = HTML_TAG_ARG_VAL;
579
+				} else {
580
+					if (tag_arg_length > 0) {
581
+						tag_arg[tag_arg_length] = '\0';
582
+						html_tag_arg_add(&tag_args, tag_arg, NULL);
583
+					}
584
+					tag_arg_length=0;
585
+					state = HTML_TAG_ARG;
586
+					next_state = HTML_BAD_STATE;
587
+				}
588
+				break;
589
+			case HTML_TAG_ARG_VAL:
590
+				if (*ptr == '&') {
591
+					state = HTML_CHAR_REF;
592
+					next_state = HTML_TAG_ARG_VAL;
593
+					ptr++;
594
+				} else if (*ptr == '\'') {
595
+					if (tag_val_length == 0) {
596
+						quoted = SINGLE_QUOTED;
597
+						html_output_c(&file_buff_o1, &file_buff_o2, '"');
598
+						if (tag_val_length < HTML_STR_LENGTH) {
599
+							tag_val[tag_val_length++] = '"';
600
+						}
601
+						ptr++;
602
+					} else {
603
+						if (!escape && (quoted==SINGLE_QUOTED)) {
604
+							html_output_c(&file_buff_o1, &file_buff_o2, '"');
605
+							if (tag_val_length < HTML_STR_LENGTH) {
606
+								tag_val[tag_val_length++] = '"';
607
+							}
608
+							tag_val[tag_val_length] = '\0';
609
+							html_tag_arg_add(&tag_args, tag_arg, tag_val);
610
+							ptr++;
611
+							state = HTML_SKIP_WS;
612
+							tag_arg_length=0;
613
+							next_state = HTML_TAG_ARG;
614
+						} else {
615
+							html_output_c(&file_buff_o1, &file_buff_o2, '"');
616
+							if (tag_val_length < HTML_STR_LENGTH) {
617
+								tag_val[tag_val_length++] = '"';
618
+							}
619
+							ptr++;
620
+						}
621
+					}
622
+				} else if (*ptr == '"') {
623
+					if (tag_val_length == 0) {
624
+						quoted = DOUBLE_QUOTED;
625
+						html_output_c(&file_buff_o1, &file_buff_o2, '"');
626
+						if (tag_val_length < HTML_STR_LENGTH) {
627
+							tag_val[tag_val_length++] = '"';
628
+						}
629
+						ptr++;
630
+					} else {
631
+						if (!escape && (quoted==DOUBLE_QUOTED)) {					
632
+							html_output_c(&file_buff_o1, &file_buff_o2, '"');
633
+							if (tag_val_length < HTML_STR_LENGTH) {
634
+								tag_val[tag_val_length++] = '"';
635
+							}
636
+							tag_val[tag_val_length] = '\0';
637
+							html_tag_arg_add(&tag_args, tag_arg, tag_val);
638
+							ptr++;
639
+							state = HTML_SKIP_WS;
640
+							tag_arg_length=0;
641
+							next_state = HTML_TAG_ARG;
642
+						} else {
643
+							html_output_c(&file_buff_o1, &file_buff_o2, '"');
644
+							if (tag_val_length < HTML_STR_LENGTH) {
645
+								tag_val[tag_val_length++] = '"';
646
+							}
647
+							ptr++;
648
+						}
649
+					}
650
+				} else if (isspace(*ptr) || (*ptr == '>')) {
651
+					if (quoted == NOT_QUOTED) {
652
+						tag_val[tag_val_length] = '\0';
653
+						html_tag_arg_add(&tag_args, tag_arg, tag_val);
654
+						state = HTML_SKIP_WS;
655
+						tag_arg_length=0;
656
+						next_state = HTML_TAG_ARG;
657
+					} else {
658
+						html_output_c(&file_buff_o1, &file_buff_o2, *ptr);
659
+						if (tag_val_length < HTML_STR_LENGTH) {
660
+							if (isspace(*ptr)) {
661
+								tag_val[tag_val_length++] = ' ';
662
+							} else {
663
+								tag_val[tag_val_length++] = '>';
664
+							}
665
+						}
666
+						state = HTML_SKIP_WS;
667
+						escape = FALSE;
668
+						quoted = NOT_QUOTED;
669
+						next_state = HTML_TAG_ARG_VAL;
670
+						ptr++;
671
+					}
672
+				} else {
673
+					html_output_c(&file_buff_o1, &file_buff_o2, tolower(*ptr));
674
+					if (tag_val_length < HTML_STR_LENGTH) {
675
+						tag_val[tag_val_length++] = tolower(*ptr);
676
+					}
677
+					ptr++;
678
+				}
679
+				
680
+				if (*ptr == '\\') {
681
+					escape = TRUE;
682
+				} else {
683
+					escape = FALSE;
684
+				}
685
+				break;
686
+			case HTML_COMMENT:
687
+				html_output_c(&file_buff_o1, NULL, tolower(*ptr));
688
+				if (in_script) {
689
+					html_output_c(&file_buff_script, NULL, tolower(*ptr));
690
+				}
691
+				if (*ptr == '>') {
692
+					state = HTML_SKIP_WS;
693
+					next_state = HTML_NORM;	
694
+				}
695
+				ptr++;
696
+				break;
697
+			case HTML_PROCESS_TAG:
698
+				
699
+				/* Default to no action for this tag */
700
+				state = HTML_SKIP_WS;
701
+				next_state = HTML_NORM;
702
+				if (tag[0] == '/') {
703
+					/* End tag */
704
+					state = HTML_SKIP_WS;
705
+					next_state = HTML_NORM;
706
+					if (strcmp(tag, "/script") == 0) {
707
+						in_script=FALSE;
708
+						html_output_c(&file_buff_script, NULL, '\n');
709
+					}
710
+				} else if (strcmp(tag, "script") == 0) {
711
+					arg_value = html_tag_arg_value(&tag_args, "language");
712
+					if (arg_value && (strcmp(arg_value, "jscript.encode") == 0)) {
713
+						html_tag_arg_set(&tag_args, "language", "javascript");
714
+						state = HTML_SKIP_WS;
715
+						next_state = HTML_JSDECODE;
716
+					} else if (arg_value && (strcmp(arg_value, "vbscript.encode") == 0)) {
717
+						html_tag_arg_set(&tag_args, "language", "vbscript");
718
+						state = HTML_SKIP_WS;
719
+						next_state = HTML_JSDECODE;
720
+					} else {
721
+						in_script = TRUE;
722
+					}
723
+					html_output_tag(&file_buff_script, tag, &tag_args);
724
+				} else if (hrefs && strcmp(tag, "a") == 0) {
725
+					arg_value = html_tag_arg_value(&tag_args, "href");
726
+					if (strlen(arg_value) > 0) {
727
+						html_tag_arg_add(hrefs, "href", arg_value);
728
+					}
729
+				}
730
+				html_tag_arg_free(&tag_args);
731
+				break;
732
+			case HTML_CHAR_REF:
733
+				if (*ptr == '#') {
734
+					value = 0;
735
+					hex = FALSE;
736
+					state = HTML_CHAR_REF_DECODE;
737
+					ptr++;
738
+				} else {
739
+					html_output_c(&file_buff_o1, &file_buff_o2, '&');
740
+					state = next_state;
741
+					next_state = HTML_BAD_STATE;
742
+				}
743
+				break;
744
+			case HTML_CHAR_REF_DECODE:
745
+				if ((value==0) && ((*ptr == 'x') || (*ptr == 'X'))) {
746
+					hex=TRUE;
747
+					ptr++;
748
+				} else if (*ptr == ';') {
749
+					html_output_c(&file_buff_o1, &file_buff_o2, value);
750
+					state = next_state;
751
+					next_state = HTML_BAD_STATE;
752
+					ptr++;
753
+				} else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
754
+					if (hex) {
755
+						value *= 16;
756
+					} else {
757
+						value *= 10;
758
+					}
759
+					if (isdigit(*ptr)) {
760
+						value += (*ptr - '0');
761
+					} else {
762
+						value += (tolower(*ptr) - 'a' + 10);
763
+					}
764
+					ptr++;
765
+				} else {
766
+					html_output_c(&file_buff_o1, &file_buff_o2, value);
767
+					state = next_state;
768
+					next_state = HTML_BAD_STATE;
769
+				}
770
+				break;
771
+			case HTML_JSDECODE:
772
+				/* Check for start marker */
773
+				if (strncmp(ptr, "#@~^", 4) == 0) {
774
+					ptr += 4;
775
+					state = HTML_JSDECODE_LENGTH;
776
+					next_state = HTML_BAD_STATE;
777
+				} else {
778
+					html_output_c(&file_buff_o1, &file_buff_o2, tolower(*ptr));
779
+					html_output_c(&file_buff_script, NULL, tolower(*ptr));
780
+					ptr++;
781
+				}
782
+				break;
783
+			case HTML_JSDECODE_LENGTH:
784
+				if (strlen(ptr) < 8) {
785
+					state = HTML_NORM;
786
+					next_state = HTML_BAD_STATE;
787
+					break;
788
+				}
789
+				length = base64_chars[ptr[0]] << 2;
790
+				length += base64_chars[ptr[1]] >> 4;
791
+				length += (base64_chars[ptr[1]] & 0x0f) << 12;
792
+				length += (base64_chars[ptr[2]] >> 2) << 8;
793
+				length += (base64_chars[ptr[2]] & 0x03) << 22;
794
+				length += base64_chars[ptr[3]] << 16;
795
+				length += (base64_chars[ptr[4]] << 2) << 24;
796
+				length += (base64_chars[ptr[5]] >> 4) << 24;
797
+				table_pos = 0;
798
+				state = HTML_JSDECODE_DECRYPT;
799
+				next_state = HTML_BAD_STATE;
800
+				ptr += 8;
801
+				break;
802
+			case HTML_JSDECODE_DECRYPT:
803
+				if (length == 0) {
804
+					html_output_str(&file_buff_script, "</script>\n", 10);
805
+					length = 12;
806
+					state = HTML_SKIP_LENGTH;
807
+					next_state = HTML_NORM;
808
+					break;
809
+				}
810
+				if (*ptr < 0x80) {
811
+					value = decrypt_tables[table_order[table_pos]][*ptr];
812
+					if (value == 0xFF) { /* special character */
813
+						ptr++;
814
+						length--;
815
+						switch (*ptr) {
816
+						case '\0':
817
+							/* Fixup for end of line */
818
+							ptr--;
819
+							break;
820
+						case 0x21:
821
+							html_output_c(&file_buff_o1, &file_buff_o2, 0x3c);
822
+							html_output_c(&file_buff_script, NULL, 0x3c);
823
+							break;
824
+						case 0x23:
825
+							html_output_c(&file_buff_o1, &file_buff_o2, 0x0d);
826
+							html_output_c(&file_buff_script, NULL, 0x0d);
827
+							break;
828
+						case 0x24:
829
+							html_output_c(&file_buff_o1, &file_buff_o2, 0x40);
830
+							html_output_c(&file_buff_script, NULL, 0x40);
831
+							break;				
832
+						case 0x26:
833
+							html_output_c(&file_buff_o1, &file_buff_o2, 0x0a);
834
+							html_output_c(&file_buff_script, NULL, 0x0a);
835
+							break;
836
+						case 0x2a:
837
+							html_output_c(&file_buff_o1, &file_buff_o2, 0x3e);
838
+							html_output_c(&file_buff_script, NULL, 0x3e);
839
+							break;
840
+						}
841
+					} else {
842
+						html_output_c(&file_buff_o1, &file_buff_o2, value);
843
+						html_output_c(&file_buff_script, NULL, tolower(value));
844
+					}
845
+				}
846
+				table_pos = (table_pos + 1) % 64;
847
+				ptr++;
848
+				length--;
849
+				break;
239 850
 			}
240
-			break;
241
-		/* TODO: character entities, &amp; etc. */
242
-		default:
243
-			*newcurrent = '&';
244
-			newcurrent++;
245
-			linepos++;
246 851
 		}
247
-		line = linepos;
852
+		free(line);
853
+		ptr = line = cli_readline(stream_in, m_area, 8192);
854
+	}
855
+	
856
+	retval = TRUE;
857
+abort:
858
+	html_tag_arg_free(&tag_args);
859
+	if (!m_area) {
860
+		fclose(stream_in);
248 861
 	}
249
-	*newcurrent = '\0';
250
-	return newline;
862
+	html_output_flush(&file_buff_o1);
863
+	html_output_flush(&file_buff_o2);
864
+	html_output_flush(&file_buff_script);
865
+	close(file_buff_o1.fd);
866
+	close(file_buff_o2.fd);
867
+	close(file_buff_script.fd);
868
+	return retval;
869
+}
870
+
871
+int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs)
872
+{
873
+	m_area_t m_area;
874
+	
875
+	m_area.buffer = in_buff;
876
+	m_area.length = in_size;
877
+	m_area.offset = 0;
878
+	
879
+	cli_html_normalise(-1, &m_area, dirname, hrefs);
251 880
 }
252 881
 
253
-int char2hex(unsigned char c)
882
+int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs)
254 883
 {
255
-	if ((c-'0') <= 9) {
256
-		return (c-'0');
257
-	} else if ((c-'A') <= 5) {
258
-		return (c-'A'+10);
884
+#if HAVE_MMAP
885
+	int retval=FALSE;
886
+	m_area_t m_area;
887
+	struct stat statbuf;
888
+	
889
+	if (fstat(fd, &statbuf) == 0) {
890
+		m_area.length = statbuf.st_size;
891
+		m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0);
892
+		m_area.offset = 0;
893
+		if (m_area.buffer == MAP_FAILED) {
894
+			cli_dbgmsg("mmap HTML failed\n");
895
+			retval = cli_html_normalise(fd, NULL, dirname, hrefs);
896
+		} else {
897
+			cli_dbgmsg("mmap'ed file\n");
898
+			retval = cli_html_normalise(-1, &m_area, dirname, hrefs);
899
+			munmap(m_area.buffer, m_area.length);
900
+		}
901
+	} else {
902
+		cli_dbgmsg("fstat HTML failed\n");
903
+		retval = cli_html_normalise(fd, NULL, dirname, hrefs);
259 904
 	}
260
-	return (c-'a'+10);
905
+	return retval;
906
+#else
907
+	return cli_html_normalise(fd, NULL, dirname, hrefs);
908
+#endif
261 909
 }
262 910
 
263
-char *quoted_decode(unsigned char *line, off_t in_size)
911
+int html_screnc_decode(int fd, const char *dirname)
264 912
 {
265
-	unsigned char *newline, *newcurrent, *line_end;
913
+	int fd_tmp, table_pos=0, result, count, state, retval=FALSE;
914
+	unsigned char *line, tmpstr[6];
915
+	unsigned long length;
916
+	unsigned char *ptr, filename[1024];
917
+	FILE *stream_in;
918
+	file_buff_t file_buff;
266 919
 	
267
-	newcurrent = newline = (unsigned char *) cli_malloc(in_size + 1);
268
-	if (!newline) {
269
-		return NULL;
920
+	lseek(fd, 0, SEEK_SET);	
921
+	fd_tmp = dup(fd);
922
+	if (fd_tmp < 0) {
923
+		return FALSE;
924
+	}
925
+	stream_in = fdopen(fd_tmp, "r");
926
+	if (!stream_in) {
927
+		close(fd_tmp);
928
+		return FALSE;
929
+	}
930
+	
931
+	snprintf(filename, 1024, "%s/screnc.html", dirname);
932
+	file_buff.fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRWXU);
933
+	if (!file_buff.fd) {
934
+		cli_dbgmsg("open failed: %s\n", filename);
935
+		fclose(stream_in);
936
+		return FALSE;
270 937
 	}
271 938
 	
272
-	line_end = line+in_size;
273
-	while (line <= line_end) {
274
-		while ((line < line_end) && *line != '=') {
275
-			*newcurrent = *line;
276
-			line++;
277
-			newcurrent++;
939
+	while ((line = cli_readline(stream_in, NULL, 8192)) != NULL) {
940
+		ptr = strstr(line, "#@~^");
941
+		if (ptr) {
942
+			break;
278 943
 		}
279
-		if ((line < line_end) && isspace(line[1])) {
280
-			line++;
281
-			while ((line < line_end) && isspace(*line)) {
282
-				line++;
944
+		free(line);
945
+        }
946
+	if (!line) {
947
+		goto abort;
948
+	}
949
+	
950
+	/* Calculate the length of the encoded string */
951
+	ptr += 4;
952
+	count = 0;
953
+	do {
954
+		if (! *ptr) {
955
+			free(line);
956
+			ptr = line = cli_readline(stream_in, NULL, 8192);
957
+			if (!line) {
958
+				goto abort;
283 959
 			}
284
-			continue;
285 960
 		}
286
-		if ((line+2) <= line_end) {
287
-			if (isxdigit(line[1]) && isxdigit(line[2])) {
288
-				*newcurrent = 	(char2hex(line[1]) * 16) +
289
-						char2hex(line[2]);
290
-				newcurrent++;
291
-				line += 3;
961
+		tmpstr[count++] = *ptr;
962
+		ptr++;
963
+	} while (count < 6);
964
+	
965
+	length = base64_chars[tmpstr[0]] << 2;
966
+	length += base64_chars[tmpstr[1]] >> 4;
967
+	length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
968
+	length += (base64_chars[tmpstr[2]] >> 2) << 8;
969
+	length += (base64_chars[tmpstr[2]] & 0x03) << 22;
970
+	length += base64_chars[tmpstr[3]] << 16;
971
+	length += (base64_chars[tmpstr[4]] << 2) << 24;
972
+	length += (base64_chars[tmpstr[5]] >> 4) << 24;
973
+
974
+	/* Move forward 2 bytes */
975
+	count = 2;
976
+	state = HTML_SKIP_LENGTH;
977
+
978
+	while (length && line) {
979
+		while (length && *ptr) {
980
+			if ((*ptr == '\n') || (*ptr == '\r')) {
981
+				ptr++;
292 982
 				continue;
293 983
 			}
984
+			switch (state) {
985
+			case HTML_SKIP_LENGTH:
986
+				ptr++;
987
+				count--;
988
+				if (count == 0) {
989
+					state = HTML_NORM;
990
+				}
991
+				break;
992
+			case HTML_SPECIAL_CHAR:
993
+				switch (*ptr) {
994
+				case 0x21:
995
+					html_output_c(&file_buff, NULL, 0x3c);
996
+					break;
997
+				case 0x23:
998
+					html_output_c(&file_buff, NULL, 0x0d);
999
+					break;
1000
+				case 0x24:
1001
+					html_output_c(&file_buff, NULL, 0x40);
1002
+					break;				
1003
+				case 0x26:
1004
+					html_output_c(&file_buff, NULL, 0x0a);
1005
+					break;
1006
+				case 0x2a:
1007
+					html_output_c(&file_buff, NULL, 0x3e);
1008
+					break;
1009
+				}
1010
+				ptr++;
1011
+				length--;
1012
+				state = HTML_NORM;
1013
+				break;
1014
+			case HTML_NORM:	
1015
+				if (*ptr < 0x80) {
1016
+					result = decrypt_tables[table_order[table_pos]][*ptr];
1017
+					if (result == 0xFF) { /* special character */
1018
+						state = HTML_SPECIAL_CHAR;
1019
+					} else {
1020
+						html_output_c(&file_buff, NULL, (char)result);
1021
+					}
1022
+				}
1023
+				ptr++;
1024
+				length--;
1025
+				table_pos = (table_pos + 1) % 64;
1026
+				break;
1027
+			}
1028
+		}
1029
+		free(line);
1030
+		if (length) {
1031
+			ptr = line = cli_readline(stream_in, NULL, 8192);
294 1032
 		}
295
-		line++;	
296 1033
 	}
297
-	*newcurrent = '\0';
298
-	return newline;
1034
+	retval = TRUE;
1035
+						
1036
+abort:
1037
+	fclose(stream_in);
1038
+	html_output_flush(&file_buff);
1039
+	close(file_buff.fd);
1040
+	return retval;
299 1041
 }
... ...
@@ -15,15 +15,21 @@
15 15
  *  along with this program; if not, write to the Free Software
16 16
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 17
  */
18
-
18
+ 
19 19
 #ifndef __HTMLNORM_H
20 20
 #define __HTMLNORM_H
21 21
 
22
-#include <sys/types.h>
23
-
24
-unsigned char *html_normalize(unsigned char *in_buff, off_t in_size);
25
-unsigned char *remove_html_comments(unsigned char *line);
26
-unsigned char *remove_html_char_ref(unsigned char *line);
27
-char *quoted_decode(unsigned char *line, off_t in_size);
22
/* off_t appears in the html_normalise_mem() prototype below; include its
 * defining header here so this header can be included on its own. */
#include <sys/types.h>

/*
 * A list of tag arguments stored as two arrays of strings, `tag` and `value`,
 * sized by `count`.
 * NOTE(review): presumably tag[i] / value[i] form the i-th name/value pair and
 * count is the number of pairs -- confirm against html_tag_arg_add().
 */
typedef struct tag_arguments_tag
{
        int count;
        unsigned char **tag;
        unsigned char **value;
} tag_arguments_t;

/* Normalise HTML held in memory (in_buff, in_size); output goes under dirname. */
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs);
/* Normalise HTML read from descriptor fd; output goes under dirname. */
int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs);
/* Release the contents of a tag_arguments_t. */
void html_tag_arg_free(tag_arguments_t *tags);
/* Decode a Script Encoder stream from fd into dirname/screnc.html. */
int html_screnc_decode(int fd, const char *dirname);
33
+ 
29 34
 #endif
35
+
... ...
@@ -680,55 +680,58 @@ static int cli_scanmscab(int desc, const char **virname, long int *scanned, cons
680 680
 
681 681
 static int cli_scanhtml(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *arec, int *mrec)
682 682
 {
683
-	unsigned char *membuff, *newbuff, *newbuff2;
684
-	struct stat statbuf;
685
-	int ret;
683
+	char *tempname, fullname[1024];
684
+	const char *tmpdir;
685
+	int ret=CL_CLEAN, fd;
686 686
 
687 687
 
688
-#ifdef HAVE_MMAP
689 688
     cli_dbgmsg("in cli_scanhtml()\n");
690 689
 
691
-    if(fstat(desc, &statbuf) != 0) {
692
-	cli_dbgmsg("HTML: Can't stat descriptor %d\n", desc);
693
-        return CL_EIO;
690
+    if((tmpdir = getenv("TMPDIR")) == NULL)
691
+#ifdef P_tmpdir
692
+        tmpdir = P_tmpdir;
693
+#else
694
+        tmpdir = "/tmp";
695
+#endif
696
+                                                                                                                                           
697
+    tempname = cli_gentemp(tmpdir);
698
+                                                                                                                                           
699
+    if(mkdir(tempname, 0700)) {
700
+        cli_dbgmsg("ScanHTML -> Can't create temporary directory %s\n", tempname);
701
+        return CL_ETMPDIR;
694 702
     }
695 703
 
696
-    if(limits && limits->maxfilesize && (statbuf.st_size > limits->maxfilesize)) {
697
-	cli_dbgmsg("HTML: Size exceeded (%d, max: %ld)\n", statbuf.st_size, limits->maxfilesize);
698
-	return CL_CLEAN;
704
+    html_normalise_fd(desc, tempname, NULL);
705
+    snprintf(fullname, 1024, "%s/comment.html", tempname);
706
+    fd = open(fullname, O_RDONLY);
707
+    if (fd >= 0) {
708
+        ret = cli_scandesc(fd, virname, scanned, root, 0);
709
+	close(fd);
699 710
     }
700 711
 
701
-    membuff = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, desc, 0);
702
-
703
-    /* TODO: do file operations if mmap fails */
704
-    if(membuff == MAP_FAILED) {
705
-	cli_dbgmsg("HTML: mmap failed\n");
706
-        return CL_EMEM;
712
+    if (ret == CL_CLEAN) {
713
+	snprintf(fullname, 1024, "%s/nocomment.html", tempname);
714
+	fd = open(fullname, O_RDONLY);
715
+	if (fd >= 0) {
716
+	    ret = cli_scandesc(fd, virname, scanned, root, 0);
717
+	    close(fd);
718
+	}
707 719
     }
708 720
 
709
-    newbuff = html_normalize(membuff, statbuf.st_size);
710
-
711
-    if(newbuff) {
712
-	newbuff2 = remove_html_comments(newbuff);
713
-	free(newbuff);
714
-	newbuff = remove_html_char_ref(newbuff2);
715
-	free(newbuff2);
716
-	/* Normalise a second time as the above can leave inconsistent white
717
-	 * space
718
-	 */
719
-	newbuff2 = html_normalize(newbuff, strlen(newbuff));
720
-	free(newbuff);
721
-	newbuff = newbuff2;
721
+    if (ret == CL_CLEAN) {
722
+	snprintf(fullname, 1024, "%s/script.html", tempname);
723
+	fd = open(fullname, O_RDONLY);
724
+	if (fd >= 0) {
725
+	    ret = cli_scandesc(fd, virname, scanned, root, 0);
726
+	    close(fd);
727
+	}
722 728
     }
723 729
 
724
-    ret = cl_scanbuff(newbuff, strlen(newbuff), virname, root);
730
+    if(!cli_leavetemps_flag)
731
+        cli_rmdirs(tempname);
725 732
 
726
-    free(newbuff);
727
-    munmap(membuff, statbuf.st_size);
733
+    free(tempname);
728 734
     return ret;
729
-#else /* FIXME */
730
-    return CL_CLEAN;
731
-#endif
732 735
 }
733 736
 
734 737
 static int  cli_scan_mydoom_log(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *arec, int *mrec)
... ...
@@ -1053,6 +1056,37 @@ static int cli_scanmschm(int desc, const char **virname, long int *scanned, cons
1053 1053
     return ret;
1054 1054
 }
1055 1055
 
1056
+static int cli_scanscrenc(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *arec, int *mrec)
1057
+{
1058
+	const char *tmpdir;
1059
+	char *tempname;
1060
+	int ret = CL_CLEAN;
1061
+
1062
+    cli_dbgmsg("in cli_scanscrenc()\n");
1063
+
1064
+    if((tmpdir = getenv("TMPDIR")) == NULL)
1065
+#ifdef P_tmpdir
1066
+        tmpdir = P_tmpdir;
1067
+#else
1068
+        tmpdir = "/tmp";
1069
+#endif
1070
+                                                                                                                               
1071
+    tempname = cli_gentemp(tmpdir);
1072
+                                                                                                                               
1073
+    if(mkdir(tempname, 0700)) {
1074
+	cli_dbgmsg("CHM: Can't create temporary directory %s\n", tempname);
1075
+	return CL_ETMPDIR;
1076
+    }
1077
+
1078
+    if (html_screnc_decode(desc, tempname))
1079
+	ret = cli_scandir(tempname, virname, scanned, root, limits, options, arec, mrec);
1080
+
1081
+    if(!cli_leavetemps_flag)
1082
+	cli_rmdirs(tempname);
1083
+
1084
+    free(tempname);
1085
+    return ret;
1086
+}
1056 1087
 static int cli_scanmail(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *arec, int *mrec)
1057 1088
 {
1058 1089
 	const char *tmpdir;
... ...
@@ -1193,6 +1227,10 @@ int cli_magic_scandesc(int desc, const char **virname, long int *scanned, const
1193 1193
 		ret = cli_scantar(desc, virname, scanned, root, limits, options, arec, mrec);
1194 1194
 	    break;
1195 1195
 
1196
+	case CL_SCRENC:
1197
+	    ret = cli_scanscrenc(desc, virname, scanned, root, limits, options, arec, mrec);
1198
+	    break;
1199
+
1196 1200
 	case CL_DATAFILE:
1197 1201
 	    /* it could be a false positive and a standard DOS .COM file */
1198 1202
 	    {