Browse code

add support for UTF16 encoded HTML files

git-svn: trunk@2430

Tomasz Kojm authored on 2006/10/26 00:40:47
Showing 6 changed files
... ...
@@ -1,3 +1,8 @@
1
+Wed Oct 25 17:39:24 CEST 2006 (tk)
2
+----------------------------------
3
+  * libclamav: add support for UTF16 encoded HTML files,
4
+	       requested by Christoph
5
+
1 6
 Wed Oct 25 12:40:10 CEST 2006 (acab)
2 7
 ------------------------------------
3 8
   * clamscan/clamscan.c: fix typo breaking -l (closes bug#83)
... ...
@@ -32,6 +32,7 @@
32 32
 #include "others.h"
33 33
 #include "readdb.h"
34 34
 #include "matcher-ac.h"
35
+#include "str.h"
35 36
 
36 37
 struct cli_magic_s {
37 38
     size_t offset;
... ...
@@ -205,7 +206,7 @@ static char internat[256] = {
205 205
 
206 206
 cli_file_t cli_filetype(const unsigned char *buf, size_t buflen)
207 207
 {
208
-	int i, ascii = 1, len;
208
+	int i, text = 1, len;
209 209
 
210 210
 
211 211
     for(i = 0; cli_magic[i].magic; i++) {
... ...
@@ -220,27 +221,64 @@ cli_file_t cli_filetype(const unsigned char *buf, size_t buflen)
220 220
     buflen < 25 ? (len = buflen) : (len = 25);
221 221
     for(i = 0; i < len; i++)
222 222
 	if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) {
223
-	    ascii = 0;
223
+	    text = 0;
224 224
 	    break;
225 225
 	}
226 226
 
227
-    return ascii ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
227
+    return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
228 228
 }
229 229
 
230 230
 int is_tar(unsigned char *buf, unsigned int nbytes);
231 231
 
232
-cli_file_t cli_filetype2(int desc)
232
+cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
233 233
 {
234
-	char smallbuff[MAGIC_BUFFER_SIZE + 1];
234
+	char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded;
235 235
 	unsigned char *bigbuff;
236
-	int bread;
236
+	int bread, sret;
237 237
 	cli_file_t ret = CL_TYPE_UNKNOWN_DATA;
238
+	struct cli_matcher *root;
239
+	int *partcnt;
240
+	unsigned long int *partoff;
238 241
 
239 242
 
240 243
     memset(smallbuff, 0, sizeof(smallbuff));
241 244
     if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0)
242 245
 	ret = cli_filetype(smallbuff, bread);
243 246
 
247
+    if(engine && ret == CL_TYPE_UNKNOWN_TEXT) {
248
+	root = engine->root[0];
249
+	if(!root)
250
+	    return ret;
251
+
252
+	if((partcnt = (int *) cli_calloc(root->ac_partsigs + 1, sizeof(int))) == NULL) {
253
+	    cli_warnmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(int));
254
+	    return ret;
255
+	}
256
+
257
+	if((partoff = (unsigned long int *) cli_calloc(root->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
258
+	    cli_dbgmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(unsigned long int));
259
+	    free(partcnt);
260
+	    return ret;
261
+	}
262
+
263
+	sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
264
+	if(sret >= CL_TYPENO) {
265
+	    ret = sret;
266
+	} else {
267
+	    memset(partcnt, 0, (root->ac_partsigs + 1) * sizeof(int));
268
+	    memset(partoff, 0, (root->ac_partsigs + 1) * sizeof(unsigned long int));
269
+	    decoded = cli_utf16toascii(smallbuff, bread);
270
+	    if(decoded) {
271
+		sret = cli_ac_scanbuff(decoded, strlen(decoded), NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
272
+		free(decoded);
273
+		if(sret == CL_TYPE_HTML)
274
+		    ret = CL_TYPE_HTML_UTF16;
275
+	    }
276
+	}
277
+	free(partcnt);
278
+	free(partoff);
279
+    }
280
+
244 281
     if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
245 282
 
246 283
 	if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char))))
... ...
@@ -23,7 +23,7 @@
23 23
 
24 24
 #include <sys/types.h>
25 25
 
26
-#define MAGIC_BUFFER_SIZE 50
26
+#define MAGIC_BUFFER_SIZE 256
27 27
 #define CL_TYPENO 500
28 28
 #define SFX_MAX_TESTS 10
29 29
 
... ...
@@ -53,6 +53,7 @@ typedef enum {
53 53
     CL_TYPE_PDF,
54 54
     CL_TYPE_UUENCODED,
55 55
     CL_TYPE_PST,	/* Microsoft Outlook binary email folder (.pst file) */
56
+    CL_TYPE_HTML_UTF16,
56 57
 
57 58
     /* bigger numbers have higher priority (in o-t-f detection) */
58 59
     CL_TYPE_HTML, /* on the fly */
... ...
@@ -71,7 +72,7 @@ struct cli_matched_type {
71 71
 };
72 72
 
73 73
 cli_file_t cli_filetype(const unsigned char *buf, size_t buflen);
74
-cli_file_t cli_filetype2(int desc);
74
+cli_file_t cli_filetype2(int desc, const struct cl_engine *engine);
75 75
 int cli_addtypesigs(struct cl_engine *engine);
76 76
 
77 77
 #endif
... ...
@@ -78,6 +78,7 @@ extern short cli_leavetemps_flag;
78 78
 #include "pst.h"
79 79
 #include "sis.h"
80 80
 #include "pdf.h"
81
+#include "str.h"
81 82
 
82 83
 #ifdef HAVE_ZLIB_H
83 84
 #include <zlib.h>
... ...
@@ -1154,6 +1155,50 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
1154 1154
     return ret;
1155 1155
 }
1156 1156
 
1157
+static int cli_scanhtml_utf16(int desc, cli_ctx *ctx)
1158
+{
1159
+	char *tempname, buff[512], *decoded;
1160
+	int ret = CL_CLEAN, fd, bytes;
1161
+
1162
+
1163
+    cli_dbgmsg("in cli_scanhtml_utf16()\n");
1164
+
1165
+    tempname = cli_gentemp(NULL);
1166
+    if((fd = open(tempname, O_RDWR|O_CREAT|O_TRUNC|O_BINARY, S_IRWXU)) < 0) {
1167
+	cli_errmsg("cli_scanhtml_utf16: Can't create file %s\n", tempname);
1168
+	free(tempname);
1169
+	return CL_EIO;
1170
+    }
1171
+
1172
+    while((bytes = read(desc, buff, sizeof(buff))) > 0) {
1173
+	decoded = cli_utf16toascii(buff, bytes);
1174
+	if(decoded) {
1175
+	    if(write(fd, decoded, strlen(decoded)) == -1) {
1176
+		cli_errmsg("cli_scanhtml_utf16: Can't write to file %s\n", tempname);
1177
+		free(decoded);
1178
+		unlink(tempname);
1179
+		free(tempname);
1180
+		close(fd);
1181
+		return CL_EIO;
1182
+	    }
1183
+	    free(decoded);
1184
+	}
1185
+    }
1186
+
1187
+    fsync(fd);
1188
+    lseek(fd, 0, SEEK_SET);
1189
+    ret = cli_scanhtml(fd, ctx);
1190
+    close(fd);
1191
+
1192
+    if(!cli_leavetemps_flag)
1193
+	unlink(tempname);
1194
+    else
1195
+	cli_dbgmsg("cli_scanhtml_utf16: Decoded HTML data saved in %s\n", tempname);
1196
+    free(tempname);
1197
+
1198
+    return ret;
1199
+}
1200
+
1157 1201
 static int cli_scanole2(int desc, cli_ctx *ctx)
1158 1202
 {
1159 1203
 	char *dir;
... ...
@@ -1672,7 +1717,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1672 1672
 	}
1673 1673
 
1674 1674
     lseek(desc, 0, SEEK_SET);
1675
-    type = cli_filetype2(desc);
1675
+    type = cli_filetype2(desc, ctx->engine);
1676 1676
     lseek(desc, 0, SEEK_SET);
1677 1677
 
1678 1678
     if(type != CL_TYPE_DATA && ctx->engine->sdb) {
... ...
@@ -1716,6 +1761,16 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1716 1716
 		ret = cli_scanmscab(desc, ctx);
1717 1717
 	    break;
1718 1718
 
1719
+	case CL_TYPE_HTML:
1720
+	    if(SCAN_HTML)
1721
+		ret = cli_scanhtml(desc, ctx);
1722
+	    break;
1723
+
1724
+	case CL_TYPE_HTML_UTF16:
1725
+	    if(SCAN_HTML)
1726
+		ret = cli_scanhtml_utf16(desc, ctx);
1727
+	    break;
1728
+
1719 1729
 	case CL_TYPE_MAIL:
1720 1730
 	    if(SCAN_MAIL)
1721 1731
 		ret = cli_scanmail(desc, ctx);
... ...
@@ -173,6 +173,33 @@ char *cli_str2hex(const char *string, unsigned int len)
173 173
     return hexstr;
174 174
 }
175 175
 
176
/*
 * Collapse a buffer of 2-byte units into a newly allocated, NUL-terminated
 * buffer holding one byte per unit.
 *
 * str:    input buffer; str[i] and str[i + 1] form one unit
 * length: number of bytes in str; a trailing odd byte is ignored
 *
 * Each output byte is the low 8 bits of (str[i + 1] << 4) + str[i], and a
 * resulting '%' is replaced with '_'.  The output holds length / 2 bytes
 * followed by a terminating NUL; it may also contain NUL bytes in the middle,
 * so callers that need all of it must use length / 2 rather than strlen().
 *
 * NOTE(review): the second byte is folded in with a 4-bit shift, not 8.
 * This is kept exactly as it was - confirm whether it is intentional.
 *
 * Returns NULL when str is NULL, when length < 2, or when the allocation
 * fails.  The caller owns the returned buffer and releases it with free().
 */
char *cli_utf16toascii(const char *str, unsigned int length)
{
	char *decoded;
	unsigned int i, j;
	unsigned char lo, hi;


    if(!str || length < 2) {
	cli_warnmsg("cli_utf16toascii: length < 2\n");
	return NULL;
    }

    if(length % 2)
	length--;

    if(!(decoded = cli_calloc(length / 2 + 1, sizeof(char))))
	return NULL;

    for(i = 0, j = 0; i < length; i += 2, j++) {
	/*
	 * Work on unsigned bytes: left-shifting a negative value (a plain
	 * char >= 0x80 where char is signed) is undefined behaviour.
	 */
	lo = (unsigned char) str[i];
	hi = (unsigned char) str[i + 1];
	decoded[j] = (char) (unsigned char) ((hi << 4) + lo);
	if(decoded[j] == '%')
	    decoded[j] = '_';
    }

    return decoded;
}
202
+
176 203
 int cli_strbcasestr(const char *haystack, const char *needle)
177 204
 {
178 205
 	char *pt = (char *) haystack;
... ...
@@ -27,6 +27,7 @@ short int *cli_hex2si(const char *hex);
27 27
 char *cli_hex2str(const char *hex);
28 28
 int cli_hex2num(const char *hex);
29 29
 char *cli_str2hex(const char *string, unsigned int len);
30
+char *cli_utf16toascii(const char *str, unsigned int length);
30 31
 char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output);
31 32
 const char *cli_memstr(const char *haystack, int hs, const char *needle, int ns);
32 33
 char *cli_strrcpy(char *dest, const char *source);