git-svn: trunk@2430
Tomasz Kojm authored on 2006/10/26 00:40:47... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Wed Oct 25 17:39:24 CEST 2006 (tk) |
|
2 |
+---------------------------------- |
|
3 |
+ * libclamav: add support for UTF16 encoded HTML files, |
|
4 |
+ requested by Christoph |
|
5 |
+ |
|
1 | 6 |
Wed Oct 25 12:40:10 CEST 2006 (acab) |
2 | 7 |
------------------------------------ |
3 | 8 |
* clamscan/clamscan.c: fix typo breaking -l (closes bug#83) |
... | ... |
@@ -32,6 +32,7 @@ |
32 | 32 |
#include "others.h" |
33 | 33 |
#include "readdb.h" |
34 | 34 |
#include "matcher-ac.h" |
35 |
+#include "str.h" |
|
35 | 36 |
|
36 | 37 |
struct cli_magic_s { |
37 | 38 |
size_t offset; |
... | ... |
@@ -205,7 +206,7 @@ static char internat[256] = { |
205 | 205 |
|
206 | 206 |
cli_file_t cli_filetype(const unsigned char *buf, size_t buflen) |
207 | 207 |
{ |
208 |
- int i, ascii = 1, len; |
|
208 |
+ int i, text = 1, len; |
|
209 | 209 |
|
210 | 210 |
|
211 | 211 |
for(i = 0; cli_magic[i].magic; i++) { |
... | ... |
@@ -220,27 +221,64 @@ cli_file_t cli_filetype(const unsigned char *buf, size_t buflen) |
220 | 220 |
buflen < 25 ? (len = buflen) : (len = 25); |
221 | 221 |
for(i = 0; i < len; i++) |
222 | 222 |
if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) { |
223 |
- ascii = 0; |
|
223 |
+ text = 0; |
|
224 | 224 |
break; |
225 | 225 |
} |
226 | 226 |
|
227 |
- return ascii ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA; |
|
227 |
+ return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA; |
|
228 | 228 |
} |
229 | 229 |
|
230 | 230 |
int is_tar(unsigned char *buf, unsigned int nbytes); |
231 | 231 |
|
232 |
-cli_file_t cli_filetype2(int desc) |
|
232 |
+cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
|
233 | 233 |
{ |
234 |
- char smallbuff[MAGIC_BUFFER_SIZE + 1]; |
|
234 |
+ char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded; |
|
235 | 235 |
unsigned char *bigbuff; |
236 |
- int bread; |
|
236 |
+ int bread, sret; |
|
237 | 237 |
cli_file_t ret = CL_TYPE_UNKNOWN_DATA; |
238 |
+ struct cli_matcher *root; |
|
239 |
+ int *partcnt; |
|
240 |
+ unsigned long int *partoff; |
|
238 | 241 |
|
239 | 242 |
|
240 | 243 |
memset(smallbuff, 0, sizeof(smallbuff)); |
241 | 244 |
if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0) |
242 | 245 |
ret = cli_filetype(smallbuff, bread); |
243 | 246 |
|
247 |
+ if(engine && ret == CL_TYPE_UNKNOWN_TEXT) { |
|
248 |
+ root = engine->root[0]; |
|
249 |
+ if(!root) |
|
250 |
+ return ret; |
|
251 |
+ |
|
252 |
+ if((partcnt = (int *) cli_calloc(root->ac_partsigs + 1, sizeof(int))) == NULL) { |
|
253 |
+ cli_warnmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(int)); |
|
254 |
+ return ret; |
|
255 |
+ } |
|
256 |
+ |
|
257 |
+ if((partoff = (unsigned long int *) cli_calloc(root->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) { |
|
258 |
+ cli_dbgmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(unsigned long int)); |
|
259 |
+ free(partcnt); |
|
260 |
+ return ret; |
|
261 |
+ } |
|
262 |
+ |
|
263 |
+ sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL); |
|
264 |
+ if(sret >= CL_TYPENO) { |
|
265 |
+ ret = sret; |
|
266 |
+ } else { |
|
267 |
+ memset(partcnt, 0, (root->ac_partsigs + 1) * sizeof(int)); |
|
268 |
+ memset(partoff, 0, (root->ac_partsigs + 1) * sizeof(unsigned long int)); |
|
269 |
+ decoded = cli_utf16toascii(smallbuff, bread); |
|
270 |
+ if(decoded) { |
|
271 |
+ sret = cli_ac_scanbuff(decoded, strlen(decoded), NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL); |
|
272 |
+ free(decoded); |
|
273 |
+ if(sret == CL_TYPE_HTML) |
|
274 |
+ ret = CL_TYPE_HTML_UTF16; |
|
275 |
+ } |
|
276 |
+ } |
|
277 |
+ free(partcnt); |
|
278 |
+ free(partoff); |
|
279 |
+ } |
|
280 |
+ |
|
244 | 281 |
if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) { |
245 | 282 |
|
246 | 283 |
if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char)))) |
... | ... |
@@ -23,7 +23,7 @@ |
23 | 23 |
|
24 | 24 |
#include <sys/types.h> |
25 | 25 |
|
26 |
-#define MAGIC_BUFFER_SIZE 50 |
|
26 |
+#define MAGIC_BUFFER_SIZE 256 |
|
27 | 27 |
#define CL_TYPENO 500 |
28 | 28 |
#define SFX_MAX_TESTS 10 |
29 | 29 |
|
... | ... |
@@ -53,6 +53,7 @@ typedef enum { |
53 | 53 |
CL_TYPE_PDF, |
54 | 54 |
CL_TYPE_UUENCODED, |
55 | 55 |
CL_TYPE_PST, /* Microsoft Outlook binary email folder (.pst file) */ |
56 |
+ CL_TYPE_HTML_UTF16, |
|
56 | 57 |
|
57 | 58 |
/* bigger numbers have higher priority (in o-t-f detection) */ |
58 | 59 |
CL_TYPE_HTML, /* on the fly */ |
... | ... |
@@ -71,7 +72,7 @@ struct cli_matched_type { |
71 | 71 |
}; |
72 | 72 |
|
73 | 73 |
cli_file_t cli_filetype(const unsigned char *buf, size_t buflen); |
74 |
-cli_file_t cli_filetype2(int desc); |
|
74 |
+cli_file_t cli_filetype2(int desc, const struct cl_engine *engine); |
|
75 | 75 |
int cli_addtypesigs(struct cl_engine *engine); |
76 | 76 |
|
77 | 77 |
#endif |
... | ... |
@@ -78,6 +78,7 @@ extern short cli_leavetemps_flag; |
78 | 78 |
#include "pst.h" |
79 | 79 |
#include "sis.h" |
80 | 80 |
#include "pdf.h" |
81 |
+#include "str.h" |
|
81 | 82 |
|
82 | 83 |
#ifdef HAVE_ZLIB_H |
83 | 84 |
#include <zlib.h> |
... | ... |
@@ -1154,6 +1155,50 @@ static int cli_scanhtml(int desc, cli_ctx *ctx) |
1154 | 1154 |
return ret; |
1155 | 1155 |
} |
1156 | 1156 |
|
1157 |
+static int cli_scanhtml_utf16(int desc, cli_ctx *ctx) |
|
1158 |
+{ |
|
1159 |
+ char *tempname, buff[512], *decoded; |
|
1160 |
+ int ret = CL_CLEAN, fd, bytes; |
|
1161 |
+ |
|
1162 |
+ |
|
1163 |
+ cli_dbgmsg("in cli_scanhtml_utf16()\n"); |
|
1164 |
+ |
|
1165 |
+ tempname = cli_gentemp(NULL); |
|
1166 |
+ if((fd = open(tempname, O_RDWR|O_CREAT|O_TRUNC|O_BINARY, S_IRWXU)) < 0) { |
|
1167 |
+ cli_errmsg("cli_scanhtml_utf16: Can't create file %s\n", tempname); |
|
1168 |
+ free(tempname); |
|
1169 |
+ return CL_EIO; |
|
1170 |
+ } |
|
1171 |
+ |
|
1172 |
+ while((bytes = read(desc, buff, sizeof(buff))) > 0) { |
|
1173 |
+ decoded = cli_utf16toascii(buff, bytes); |
|
1174 |
+ if(decoded) { |
|
1175 |
+ if(write(fd, decoded, strlen(decoded)) == -1) { |
|
1176 |
+ cli_errmsg("cli_scanhtml_utf16: Can't write to file %s\n", tempname); |
|
1177 |
+ free(decoded); |
|
1178 |
+ unlink(tempname); |
|
1179 |
+ free(tempname); |
|
1180 |
+ close(fd); |
|
1181 |
+ return CL_EIO; |
|
1182 |
+ } |
|
1183 |
+ free(decoded); |
|
1184 |
+ } |
|
1185 |
+ } |
|
1186 |
+ |
|
1187 |
+ fsync(fd); |
|
1188 |
+ lseek(fd, 0, SEEK_SET); |
|
1189 |
+ ret = cli_scanhtml(fd, ctx); |
|
1190 |
+ close(fd); |
|
1191 |
+ |
|
1192 |
+ if(!cli_leavetemps_flag) |
|
1193 |
+ unlink(tempname); |
|
1194 |
+ else |
|
1195 |
+ cli_dbgmsg("cli_scanhtml_utf16: Decoded HTML data saved in %s\n", tempname); |
|
1196 |
+ free(tempname); |
|
1197 |
+ |
|
1198 |
+ return ret; |
|
1199 |
+} |
|
1200 |
+ |
|
1157 | 1201 |
static int cli_scanole2(int desc, cli_ctx *ctx) |
1158 | 1202 |
{ |
1159 | 1203 |
char *dir; |
... | ... |
@@ -1672,7 +1717,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx) |
1672 | 1672 |
} |
1673 | 1673 |
|
1674 | 1674 |
lseek(desc, 0, SEEK_SET); |
1675 |
- type = cli_filetype2(desc); |
|
1675 |
+ type = cli_filetype2(desc, ctx->engine); |
|
1676 | 1676 |
lseek(desc, 0, SEEK_SET); |
1677 | 1677 |
|
1678 | 1678 |
if(type != CL_TYPE_DATA && ctx->engine->sdb) { |
... | ... |
@@ -1716,6 +1761,16 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx) |
1716 | 1716 |
ret = cli_scanmscab(desc, ctx); |
1717 | 1717 |
break; |
1718 | 1718 |
|
1719 |
+ case CL_TYPE_HTML: |
|
1720 |
+ if(SCAN_HTML) |
|
1721 |
+ ret = cli_scanhtml(desc, ctx); |
|
1722 |
+ break; |
|
1723 |
+ |
|
1724 |
+ case CL_TYPE_HTML_UTF16: |
|
1725 |
+ if(SCAN_HTML) |
|
1726 |
+ ret = cli_scanhtml_utf16(desc, ctx); |
|
1727 |
+ break; |
|
1728 |
+ |
|
1719 | 1729 |
case CL_TYPE_MAIL: |
1720 | 1730 |
if(SCAN_MAIL) |
1721 | 1731 |
ret = cli_scanmail(desc, ctx); |
... | ... |
@@ -173,6 +173,33 @@ char *cli_str2hex(const char *string, unsigned int len) |
173 | 173 |
return hexstr; |
174 | 174 |
} |
175 | 175 |
|
176 |
/*
 * Collapse a buffer of byte pairs into a newly allocated, NUL-terminated
 * byte string, producing one output byte per input pair.
 *
 * str    - input bytes, consumed as consecutive pairs (str[i], str[i + 1])
 * length - number of bytes available in str; a trailing odd byte is ignored
 *
 * Returns a buffer obtained from cli_calloc() that the caller must free(),
 * or NULL when length < 2 or the allocation fails.
 *
 * Each output byte is ((second byte << 4) + first byte) truncated to 8 bits,
 * so a pair whose second byte is zero yields its first byte unchanged.
 * Any '%' produced is replaced with '_'.
 *
 * NOTE: an output byte of 0 cannot be told apart from the terminator by
 * callers that measure the result with strlen().
 */
char *cli_utf16toascii(const char *str, unsigned int length)
{
    char *decoded;
    unsigned int i, j;
    unsigned char lo, hi;


    if(length < 2) {
        cli_warnmsg("cli_utf16toascii: length < 2\n");
        return NULL;
    }

    if(length % 2)
        length--;

    if(!(decoded = cli_calloc(length / 2 + 1, sizeof(char))))
        return NULL;

    for(i = 0, j = 0; i < length; i += 2, j++) {
        /* Work on unsigned bytes: left-shifting a negative plain char is
         * undefined behaviour when char is signed. */
        lo = (unsigned char) str[i];
        hi = (unsigned char) str[i + 1];
        decoded[j] = (char) (((hi << 4) + lo) & 0xff);
        if(decoded[j] == '%')
            decoded[j] = '_';
    }

    /* Terminate explicitly instead of relying on the allocator's zeroing. */
    decoded[j] = '\0';

    return decoded;
}
|
202 |
+ |
|
176 | 203 |
int cli_strbcasestr(const char *haystack, const char *needle) |
177 | 204 |
{ |
178 | 205 |
char *pt = (char *) haystack; |
... | ... |
@@ -27,6 +27,7 @@ short int *cli_hex2si(const char *hex); |
27 | 27 |
char *cli_hex2str(const char *hex); |
28 | 28 |
int cli_hex2num(const char *hex); |
29 | 29 |
char *cli_str2hex(const char *string, unsigned int len); |
30 |
+char *cli_utf16toascii(const char *str, unsigned int length); |
|
30 | 31 |
char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output); |
31 | 32 |
const char *cli_memstr(const char *haystack, int hs, const char *needle, int ns); |
32 | 33 |
char *cli_strrcpy(char *dest, const char *source); |