GitList

Browse code

add support for UTF16 encoded HTML files

git-svn: trunk@2430

Tomasz Kojm authored on 2006/10/26 00:40:47
Showing 6 changed files

clamav-devel/ChangeLog index a0233dd..5eefb92 100644
clamav-devel/libclamav/filetypes.c index 7724624..73bb8b1 100644
clamav-devel/libclamav/filetypes.h index 1aa1316..b86a158 100644
clamav-devel/libclamav/scanners.c index 4a4ec12..033c1aa 100644
clamav-devel/libclamav/str.c index da71f36..6b5ec5c 100644
clamav-devel/libclamav/str.h index be571e2..dac4e1a 100644

@@ -1,3 +1,8 @@
                     +Wed Oct 25 17:39:24 CEST 2006 (tk)
                     +----------------------------------
                     +  * libclamav: add support for UTF16 encoded HTML files,
                     +	       requested by Christoph
+                    +
                      Wed Oct 25 12:40:10 CEST 2006 (acab)
                      ------------------------------------
                        * clamscan/clamscan.c: fix typo breaking -l (closes bug#83)

clamav-devel/libclamav/filetypes.c

History View file @ bd98896

@@ -32,6 +32,7 @@
                      #include "others.h"
                      #include "readdb.h"
                      #include "matcher-ac.h"
                     +#include "str.h"
                      struct cli_magic_s {
                          size_t offset;
@@ -205,7 +206,7 @@ static char internat[256] = {
                      cli_file_t cli_filetype(const unsigned char *buf, size_t buflen)
+                     {
                     -	int i, ascii = 1, len;
                     +	int i, text = 1, len;
                          for(i = 0; cli_magic[i].magic; i++) {
@@ -220,27 +221,64 @@ cli_file_t cli_filetype(const unsigned char *buf, size_t buflen)
                          buflen < 25 ? (len = buflen) : (len = 25);
                          for(i = 0; i < len; i++)
                      	if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) {
                     -	    ascii = 0;
                     +	    text = 0;
                      	    break;
+                     	}
                     -    return ascii ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
                     +    return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
+                     }
                      int is_tar(unsigned char *buf, unsigned int nbytes);
                     -cli_file_t cli_filetype2(int desc)
                     +cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
+                     {
                     -	char smallbuff[MAGIC_BUFFER_SIZE + 1];
                     +	char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded;
                      	unsigned char *bigbuff;
                     -	int bread;
                     +	int bread, sret;
                      	cli_file_t ret = CL_TYPE_UNKNOWN_DATA;
                     +	struct cli_matcher *root;
                     +	int *partcnt;
                     +	unsigned long int *partoff;
                          memset(smallbuff, 0, sizeof(smallbuff));
                          if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0)
                      	ret = cli_filetype(smallbuff, bread);
                     +    if(engine && ret == CL_TYPE_UNKNOWN_TEXT) {
                     +	root = engine->root[0];
                     +	if(!root)
                     +	    return ret;
+                    +
                     +	if((partcnt = (int *) cli_calloc(root->ac_partsigs + 1, sizeof(int))) == NULL) {
                     +	    cli_warnmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(int));
                     +	    return ret;
                     +	}
+                    +
                     +	if((partoff = (unsigned long int *) cli_calloc(root->ac_partsigs + 1, sizeof(unsigned long int))) == NULL) {
                     +	    cli_dbgmsg("cli_filetype2(): unable to cli_calloc(%d, %d)\n", root->ac_partsigs + 1, sizeof(unsigned long int));
                     +	    free(partcnt);
                     +	    return ret;
                     +	}
+                    +
                     +	sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
                     +	if(sret >= CL_TYPENO) {
                     +	    ret = sret;
                     +	} else {
                     +	    memset(partcnt, 0, (root->ac_partsigs + 1) * sizeof(int));
                     +	    memset(partoff, 0, (root->ac_partsigs + 1) * sizeof(unsigned long int));
                     +	    decoded = cli_utf16toascii(smallbuff, bread);
                     +	    if(decoded) {
                     +		sret = cli_ac_scanbuff(decoded, strlen(decoded), NULL, engine->root[0], partcnt, 1, 0, partoff, 0, -1, NULL);
                     +		free(decoded);
                     +		if(sret == CL_TYPE_HTML)
                     +		    ret = CL_TYPE_HTML_UTF16;
                     +	    }
                     +	}
                     +	free(partcnt);
                     +	free(partoff);
                     +    }
+                    +
                          if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
                      	if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char))))

clamav-devel/libclamav/filetypes.h

History View file @ bd98896

@@ -23,7 +23,7 @@
                      #include <sys/types.h>
                     -#define MAGIC_BUFFER_SIZE 50
                     +#define MAGIC_BUFFER_SIZE 256
                      #define CL_TYPENO 500
                      #define SFX_MAX_TESTS 10
@@ -53,6 +53,7 @@ typedef enum {
                          CL_TYPE_PDF,
                          CL_TYPE_UUENCODED,
                          CL_TYPE_PST,	/* Microsoft Outlook binary email folder (.pst file) */
                     +    CL_TYPE_HTML_UTF16,
                          /* bigger numbers have higher priority (in o-t-f detection) */
                          CL_TYPE_HTML, /* on the fly */
@@ -71,7 +72,7 @@ struct cli_matched_type {
                      };
                      cli_file_t cli_filetype(const unsigned char *buf, size_t buflen);
                     -cli_file_t cli_filetype2(int desc);
                     +cli_file_t cli_filetype2(int desc, const struct cl_engine *engine);
                      int cli_addtypesigs(struct cl_engine *engine);
                      #endif

clamav-devel/libclamav/scanners.c

History View file @ bd98896

@@ -78,6 +78,7 @@ extern short cli_leavetemps_flag;
                      #include "pst.h"
                      #include "sis.h"
                      #include "pdf.h"
                     +#include "str.h"
                      #ifdef HAVE_ZLIB_H
                      #include <zlib.h>
@@ -1154,6 +1155,50 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
                          return ret;
+                     }
                     +/*
                     + * Scan a file that was detected as UTF-16 encoded HTML.
                     + *
                     + * The contents of desc are read in chunks, folded to single bytes with
                     + * cli_utf16toascii() and written to a temporary file, which is then
                     + * handed to cli_scanhtml().
                     + *
                     + * desc: descriptor to read from (read from its current position)
                     + * ctx:  scan context, passed through to cli_scanhtml()
                     + *
                     + * Returns the result of cli_scanhtml() on the decoded copy, CL_EMEM if
                     + * a temporary name or decode buffer cannot be allocated, or CL_EIO if
                     + * the temporary file cannot be created or written.
                     + *
                     + * The temporary file is removed unless cli_leavetemps_flag is set; it
                     + * is always removed when decoding fails part way through.
                     + */
                     +static int cli_scanhtml_utf16(int desc, cli_ctx *ctx)
                     +{
                     +	char *tempname, buff[512], *decoded;
                     +	int ret = CL_CLEAN, fd, bytes;
                     +	unsigned int pending = 0, avail, usable;
                     +
                     +
                     +    cli_dbgmsg("in cli_scanhtml_utf16()\n");
                     +
                     +    /* Never hand a NULL path to open() */
                     +    if(!(tempname = cli_gentemp(NULL)))
                     +	return CL_EMEM;
                     +
                     +    if((fd = open(tempname, O_RDWR|O_CREAT|O_TRUNC|O_BINARY, S_IRWXU)) < 0) {
                     +	cli_errmsg("cli_scanhtml_utf16: Can't create file %s\n", tempname);
                     +	free(tempname);
                     +	return CL_EIO;
                     +    }
                     +
                     +    /*
                     +     * A read may return an odd number of bytes. The unpaired byte is kept
                     +     * at the start of the buffer and completed by the next read, so that
                     +     * the two-byte units stay aligned across chunks.
                     +     * A read error ends the loop and the data gathered so far is scanned.
                     +     */
                     +    while((bytes = read(desc, buff + pending, sizeof(buff) - pending)) > 0) {
                     +	avail = pending + bytes;
                     +	usable = avail & ~1U;
                     +	pending = avail - usable;
                     +
                     +	if(usable) {
                     +	    if(!(decoded = cli_utf16toascii(buff, usable))) {
                     +		ret = CL_EMEM;
                     +		break;
                     +	    }
                     +
                     +	    /*
                     +	     * Write exactly usable / 2 decoded bytes. Using strlen() here
                     +	     * would stop at the first NUL and hide the rest of the chunk
                     +	     * from the scanner. A short write is treated as a failure.
                     +	     */
                     +	    if(write(fd, decoded, usable / 2) != (int) (usable / 2)) {
                     +		cli_errmsg("cli_scanhtml_utf16: Can't write to file %s\n", tempname);
                     +		free(decoded);
                     +		ret = CL_EIO;
                     +		break;
                     +	    }
                     +	    free(decoded);
                     +	}
                     +
                     +	if(pending)
                     +	    buff[0] = buff[usable];
                     +    }
                     +
                     +    if(ret != CL_CLEAN) {
                     +	close(fd);
                     +	unlink(tempname);
                     +	free(tempname);
                     +	return ret;
                     +    }
                     +
                     +    fsync(fd);
                     +    lseek(fd, 0, SEEK_SET);
                     +    ret = cli_scanhtml(fd, ctx);
                     +    close(fd);
                     +
                     +    if(!cli_leavetemps_flag)
                     +	unlink(tempname);
                     +    else
                     +	cli_dbgmsg("cli_scanhtml_utf16: Decoded HTML data saved in %s\n", tempname);
                     +    free(tempname);
                     +
                     +    return ret;
                     +}
+                    +
                      static int cli_scanole2(int desc, cli_ctx *ctx)
+                     {
                      	char *dir;
@@ -1672,7 +1717,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
+                     	}
                          lseek(desc, 0, SEEK_SET);
                     -    type = cli_filetype2(desc);
                     +    type = cli_filetype2(desc, ctx->engine);
                          lseek(desc, 0, SEEK_SET);
                          if(type != CL_TYPE_DATA && ctx->engine->sdb) {
@@ -1716,6 +1761,16 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
                      		ret = cli_scanmscab(desc, ctx);
                      	    break;
                     +	case CL_TYPE_HTML:
                     +	    if(SCAN_HTML)
                     +		ret = cli_scanhtml(desc, ctx);
                     +	    break;
+                    +
                     +	case CL_TYPE_HTML_UTF16:
                     +	    if(SCAN_HTML)
                     +		ret = cli_scanhtml_utf16(desc, ctx);
                     +	    break;
+                    +
                      	case CL_TYPE_MAIL:
                      	    if(SCAN_MAIL)
                      		ret = cli_scanmail(desc, ctx);

clamav-devel/libclamav/str.c

History View file @ bd98896

@@ -173,6 +173,33 @@ char *cli_str2hex(const char *string, unsigned int len)
                          return hexstr;
+                     }
                     +/*
                     + * Fold a buffer of two-byte units into a newly allocated byte string.
                     + *
                     + * str:    input buffer, read as pairs (str[i], str[i + 1])
                     + * length: number of bytes in str; a trailing unpaired byte is ignored
                     + *
                     + * Each output byte is str[i] plus str[i + 1] shifted left by four bits,
                     + * truncated to eight bits; a resulting '%' is replaced with '_'.
                     + * NOTE(review): the four-bit shift is kept from the original code and
                     + * looks unusual for UTF-16 - confirm the intended folding.
                     + * NOTE(review): the reason for the '%' replacement is not visible
                     + * here - confirm against the callers.
                     + *
                     + * Returns a buffer of length / 2 bytes followed by a terminating NUL
                     + * (the caller must free() it), or NULL if length < 2 or the allocation
                     + * fails. The result may contain embedded NUL bytes, so callers should
                     + * use length / 2 rather than strlen() for its size.
                     + */
                     +char *cli_utf16toascii(const char *str, unsigned int length)
                     +{
                     +	const unsigned char *src = (const unsigned char *) str;
                     +	char *decoded;
                     +	unsigned int i, j;
                     +
                     +
                     +    /* Short input is an ordinary condition, so it is only logged at debug level */
                     +    if(length < 2) {
                     +	cli_dbgmsg("cli_utf16toascii: length < 2\n");
                     +	return NULL;
                     +    }
                     +
                     +    if(length % 2)
                     +	length--;
                     +
                     +    if(!(decoded = cli_calloc(length / 2 + 1, sizeof(char))))
                     +	return NULL;
                     +
                     +    for(i = 0, j = 0; i < length; i += 2, j++) {
                     +	/*
                     +	 * Work on unsigned bytes: left-shifting a negative value is
                     +	 * undefined, and plain char may be signed.
                     +	 */
                     +	decoded[j] = (char) (((src[i + 1] << 4) + src[i]) & 0xff);
                     +	if(decoded[j] == '%')
                     +	    decoded[j] = '_';
                     +    }
                     +
                     +    return decoded;
                     +}
+                    +
                      int cli_strbcasestr(const char *haystack, const char *needle)
+                     {
                      	char *pt = (char *) haystack;

clamav-devel/libclamav/str.h

History View file @ bd98896

@@ -27,6 +27,7 @@ short int *cli_hex2si(const char *hex);
                      char *cli_hex2str(const char *hex);
                      int cli_hex2num(const char *hex);
                      char *cli_str2hex(const char *string, unsigned int len);
                     +char *cli_utf16toascii(const char *str, unsigned int length);
                      char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output);
                      const char *cli_memstr(const char *haystack, int hs, const char *needle, int ns);
                      char *cli_strrcpy(char *dest, const char *source);