GitList

Browse code

Use entconv to detect UTF-16BE and UCS-4 variants; use only cli_readline(), since we don't need exact conversion; drop unused functions, simplify encoding_norm_readline(), and rename it to encoding_normalize_toascii().

git-svn: trunk@3571

Török Edvin authored on 2008/02/02 04:38:52
Showing 6 changed files

ChangeLog index 520be06..b913e16 100644
libclamav/entconv.c index 18c94c9..0dfa202 100644
libclamav/entconv.h index 3cb7fc8..bc28837 100644
libclamav/filetypes.c index a87678b..b972a27 100644
libclamav/htmlnorm.c index 19f134c..a45b03e 100644
libclamav/htmlnorm.h index af93723..c000e08 100644

@@ -1,3 +1,12 @@
                     +Fri Feb  1 21:19:58 EET 2008 (edwin)
                     +------------------------------------
                     +  * libclamav/filetypes.c: use entconv to detect UTF-16BE, and UCS-4 variants
                     +  * libclamav/htmlnorm.c: use only cli_readline() we don't need exact
                     +  conversion
                     +  * libclamav/entconv.c:
                     +	* drop unused functions,
                     +  	* simplify encoding_norm_readline(), and rename to encoding_normalize_toascii()
+                    +
                      Fri Feb  1 00:58:05 CET 2008 (tk)
                      ---------------------------------
                        * libclamav: ndb sigs: add new target type (7) for ASCII files; handle

libclamav/entconv.c

History View file @ b3fc7f9

@@ -127,67 +127,7 @@ const char* entity_norm(struct entity_conv* conv,const unsigned char* entity)
                      	return NULL;
+                     }
                     -/* sane default, must be larger, than the longest possible return string,
                     - * which is
                     - * &#xxx;*/
                     -#define MIN_BUFFER_SIZE 32
+                    -
                     -#define LINEMODE_LIMIT 16384
+                    -
                     -int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
                     -{
                     -	if(buffer_size < MIN_BUFFER_SIZE) {
                     -		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
                     -		return CL_ENULLARG;
                     -	}
                     -	if(conv) {
                     -		conv->encoding = NULL;
                     -		conv->encoding_symbolic = E_UNKNOWN;
                     -		conv->bom_cnt = 0;
                     -		conv->buffer_size = buffer_size;
                     -		conv->priority = NOPRIO;
                     -		/* start in linemode */
                     -		conv->linemode = 1;
                     -		conv->linemode_processed = 0;
+                    -
                     -		conv->tmp_area.offset = 0;
                     -		conv->tmp_area.length = 0;
                     -		conv->tmp_area.buffer  =  cli_malloc(buffer_size);
                     -		if(!conv->tmp_area.buffer) {
                     -			return CL_EMEM;
                     -		}
+                    -
                     -		conv->out_area.offset = 0;
                     -		conv->out_area.length = buffer_size;
                     -		conv->out_area.buffer = cli_malloc(buffer_size);
                     -		if(!conv->out_area.buffer) {
                     -			free(conv->tmp_area.buffer);
                     -			return CL_EMEM;
                     -		}
+                    -
                     -		conv->buffer_size = buffer_size;
                     -		conv->norm_area.offset = 0;
                     -		conv->norm_area.length = 0;
                     -		conv->norm_area.buffer = cli_malloc(buffer_size);
                     -		if(!conv->norm_area.buffer) {
                     -			free(conv->tmp_area.buffer);
                     -			free(conv->out_area.buffer);
                     -			return CL_EMEM;
                     -		}
+                    -
                     -		conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
                     -		if(!conv->iconv_struct) {
                     -			free(conv->tmp_area.buffer);
                     -			free(conv->out_area.buffer);
                     -			free(conv->norm_area.buffer);
                     -			return CL_EMEM;
                     -		}
                     -		return 0;
                     -	}
                     -	else
                     -		return CL_ENULLARG;
                     -}
+                    -
                     +#ifndef HAVE_ICONV
                      static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
+                     {
                      	/* special case for these unusual byteorders */
@@ -217,7 +157,6 @@ static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
+                     	}
+                     }
                     -#ifndef HAVE_ICONV
                      static iconv_t iconv_open(const char *tocode, const char* fromcode)
+                     {
                      	iconv_t iconv = cli_malloc(sizeof(*iconv));
@@ -236,7 +175,6 @@ static int iconv_close(iconv_t cd)
                      	return 0;
+                     }
+                    -
                      static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      		char** outbuf, size_t *outbytesleft)
+                     {
@@ -426,14 +364,11 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      #endif
                     -/* new iconv() version */
                     -static inline void process_bom(struct entity_conv* conv)
                     +static inline const char* detect_encoding(const unsigned char* bom, uint8_t* bom_found, uint8_t* enc_width)
+                     {
                     -	const unsigned char* bom = conv->bom;
                     -	const char* encoding = NULL;
                     +	const char* encoding;
                      	int has_bom = 0;
                     -	uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
+                    -
                     +	uint8_t enc_bytes = 1; /* default is UTF8, which has a minimum of 1 bytes */
                      	/* undecided 32-bit encodings are treated as ucs4, and
                      	 * 16 bit as utf16*/
                      	switch(bom[0]) {
@@ -442,23 +377,28 @@ static inline void process_bom(struct entity_conv* conv)
                      				if(bom[2] == 0xFE && bom[3] == 0xFF) {
                      					encoding = UCS4_1234;/* UCS-4 big-endian*/
                      					has_bom = 1;
                     +					enc_bytes = 4;
+                     				}
                      				else if(bom[2] == 0xFF && bom[3] == 0xFE) {
                      					encoding = UCS4_2143;/* UCS-4 unusual order 2143 */
                      					has_bom = 1;
                     +					enc_bytes = 4;
+                     				}
                      				else if(bom[2] == 0x00 && bom[3] == 0x3C) {
                      					/* undecided, treat as ucs4 */
                      					encoding = UCS4_1234;
                     +					enc_bytes = 4;
+                     				}
                      				else if(bom[2] == 0x3C && bom[3] == 0x00) {
                      					encoding = UCS4_2143;
                     +					enc_bytes = 4;
+                     				}
                      			}/* 0x00 0x00 */
                      			else if(bom[1] == 0x3C) {
                      				if(bom[2] == 0x00) {
                      					if(bom[3] == 0x00) {
                      						encoding = UCS4_3412;
                     +						enc_bytes = 4;
+                     					}
                      					else if(bom[3] == 0x3F) {
                      						encoding = UTF16_BE;
@@ -471,6 +411,7 @@ static inline void process_bom(struct entity_conv* conv)
                      			if(bom[1] == 0xFE) {
                      				if(bom[2] == 0x00 && bom[3] == 0x00) {
                      					encoding = UCS4_4321;
                     +					enc_bytes = 4;
                      					has_bom = 1;
+                     				}
                      				else {
@@ -484,6 +425,7 @@ static inline void process_bom(struct entity_conv* conv)
                      			if(bom[1] == 0xFF) {
                      					if(bom[2] == 0x00 && bom[3] == 0x00) {
                      						encoding = UCS4_3412;
                     +						enc_bytes = 4;
                      						has_bom = 1;
+                     					}
                      					else {
@@ -504,6 +446,7 @@ static inline void process_bom(struct entity_conv* conv)
                      				if(bom[1] == 0x00) {
                      					if(bom[2] == 0x00 && bom[3] == 0x00) {
                      						encoding = UCS4_4321;
                     +						enc_bytes = 4;
+                     					}
                      					else if(bom[2] == 0x3F && bom[3] == 0x00) {
                      						encoding = UTF16_LE;
@@ -523,12 +466,19 @@ static inline void process_bom(struct entity_conv* conv)
                      				}/*4C 6F A7 94*/
                      				break;
                      	}/*switch*/
                     -	if(encoding) {
                     -		cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
                     -		process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
                     -	}
                     -	conv->enc_bytes = enc_bytes;
                     -	conv->has_bom = has_bom;
                     +	*enc_width = enc_bytes;
                     +	*bom_found = has_bom;
                     +	return encoding;
                     +}
+                    +
                     +/* detects UTF-16(LE/BE), UCS-4(all 4 variants).
                     + * UTF-8 and simple ASCII are ignored, because we can process those as text */
                     +const char* encoding_detect_bom(const unsigned char* bom)
                     +{
                     +	uint8_t has_bom;
                     +	uint8_t enc_width;
                     +	const char* encoding = detect_encoding(bom, &has_bom, &enc_width);
                     +	return enc_width > 1 ? encoding : NULL;
+                     }
                      /*()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
@@ -575,53 +525,6 @@ static char* normalize_encoding(const unsigned char* enc)
                      	return norm;
+                     }
                     -static int encoding_norm_done(struct entity_conv* conv)
                     -{
                     -	if(conv->encoding) {
                     -		free(conv->encoding);
                     -		conv->encoding = NULL;
                     -	}
                     -	conv->buffer_size = 0;
                     -	if(conv->tmp_area.buffer) {
                     -		free(conv->tmp_area.buffer);
                     -		conv->tmp_area.buffer = NULL;
                     -	}
                     -	if(conv->out_area.buffer) {
                     -		free(conv->out_area.buffer);
                     -		conv->out_area.buffer = NULL;
                     -	}
                     -	if(conv->norm_area.buffer) {
                     -		free(conv->norm_area.buffer);
                     -		conv->norm_area.buffer = NULL;
                     -	}
                     -	if(conv->iconv_struct) {
                     -		free(conv->iconv_struct);
                     -	}
                     -	return 0;
                     -}
+                    -
                     -int entity_norm_done(struct entity_conv* conv)
                     -{
                     -	return encoding_norm_done(conv);
                     -}
+                    -
                     -static unsigned short bom_length(struct entity_conv* conv)
                     -{
                     -	if(conv->has_bom) {
                     -		switch(conv->enc_bytes) {
                     -			case 1:
                     -				if(conv->encoding_symbolic == E_UTF8) {
                     -					return 3;
                     -				}
                     -				break;
                     -			case 2:
                     -				return 2;
                     -			case 4:
                     -				return 4;
                     -		}
                     -	}
                     -	return 0;
                     -}
                      /* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times,
                       * just keep on each thread its own pool of iconvs*/
@@ -774,99 +677,36 @@ static iconv_t iconv_open_cached(const char* fromcode)
                      	cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
                      	iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
                      	if(iconv_struct != (iconv_t)-1) {
                     -	idx = cache->last++;
                     -	if(idx >= cache->len) {
                     -		cache->len += 16;
                     -		cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
                     -		if(!cache->tab) {
                     -			cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
                     -			errno = ENOMEM;
                     -			return (iconv_t)-1;
                     +		idx = cache->last++;
                     +		if(idx >= cache->len) {
                     +			cache->len += 16;
                     +			cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
                     +			if(!cache->tab) {
                     +				cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
                     +				errno = ENOMEM;
                     +				return (iconv_t)-1;
                     +			}
+                     		}
                     -	}
                     -	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
                     +		hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
                      		cache->tab[idx] = iconv_struct;
                     -	cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
                     -	return cache->tab[idx];
                     -}
                     -	return (iconv_t)-1;
                     -}
+                    -
                     -void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
                     -{
                     -	char *tmp_encoding;
                     -	enum encodings tmp;
                     -	size_t new_size,old_size;
+                    -
                     -	if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
                     -		if(conv->linemode) {
                     -			cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
                     -			conv->linemode = 0;
                     -		}
                     -		return;
                     -	}
+                    -
                     -	cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio);
+                    -
                     -	if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
                     -		cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
                     -		return;
                     -		/* Content-type in header is highest priority, no overrides possible.
                     -		 * Also no overrides after an encoding has been set.*/
                     -	}
+                    -
                     -	/* validate encoding name, and normalize to uppercase */
                     -	if(!(tmp_encoding = normalize_encoding(encoding))) {
                     -		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
                     -		return;
                     -	}
+                    -
                     -	/* don't allow to change between unicode encodings that have different byte-size */
                     -	if(prio == META) {
                     -		/* need to consider minimum size of an encoding here */
                     -		old_size =  conv->enc_bytes;
                     -		new_size = encoding_bytes(tmp_encoding,&tmp);
                     -		if(old_size != new_size)  {
                     -			/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
                     -			cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
                     -			free(tmp_encoding);
                     -			return;
                     -		}
                     -	}
+                    -
                     -	conv->encoding = tmp_encoding;
                     -	cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
                     -	*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
                     -	if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
                     -		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
                     -		/* message shown only once/file */
                     -		/* what can we do? short-circuit iconv */
                     -		free(conv->encoding);
                     -		conv->encoding = NULL;
                     -		/* we will process using whatever we currently have for encoding_symbolic.
                     -		 * If encoding was already set to iconv, we shouldn't be here.*/
                     -		assert(conv->encoding_symbolic != E_ICONV);
                     -	} else {
                     -		cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
                     -		conv->encoding_symbolic = E_ICONV;
                     -		conv->priority = prio;
                     -		conv->linemode = 0;
                     +		cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
                     +		return cache->tab[idx];
+                     	}
                     +	return (iconv_t)-1;
+                     }
                     -static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
                     +static int in_iconv_u16(const m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
+                     {
                      	char   tmp4[4];
                      	size_t inleft = in_m_area->length - in_m_area->offset;
                      	size_t rc, alignfix;
                      	char*  input   = (char*)in_m_area->buffer + in_m_area->offset;
                     -	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
                     +	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;
                      	char* out      = (char*)out_m_area->buffer;
                     +	out_m_area->offset = 0;
                      	if(!inleft) {
                     -		/* EOF */
                     -		out_m_area->offset = out_m_area->length = 0;
                      		return 0;
+                     	}
                      	/* convert encoding conv->tmp_area. conv->out_area */
@@ -886,7 +726,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
                      	while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */
                      		const size_t outleft_last = outleft;
                      		assert(*iconv_struct != (iconv_t)-1);
                     -		rc = iconv(*iconv_struct, (char**) &input,  &inleft, (char**) &out, &outleft);
                     +		rc = iconv(*iconv_struct, &input,  &inleft, &out, &outleft);
                      		if(rc == (size_t)-1) {
                      			if(errno == E2BIG) {
                      				/* not enough space in output buffer */
@@ -909,9 +749,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
                      		*out++ = *input++;
                      		inleft--;
+                     	}
                     -	/* length - offset - alignfix is original value of inleft, new value is inleft,
                     -	 * difference tells how much it moved. */
                     -	in_m_area->offset = in_m_area->length - alignfix - inleft;
                     +	cli_dbgmsg("in_iconv_u16: unprocessed bytes: %lu\n", (unsigned long)inleft);
                      	if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
                      		out_m_area->length -= (off_t)outleft;
                      	} else {
@@ -922,156 +760,36 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
                      	return 0;
+                     }
+                    -
                     -#define NORMALIZE_CHAR(c, out, limit, linemode) \
                     -{\
                     -	        if (linemode && c == '\n') {\
                     -			i++;\
                     -			break;\
                     -		} else {\
                     -			unsigned char* out_new = u16_normalize(c, out, limit);\
                     -			if(out_new) {\
                     -				limit -= out_new - out;\
                     -			}\
                     -			out = out_new;\
                     -		}\
                     -}
+                    -
                     -/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
                     - * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */
                     -#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit))
                     -#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length))
+                    -
                     -/* EOF marker is m_area->length == 0 */
+                    -
                     -/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read.
                     - * When we can't read anything due to EOF ->length will be set to 0.
                     - * bounds checks offset and length*/
                     -static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
                     +int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area)
+                     {
                     -	if(!m_area) {
                     -		size_t iread;
                     +	iconv_t iconv_struct;
                     +	off_t i, j;
                     +	char *encoding;
                     -		m_area = &conv->tmp_area;
                     -		if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
                     -			return m_area;
                     -		}
                     -		/* offset out of bounds -> all the buffer was processed, fill it again */
                     -		iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
                     -		m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
                     -		m_area->offset = 0;
                     -		if(ferror(stream)) {
                     -			cli_errmsg("Error while reading HTML stream\n");
                     -		}
                     -	} else {
                     -		if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
                     -			cli_dbgmsg(MODULE_NAME "EOF reached\n");
                     -			m_area->offset = m_area->length; /* EOF marker */
                     -		}
                     +	if(!initial_encoding || !in_m_area || !out_m_area) {
                     +		return CL_ENULLARG;
+                     	}
                     -	return m_area;
                     -}
+                    -
                     -static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
                     -{
                     -	return ((uint16_t)buf[i] << 8) | buf[i+1];
                     -}
                     -unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
                     -{
                     -	unsigned char* out = conv->out_area.buffer;
                     -	if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) {
                     -		return NULL;
                     -	}
                     -	if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
                     -		/* error encountered */
                     -		return NULL;
                     +	encoding = normalize_encoding((const unsigned char*)initial_encoding);
                     +	if(!encoding) {
                     +		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
                     +		return -1;
+                     	}
                     -	else {
                     -		const off_t input_limit  = in_m_area->length;
                     -		const unsigned char* input = in_m_area->buffer;
                     -		off_t input_offset = in_m_area->offset;
                     -		off_t limit = conv->out_area.length - 1;
                     -		off_t limit_prev = limit;
                     -		off_t i = 0;
+                    -
                     -		/* read_raw() ensures this condition */
                     -		assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit));
+                    -
                     -		if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
                     -			size_t bom_len;
                     -			memcpy(conv->bom, input, 4);
                     -			process_bom(conv);
                     -			bom_len = bom_length(conv);
                     -			in_m_area->offset = input_offset = input_offset + bom_len;
                     -			conv->bom_cnt = 1;
                     -		}
+                    -
                     -		if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
                     -			cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
                     -			conv->linemode = 0;
                     -		}
+                    -
                     -		switch(conv->encoding_symbolic) {
                     -			case E_ICONV:/* only in block-mode */
                     -				/* normalize already converted characters from a previous pass
                     -				 * (output buffer was full, and we couldn't normalize more in previous pass) */
                     -				for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
                     -					const uint16_t c = get_u16(conv->norm_area.buffer, i);
                     -					NORMALIZE_CHAR(c, out, limit, 0);
                     -				}
                     -				conv->norm_area.offset = i;
                     -			        if(limit > 0) {
                     -					conv->norm_area.length = conv->buffer_size;
                     -					in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
+                    -
                     -					/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
                     -					for(i = 0;i < conv->norm_area.length && limit >  0 && out; i += 2) {
                     -						const uint16_t c = get_u16(conv->norm_area.buffer, i);
                     -						NORMALIZE_CHAR(c, out, limit, 0);
                     -					}
                     -					if(i) {
                     -						conv->norm_area.offset = i;
                     -					}
                     -				}
                     -				if(limit == limit_prev) {
                     -					/* output pointer didn't move => EOF */
                     -					return NULL;
                     -				}
                     -				break;
                     -				/* out_area must have enough space to allow all bytes in norm_area normalized,
                     -				 * if we norm with &x;, then we need 7* space. */
                     -			default:
                     -				cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
                     -				conv->encoding_symbolic = E_OTHER;
                     -			case E_UNKNOWN:
                     -			case E_OTHER:
                     -				if(!input_limit || input_offset == input_limit) {
                     -					/* nothing to do, EOF */
                     -					return NULL;
                     -				}
                     -				for(i = input_offset; i < input_limit && limit > 0; i++) {
                     -					const unsigned char c = input[i];
                     -					if(conv->linemode && c == '\n') {
                     -						i++;
                     -						break;
                     -					}
                     -					if(c) {
                     -						*out++ = c;
                     -						limit--;
                     -					}
                     -				}
                     -				in_m_area->offset = i;
                     -		}
+                    -
                     -		if(conv->linemode) {
                     -			conv->linemode_processed += i - input_offset;
                     +	cli_dbgmsg(MODULE_NAME "Encoding %s\n", encoding);
                     +	iconv_struct = iconv_open_cached( encoding );
                     +	if(iconv_struct == (iconv_t)-1) {
                     +		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open(): %s\n", encoding);
                     +		free(encoding);
                     +		return -1;
                     +	}
                     +	in_iconv_u16(in_m_area, &iconv_struct, out_m_area);
                     +	for(i = 0, j = 0; i < out_m_area->length ; i += 2) {
                     +		const unsigned char c = (out_m_area->buffer[i] << 4) + out_m_area->buffer[i+1];
                     +		if(c) {
                     +			out_m_area->buffer[j++] = c;
+                     		}
+                    -
                     -		if(limit < 0) limit = 0;
                     -		conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
                     -		return conv->out_area.buffer;
+                     	}
                     +	out_m_area->length = j;
                     +	return 0;
+                     }
+                    -

libclamav/entconv.h

History View file @ b3fc7f9

@@ -44,37 +44,20 @@
                      #define UNKNOWN "\0"
                      #define OTHER   "OTHER"
+                    +
                      enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};
                      enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
                      #define MAX_ENTITY_SIZE 22
                      struct entity_conv {
                     -	char* encoding;
                     -	enum encoding_priority priority;
                     -	enum encodings encoding_symbolic;
                     -	size_t buffer_size;
                     -	void* iconv_struct;
                      	unsigned char entity_buff[MAX_ENTITY_SIZE+2];
                     -	m_area_t tmp_area;
                     -	m_area_t out_area;
                     -	m_area_t norm_area;
                     -	int      linemode;/* TODO:set */
                     -	int      linemode_processed;
                     -	unsigned char bom[4];
                     -	uint8_t has_bom;
                     -	uint8_t enc_bytes;
                     -	uint8_t  bom_cnt;
                      };
                     -int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
                     -void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
                     -int entity_norm_done(struct entity_conv* conv);
+                    -
                      unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size);
                     -unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
                      const char* entity_norm(struct entity_conv* conv,const unsigned char* entity);
                     -int entitynorm_init(void);
                     +const char* encoding_detect_bom(const unsigned char* bom);
                     +int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area);
                      #endif

libclamav/filetypes.c

History View file @ b3fc7f9

@@ -182,37 +182,42 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
                      	    cli_ac_freedata(&mdata);
                      	    if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
                     -		    struct entity_conv conv;
                     -		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
+                    -
                     -		    /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
                     -		    if(init_entity_converter(&conv, conv_size) == 0) {
                     -			    m_area_t area;
                     -			    area.buffer = (unsigned char *) smallbuff;
                     -			    area.length = bread;
                     -			    area.offset = 0;
+                    -
                     -			    /* switch to blockmode, so that we convert all the input buffer at once,
                     -			     * rather than line-by-line */
                     -			    process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
+                    -
                     -			    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
                     -				    return ret;
+                    -
                     -			    decoded =  encoding_norm_readline(&conv, NULL, &area);
+                    -
                     -			    if(decoded) {
                     -				    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
                     -				    if(sret == CL_TYPE_HTML) {
                     -					    ret = CL_TYPE_HTML;
                     +		    const char* encoding;
+                    +
                     +		    /* check if we can autodetect this encoding.
                     +		     * If we can't, don't try to detect HTML sig, since
                     +		     * we just tried that above, and failed */
                     +		    if((encoding = encoding_detect_bom(smallbuff))) {
                     +			    unsigned char decodedbuff[sizeof(smallbuff)*2];
                     +			    m_area_t in_area, out_area;
+                    +
                     +			    in_area.buffer = (unsigned char *) smallbuff;
                     +			    in_area.length = bread;
                     +			    in_area.offset = 0;
                     +			    out_area.buffer = decodedbuff;
                     +			    out_area.length = sizeof(decodedbuff);
                     +			    out_area.offset = 0;
+                    +
                     +			    /* in htmlnorm we simply skip over \0 chars, and that allows us to parse HTML in any unicode
                     +			     * (multibyte characters will not be handled exactly, but that is not a problem).
                     +			     * However, when detecting whether a file is HTML or not, we need exact conversion
                     +			     * (just eliminating zeros and matching would introduce false positives). */
                     +			    if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
                     +				    out_area.buffer[out_area.length] = '\0';
                     +				    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
                     +					    return ret;
+                    +
                     +				    if(out_area.length > 0) {
                     +					    sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
                     +					    if(sret == CL_TYPE_HTML) {
                     +						    cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
                     +						    /* htmlnorm is able to handle any unicode now, since it skips null chars */
                     +						    ret = CL_TYPE_HTML;
                     +					    }
+                     				    }
                     -			    }
                     -			    cli_ac_freedata(&mdata);
+                    -
                     -			    entity_norm_done(&conv);
                     -		    } else {
                     -			    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
                     +				    cli_ac_freedata(&mdata);
                     +			    }
+                     		    }
+                     	    }
+                     	}

libclamav/htmlnorm.c

History View file @ b3fc7f9

@@ -542,13 +542,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
+                     		}
+                     	}
                     -	if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
                     -		if (!m_area) {
                     -			fclose(stream_in);
                     -		}
                     -		return rc;
                     -	}
+                    -
                      	tag_args.count = 0;
                      	tag_args.tag = NULL;
                      	tag_args.value = NULL;
@@ -628,10 +621,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      	binary = FALSE;
                     -	if(dconf_entconv)
                     -		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
                     -	else
                     -		ptr = line = cli_readchunk(stream_in, m_area, 8192);
                     +	ptr = line = cli_readchunk(stream_in, m_area, 8192);
                      	while (line) {
                      		if(href_contents_begin)
@@ -989,37 +979,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      						in_script = TRUE;
+                     					}
                      					html_output_tag(file_buff_script, tag, &tag_args);
                     -				} else if (dconf_entconv && strcmp(tag, "body") == 0) {
                     -					/* no more charset changes accepted after body encountered */
                     -					process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
                     -				} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
                     -					const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
                     -					const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
                     -					if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) {
                     -						size_t len = strlen((const char*)http_content);
                     -						unsigned char* http_content2 = cli_malloc( len + 1);
                     -						unsigned char* charset;
                     -						size_t i;
+                    -
                     -						if(!http_content2)
                     -							return CL_EMEM;
                     -						for(i = 0; i < len; i++)
                     -							http_content2[i] = tolower(http_content[i]);
                     -						http_content2[len] = '\0';
                     -						charset = (unsigned char*) strstr((char*)http_content2,"charset");
                     -						if(charset) {
                     -							while(*charset && *charset != '=')
                     -								charset++;
                     -							if(*charset)
                     -								charset++;/* skip = */
                     -							len = strcspn((const char*)charset," \"'");
                     -							charset[len] = '\0';
                     -							if(len) {
                     -								process_encoding_set(&conv, charset, META);
                     -							}
                     -						}
                     -						free(http_content2);
                     -					}
                      				} else if (hrefs) {
                      					if(in_ahref && !href_contents_begin)
                      						href_contents_begin=ptr;
@@ -1533,12 +1492,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      			/* end of line, append contents now, resume on next line */
                      			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
                      		ptrend = NULL;
                     -		if(dconf_entconv)
                     -			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
                     -		else {
                     -			free(line);
                     -			ptr = line = cli_readchunk(stream_in, m_area, 8192);
                     -		}
                     +		free(line);
                     +		ptr = line = cli_readchunk(stream_in, m_area, 8192);
+                     	}
                      	if(dconf_entconv) {
@@ -1566,8 +1521,6 @@ abort:
                      	if (in_ahref) /* tag not closed, force closing */
                      		html_tag_contents_done(hrefs,in_ahref);
                     -	if(dconf_entconv)
                     -		entity_norm_done(&conv);
                      	html_tag_arg_free(&tag_args);
                      	if (!m_area) {
                      		fclose(stream_in);
@@ -1593,11 +1546,11 @@ abort:
                      int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
+                     {
                      	m_area_t m_area;
+                    -
+                    +
                      	m_area.buffer = in_buff;
                      	m_area.length = in_size;
                      	m_area.offset = 0;
+                    -
+                    +
                      	return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
+                     }
@@ -1607,7 +1560,7 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const
                      	int retval=FALSE;
                      	m_area_t m_area;
                      	struct stat statbuf;
+                    -
+                    +
                      	if (fstat(fd, &statbuf) == 0) {
                      		m_area.length = statbuf.st_size;
                      		m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0);

libclamav/htmlnorm.h

History View file @ b3fc7f9

@@ -36,7 +36,7 @@ typedef struct m_area_tag {
                      } m_area_t;
                      int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
                     -int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
                     +int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
                      void html_tag_arg_free(tag_arguments_t *tags);
                      int html_screnc_decode(int fd, const char *dirname);