GitList

Browse code

AC_TRY_LINK already adds a main(), so remove the duplicate main(). entconv improvements to improve security and performance, Part I (for bb #686, #386). TODO: * optimize entity_norm * create testfiles for unicode encoding variants * create a regression test * check for memory leaks

git-svn: trunk@3511

Török Edvin authored on 2008/01/21 07:18:14
Showing 7 changed files

ChangeLog index 63df5b5..ad2670f 100644
configure index 0e026a9..2ac9911 100755
configure.in index 797c996..e59af16 100644
libclamav/entconv.c index 1956d12..ce5dc1b 100644
libclamav/entconv.h index e6596ad..e24fa40 100644
libclamav/filetypes.c index 1ae5d83..a87678b 100644
libclamav/htmlnorm.c index dd90de5..33bb44b 100644

@@ -1,3 +1,13 @@
                     +Sun Jan 20 23:49:41 EET 2008 (edwin)
                     +------------------------------------
                     +  * configure: AC_TRY_LINK already adds a main(), remove duplicate main()
                     +  * libclamav: entconv improvements to improve security and performance
                     +		Part I for  (bb #686, #386)
                     +	       TODO: * optimize entity_norm
                     +	             * create testfiles for unicode encoding variants
                     +		     * create a regression test
                     +		     * check for memory leaks
+                    +
                      Sat Jan 19 14:41:50 CET 2008 (acab)
                      -----------------------------------
                        * test: using splitted instead of byteswapped files

configure

History View file @ 4e1127c

@@ -11397,15 +11397,12 @@ int
                      main ()
+                     {
                     -int main(int argc, char** argv) {
                      	char** xin,**xout;
                      	unsigned il,ol;
                      	int rc;
                      	iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8");
                      	rc = iconv(iconv_struct,xin,&il,xout,&ol);
                      	iconv_close(iconv_struct);
                     -	return 0;
                     -}
+                       ;
                        return 0;

configure.in

History View file @ 4e1127c

@@ -230,15 +230,12 @@ if test "X$wiconv" != "Xno"; then
                      	AC_TRY_LINK([
                      		     #include <iconv.h>
                      	],[
                     -int main(int argc, char** argv) {
                      	char** xin,**xout;
                      	unsigned il,ol;
                      	int rc;
                      	iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8");
                      	rc = iconv(iconv_struct,xin,&il,xout,&ol);
                      	iconv_close(iconv_struct);
                     -	return 0;
                     -}
                      ],[
                         AC_MSG_RESULT(yes)
                         AC_DEFINE(HAVE_ICONV, 1, [iconv() available])

libclamav/entconv.c

History View file @ 4e1127c

@@ -32,6 +32,12 @@
                      #include <pthread.h>
                      #endif
                     +#ifndef CL_DEBUG
                     +#define NDEBUG
                     +#endif
+                    +
                     +#include <assert.h>
+                    +
                      #include "clamav.h"
                      #include "others.h"
                      #include "htmlnorm.h"
@@ -46,6 +52,7 @@
                      #include "encoding_aliases.h"
                     +#define MODULE_NAME "entconv: "
                      #define MAX_LINE 1024
@@ -58,11 +65,12 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
                      	struct element* e = hashtab_find(conv->ht,entity,strlen((const char*)entity));
                      	if(e && e->key) {
                      		const int val = e->data;
                     +		/* TODO: don't allocate memory here, but use a buffer in struct entity_conv */
                      		if(val == '<')/* this was an escaped <, so output it escaped*/
                      			return (unsigned char*)cli_strdup("&lt;");
                      		else if(val == '>')/* see above */
                      			return (unsigned char*)cli_strdup("&gt;");
                     -		else if(val<127) {
                     +		else if(val >= 0 && val <= 0xff) {
                      			unsigned char *e_out = cli_malloc(2);
                      			if(!e_out)
@@ -75,6 +83,7 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
                      		else if(val==160)
                      			return (unsigned char*)cli_strdup(" ");
                      		else {
                     +			/* TODO: use optimized version from u16_normalize */
                      			unsigned char *ent_out = cli_malloc(10);
                      			if(!ent_out)
@@ -88,21 +97,22 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
                      	else
                      		return NULL;
+                     }
+                    -
                      /* sane default; must be larger than the longest possible return string,
                       * which is
                       * &#xxx;*/
                      #define MIN_BUFFER_SIZE 32
                     -int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size)
                     +#define LINEMODE_LIMIT 16384
+                    +
                     +int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
+                     {
                      	if(buffer_size < MIN_BUFFER_SIZE) {
                      		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
                      		return CL_ENULLARG;
+                     	}
                      	if(conv) {
                     -		conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
                     -		conv->autodetected = OTHER;
                     +		conv->encoding = NULL;
                     +		conv->encoding_symbolic = E_UNKNOWN;
                      		conv->bom_cnt = 0;
                      		conv->buffer_cnt = 0;
                      		conv->bytes_read = 0;
@@ -110,6 +120,9 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
                      		conv->entity_buffcnt = 0;
                      		conv->buffer_size = buffer_size;
                      		conv->priority = NOPRIO;
                     +		/* start in linemode */
                     +		conv->linemode = 1;
                     +		conv->linemode_processed = 0;
                      		conv->tmp_area.offset = 0;
                      		conv->tmp_area.length = 0;
@@ -119,13 +132,14 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
+                     		}
                      		conv->out_area.offset = 0;
                     -		conv->out_area.length = 0;
                     +		conv->out_area.length = buffer_size;
                      		conv->out_area.buffer = cli_malloc(buffer_size);
                      		if(!conv->out_area.buffer) {
                      			free(conv->tmp_area.buffer);
                      			return CL_EMEM;
+                     		}
                     +		conv->buffer_size = buffer_size;
                      		conv->norm_area.offset = 0;
                      		conv->norm_area.length = 0;
                      		conv->norm_area.buffer = cli_malloc(buffer_size);
@@ -138,6 +152,13 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
                      		conv->ht = &entities_htable;
                      		conv->msg_zero_shown = 0;
                     +		conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
                     +		if(!conv->iconv_struct) {
                     +			free(conv->tmp_area.buffer);
                     +			free(conv->out_area.buffer);
                     +			free(conv->norm_area.buffer);
                     +			return CL_EMEM;
                     +		}
                      		return 0;
+                     	}
                      	else
@@ -148,23 +169,18 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco
+                     {
                      	const unsigned char* from = (const unsigned char*) fromcode;
                      	/* special case for these unusual byteorders */
                     -	*encoding=E_OTHER;
                     -	if(from == UCS4_2143)
                     -		*encoding = E_UCS4_2134;
                     -	else if (from == UCS4_3412)
                     -		*encoding = E_UCS4_3412;
                     -	else {
                     -		struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
                     -		if(e && e->key) {
                     -			*encoding = e->data;
                     -		}
                     +	struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
                     +	if(e && e->key) {
                     +		*encoding = e->data;
                     +	} else {
                     +		*encoding = E_OTHER;
+                     	}
                      	switch(*encoding) {
                      		case E_UCS4:
                      		case E_UCS4_1234:
                      		case E_UCS4_4321:
                     -		case E_UCS4_2134:
                     +		case E_UCS4_2143:
                      		case E_UCS4_3412:
                      			return 4;
                      		case E_UTF16:
@@ -177,7 +193,7 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco
                      		default:
                      			return 1;
+                     	}
                     -	}
                     +}
                      #ifndef HAVE_ICONV
                      typedef struct {
@@ -187,10 +203,10 @@ typedef struct {
                      static iconv_t iconv_open(const char *tocode, const char* fromcode)
+                     {
                     -	cli_dbgmsg("Internal iconv\n");
                      	iconv_t iconv = cli_malloc(sizeof(*iconv));
                      	if(!iconv)
                      		return NULL;
                     +	cli_dbgmsg(MODULE_NAME "Internal iconv\n");
                      	/* TODO: check that tocode is UTF16BE */
                      	iconv->size = encoding_bytes(fromcode,&iconv->encoding);
                      	return iconv;
@@ -216,7 +232,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      	/* output is always utf16be !*/
                      	switch(iconv_struct->encoding) {
                      		case E_UCS4:
                     -		case E_UCS4_1234:
                     +		case E_UCS4_1234:
+                     			{
                      				for(i=0;i < maxcopy; i += 4) {
                      					if(!input[i+2] && !input[i+3]) {
@@ -224,7 +240,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      						output[i/2+1] = input[i];
+                     					}
                      					else {
                     -						cli_dbgmsg("Warning: unicode character out of utf16 range!\n");
                     +						cli_dbgmsg(MODULE_NAME "Warning: unicode character out of utf16 range!\n");
                      						output[i/2] = 0xff;
                      						output[i/2+1] = 0xff;
+                     					}
@@ -316,7 +332,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      							output[j++] = ((input[i] & 0x1F) << 6) | (input[i+1] & 0x3F);
+                     						}
                      						else {
                     -							cli_dbgmsg("invalid UTF8 character encountered\n");
                     +							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
                      							break;
+                     						}
                      						i+=2;
@@ -328,7 +344,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      							output[j++] = (input[i+1] << 6) | (input[i+2] & 0x3F);
+                     						}
                      						else {
                     -							cli_dbgmsg("invalid UTF8 character encountered\n");
                     +							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
                      							break;
+                     						}
                      						i+=3;
@@ -336,7 +352,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      					else if( (input[i]&0xF8) == 0xF0) {
                      						if((input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80 && (input[i+3]&0xC0) == 0x80) {
                      							/* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/
                     -							cli_dbgmsg("UTF8 character out of UTF16 range encountered");
                     +							cli_dbgmsg(MODULE_NAME "UTF8 character out of UTF16 range encountered");
                      							output[j++] = 0xff;
                      							output[j++] = 0xff;
@@ -345,13 +361,13 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      							out[j++] = (input[i+2] << 6) | (input[i+2] & 0x3F);*/
+                     						}
                      						else {
                     -							cli_dbgmsg("invalid UTF8 character encountered\n");
                     +							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
                      							break;
+                     						}
                      						i+=4;
+                     					}
                      					else {
                     -						cli_dbgmsg("invalid UTF8 character encountered\n");
                     +						cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
                      						break;
+                     					}
+                     				}
@@ -392,10 +408,12 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
                      static inline void process_bom(struct entity_conv* conv)
+                     {
                      	const unsigned char* bom = conv->bom;
                     -	const unsigned char* encoding = OTHER;
                     +	const char* encoding = NULL;
                      	int has_bom = 0;
                     -	uint8_t enc_bytes = 4;/* default is UTF8, which has a maximum of 4 bytes*/
                     +	uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
                     +	/* undecided 32-bit encodings are treated as ucs4, and
                     +	 * 16 bit as utf16*/
                      	switch(bom[0]) {
                      		case 0x00:
                      			if(bom[1] == 0x00) {
@@ -408,19 +426,20 @@ static inline void process_bom(struct entity_conv* conv)
                      					has_bom = 1;
+                     				}
                      				else if(bom[2] == 0x00 && bom[3] == 0x3C) {
                     -					encoding = UNDECIDED_32_1234;
                     -				}
                     +					/* undecided, treat as ucs4 */
                     +					encoding = UCS4_1234;
                     +				}
                      				else if(bom[2] == 0x3C && bom[3] == 0x00) {
                     -					encoding = UNDECIDED_32_2143;
                     +					encoding = UCS4_2143;
+                     				}
                      			}/* 0x00 0x00 */
                      			else if(bom[1] == 0x3C) {
                      				if(bom[2] == 0x00) {
                      					if(bom[3] == 0x00) {
                     -						encoding = UNDECIDED_32_3412;
                     +						encoding = UCS4_3412;
+                     					}
                      					else if(bom[3] == 0x3F) {
                     -						encoding = UNDECIDED_16_BE;
                     +						encoding = UTF16_BE;
                      						enc_bytes = 2;
+                     					}
                      				}/*0x00 0x3C 0x00*/
@@ -439,7 +458,7 @@ static inline void process_bom(struct entity_conv* conv)
+                     				}
                      			}/*0xFF 0xFE*/
                      			break;
                     -		case 0xFE:
                     +		case 0xFE:
                      			if(bom[1] == 0xFF) {
                      					if(bom[2] == 0x00 && bom[3] == 0x00) {
                      						encoding = UCS4_3412;
@@ -449,98 +468,91 @@ static inline void process_bom(struct entity_conv* conv)
                      						encoding = UTF16_BE;
                      						has_bom = 1;
                      						enc_bytes = 2;
                     -					}
                     +					}
                      			}/*0xFE 0xFF*/
                      			break;
                     -		case 0xEF:
                     +		case 0xEF:
                      			if(bom[1] == 0xBB && bom[2] == 0xBF)  {
                      					encoding = UTF8;
                      					has_bom = 1;
                      					/*enc_bytes = 4;- default, maximum 4 bytes*/
                     -			}/*0xEF 0xBB 0xBF*/
                     +			}/*0xEF 0xBB 0xBF*/
                      			break;
                     -		case 0x3C:
                     +		case 0x3C:
                      				if(bom[1] == 0x00) {
                      					if(bom[2] == 0x00 && bom[3] == 0x00) {
                     -						encoding = UNDECIDED_32_4321;
                     +						encoding = UCS4_4321;
+                     					}
                      					else if(bom[2] == 0x3F && bom[3] == 0x00) {
                     -						encoding = UNDECIDED_16_LE;
                     +						encoding = UTF16_LE;
                      						enc_bytes = 2;
+                     					}
                      				}/*0x3C 0x00*/
                      				else if(bom[1] == 0x3F && bom[2] == 0x78 && bom[3]==0x6D) {
                     -					encoding = UNDECIDED_8;
                     +					encoding = NULL;
                      					enc_bytes = 1;
                      				}/*0x3C 3F 78 6D*/
                      				break;
                     -		case 0x4C:
                     +		case 0x4C:
                      				if(bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) {
                     -					encoding = EBCDIC;
                     +					cli_dbgmsg(MODULE_NAME "EBCDIC encoding is not supported in line mode\n");
                     +					encoding = NULL;
                      					enc_bytes = 1;
                      				}/*4C 6F A7 94*/
                      				break;
                      	}/*switch*/
                     -	conv->autodetected = encoding;
                     +	if(encoding) {
                     +		cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
                     +		process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
                     +	}
                      	conv->enc_bytes = enc_bytes;
                      	conv->has_bom = has_bom;
+                     }
                     +/*()-./012345678:ABCDEFGHIJKLMNOPQRSTUVWXY_abcdefghijklmnopqrstuvwxy*/
                     +static const uint8_t encname_chars[256] = {
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
                     +        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
                     +        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
                     +        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                     +};
+                    +
                     +/* checks that encoding is sane, and normalizes to uppercase */
                      static unsigned char* normalize_encoding(const unsigned char* enc)
+                     {
                     -	unsigned char* norm;
                     -	size_t i;
                     -	const size_t len = strlen((const char*)enc);
                     -	norm = cli_malloc( len+1);
                     +	unsigned char* norm;
                     +	size_t i, len;
+                    +
                     +	if(!enc)
                     +		return NULL;
                     +	len = strlen((const char*)enc);
                     +	if(len > 32)
                     +		return NULL;
                     +	for(i=0;i<len;i++) {
                     +		if(!encname_chars[enc[i]])
                     +			return NULL;
                     +	}
                     +	norm = cli_malloc( len+1 );
                      	if(!norm)
                      		return NULL;
                     -	if(enc == OTHER)
                     -		enc = (const unsigned char*)"ISO-8859-1";
                      	for(i=0;i < strlen((const char*)enc); i++)
                      		norm[i] = toupper(enc[i]);
                      	norm[len]='\0';
                      	return norm;
+                     }
                     -static const unsigned char* encoding_name(unsigned char* encoding)
                     -{
                     -	if(!encoding)
                     -		return (const unsigned char*)"ISO-8859-1";
                     -	else
                     -		return encoding;
                     -}
+                    -
                     -void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
                     -{
                     -	unsigned char *tmp_encoding;
                     -	enum encodings tmp;
                     -	size_t new_size,old_size;
+                    -
                     -	cli_dbgmsg("Setting encoding for %p  to %s, priority: %d\n",(void*)conv, encoding, prio);
                     -	if(encoding == OTHER)
                     -		return;
                     -	if(conv->priority == CONTENT_TYPE)
                     -		return;/* Content-type in header is highest priority, no overrides possible*/
                     -	if(conv->priority ==  BOM && prio == NOBOM_AUTODETECT)
                     -		return;
+                    -
                     -	tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
                     -	if(prio == META) {
                     -	old_size = encoding_bytes(conv->encoding,&tmp);
                     -	new_size = encoding_bytes(tmp_encoding,&tmp);
                     -	if(old_size != new_size)  {
                     -		/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
                     -		cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n",conv->encoding,(unsigned long)old_size,tmp_encoding,(unsigned long)new_size);
                     -		free(tmp_encoding);
                     -		return;
                     -	}
                     -	}
                     -	free(conv->encoding);
                     -	conv->encoding = tmp_encoding;
                     -	cli_dbgmsg("New encoding for %p:%s\n",(void*)conv,conv->encoding);
                     -	/* reset stream */
                     -}
+                    -
                      static int encoding_norm_done(struct entity_conv* conv)
+                     {
                      	if(conv->encoding) {
@@ -567,7 +579,7 @@ int entity_norm_done(struct entity_conv* conv)
+                     {
                      	return encoding_norm_done(conv);
+                     }
+                    -
                     +#if 0
                      static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned char* outbuff)
+                     {
@@ -615,29 +627,25 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned cha
+                     		}
+                     	}
+                     }
                     +#endif
                     -static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft)
                     +static unsigned short bom_length(struct entity_conv* conv)
+                     {
                      	if(conv->has_bom) {
                      		switch(conv->enc_bytes) {
                      			case 1:
                     -				if(conv->autodetected == UTF8) {
                     -					*in += 3;
                     -					*inleft -= 3;
                     +				if(conv->encoding_symbolic == E_UTF8) {
                     +					return 3;
+                     				}
                      				break;
                      			case 2:
                     -				*in += 2;
                     -				*inleft -= 2;
                     -				break;
                     +				return 2;
                      			case 4:
                     -				*in += 4;
                     -				*inleft -= 4;
                     -				break;
                     +				return 4;
+                     		}
+                     	}
                     +	return 0;
+                     }
+                    -
                      /* sarge leaks on iconv_open/iconv_close, so let's not open/close so many times;
                       * just keep a separate pool of iconvs for each thread */
@@ -653,16 +661,16 @@ static void iconv_cache_init(struct iconv_cache* cache)
                      /*	cache->tab = NULL;
                      	cache->len = 0;
                      	cache->used = 0; - already done by memset*/
                     -	cli_dbgmsg("Initializing iconv pool:%p\n",(void*)cache);
                     +	cli_dbgmsg(MODULE_NAME "Initializing iconv pool:%p\n",(void*)cache);
                      	hashtab_init(&cache->hashtab, 32);
+                     }
                      static void iconv_cache_destroy(struct iconv_cache* cache)
+                     {
                      	size_t i;
                     -	cli_dbgmsg("Destroying iconv pool:%p\n",(void*)cache);
                     +	cli_dbgmsg(MODULE_NAME "Destroying iconv pool:%p\n",(void*)cache);
                      	for(i=0;i < cache->last;i++) {
                     -		cli_dbgmsg("closing iconv:%p\n",cache->tab[i]);
                     +		cli_dbgmsg(MODULE_NAME "closing iconv:%p\n",cache->tab[i]);
                      		iconv_close(cache->tab[i]);
+                     	}
                      	hashtab_clear(&cache->hashtab);
@@ -702,9 +710,9 @@ static void iconv_pool_tls_key_alloc(void)
+                     {
                      	pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy);
                      	if(!cache_atexit_registered) {
                     -		cli_dbgmsg("iconv:registering atexit\n");
                     +		cli_dbgmsg(MODULE_NAME "iconv:registering atexit\n");
                      		if(atexit(iconv_cache_cleanup_main)) {
                     -			cli_dbgmsg("failed to register atexit\n");
                     +			cli_dbgmsg(MODULE_NAME "failed to register atexit\n");
+                     		}
                      		cache_atexit_registered = 1;
+                     	}
@@ -721,7 +729,7 @@ static inline struct iconv_cache* cache_get_tls_instance(void)
                      	if(!cache) {
                      		cache = cli_calloc(1,sizeof(*cache));
                      		if(!cache) {
                     -			cli_dbgmsg("!Out of memory allocating TLS iconv instance\n");
                     +			cli_dbgmsg(MODULE_NAME "!Out of memory allocating TLS iconv instance\n");
                      			return NULL;
+                     		}
                      		iconv_cache_init(cache);
@@ -772,7 +780,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
                      	init_iconv_pool_ifneeded();
                      	cache = cache_get_tls_instance();/* gets TLS iconv pool */
                      	if(!cache) {
                     -		cli_dbgmsg("!Unable to get TLS iconv cache!\n");
                     +		cli_dbgmsg(MODULE_NAME "!Unable to get TLS iconv cache!\n");
                      		errno = EINVAL;
                      		return (iconv_t)-1;
+                     	}
@@ -784,7 +792,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
                      	if(e) {
                      		return cache->tab[e->data];
+                     	}
                     -	cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode);
                     +	cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
                      	iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
                      	if(iconv_struct != (iconv_t)-1) {
                      	idx = cache->last++;
@@ -792,7 +800,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
                      		cache->len += 16;
                      		cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
                      		if(!cache->tab) {
                     -			cli_dbgmsg("!Out of mem in iconv-pool\n");
                     +			cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
                      			errno = ENOMEM;
                      			return (iconv_t)-1;
+                     		}
@@ -800,12 +808,200 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
                      	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
                      		cache->tab[idx] = iconv_struct;
                     -	cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
                     +	cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
                      	return cache->tab[idx];
+                     }
                      	return (iconv_t)-1;
+                     }
                     +#if 0 /* everything up to the matching #endif is compiled out */
                     +struct buffer {
                     +	unsigned char *buffer;
                     +	size_t length;
                     +	size_t offset;
                     +	size_t filled;
                     +};
+                    +
                     +#define BUFFER_FILL(b, fill_func) /* evaluates fill_func once (b)->offset has reached (b)->filled */ \
                     +	if((b)->offset >= (b)->filled) {\
                     +		/* buffer empty, attempt to fill it*/\
                     +		if((fill_func) == -1) return -1;/* error encountered: returns -1 from the function using the macro */\
                     +		if((b)->filled == 0) return 0;/* EOF: returns 0 from the function using the macro */\
                     +		(b)->offset = 0;\
                     +	}
                     +#endif
+                    +
                     +void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
                     +{	/* Asks conv to decode its input as `encoding`, subject to the priority rules below.
                     +	 *
                     +	 * conv:     converter state; on success its encoding, encoding_symbolic, priority and
                     +	 *           linemode fields and the descriptor behind iconv_struct are updated.
                     +	 * encoding: requested encoding name. A NULL name together with SWITCH_TO_BLOCKMODE
                     +	 *           only turns line-mode off.
                     +	 * prio:     where the request comes from.
                     +	 *
                     +	 * The request is dropped with a debug message when conv->priority is CONTENT_TYPE,
                     +	 * when an encoding was already set, when normalize_encoding() rejects the name, when a
                     +	 * META request would change the byte size of the encoding, or when
                     +	 * iconv_open_cached() fails. */
                     +	unsigned char *tmp_encoding;
                     +	enum encodings tmp;
                     +	size_t new_size,old_size;
+                    +
                     +	if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
                     +		if(conv->linemode) {
                     +			cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
                     +			conv->linemode = 0;
                     +		}
                     +		return;
                     +	}
+                    +
                     +	/* encoding may still be NULL here (NULL name with another priority); never hand NULL to %s */
                     +	cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding ? (const char*)encoding : "(null)", prio);
+                    +
                     +	if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
                     +		cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
                     +		return;
                     +		/* Content-type in header is highest priority, no overrides possible.
                     +		 * Also no overrides after an encoding has been set.*/
                     +	}
+                    +
                     +	/* validate encoding name, and normalize to uppercase */
                     +	if(!(tmp_encoding = normalize_encoding(encoding))) {
                     +		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
                     +		return;
                     +	}
+                    +
                     +	/* don't allow to change between unicode encodings that have different byte-size */
                     +	if(prio == META) {
                     +		/* need to consider minimum size of an encoding here */
                     +		old_size =  conv->enc_bytes;
                     +		new_size = encoding_bytes(tmp_encoding,&tmp);
                     +		if(old_size != new_size)  {
                     +			/* conv->encoding is NULL on this path (a set encoding returned above),
                     +			 * so print a placeholder instead of passing NULL to %s */
                     +			const char* old_name = conv->encoding ? (const char*)conv->encoding : "(none)";
                     +			/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
                     +			cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", old_name, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
                     +			free(tmp_encoding);
                     +			return;
                     +		}
                     +	}
+                    +
                     +	/* conv takes ownership of the normalized name from here on */
                     +	conv->encoding = tmp_encoding;
                     +	cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
                     +	*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
                     +	if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
                     +		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
                     +		/* message shown only once/file */
                     +		/* what can we do? short-circuit iconv */
                     +		free(conv->encoding);
                     +		conv->encoding = NULL;
                     +		/* we will process using whatever we currently have for encoding_symbolic.
                     +		 * If encoding was already set to iconv, we shouldn't be here.*/
                     +		assert(conv->encoding_symbolic != E_ICONV);
                     +	} else {
                     +		cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
                     +		conv->encoding_symbolic = E_ICONV;
                     +		conv->priority = prio;
                     +		conv->linemode = 0;
                     +	}
                     +}
+                    +
                     +static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
                     +{	/* Runs iconv() over the unread part of in_m_area and stores the result at the
                     +	 * start of out_m_area->buffer.
                     +	 *
                     +	 * in_m_area:    input; the bytes from ->offset up to ->length are offered to iconv()
                     +	 *               and ->offset is advanced on return.
                     +	 * iconv_struct: points to an open conversion descriptor (asserted != (iconv_t)-1).
                     +	 * out_m_area:   on entry ->length is the space available in ->buffer; on return
                     +	 *               ->length is the number of bytes written and ->offset is 0.
                     +	 * Returns 0 in every case; conversion errors are only logged. */
                     +	char   tmp4[4];
                     +	size_t inleft = in_m_area->length - in_m_area->offset;
                     +	size_t rc, alignfix;
                     +	char*  input   = (char*)in_m_area->buffer + in_m_area->offset;
                     +	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
                     +	char* out      = (char*)out_m_area->buffer;
+                    +
+                    +
                     +	alignfix = inleft%4;/* iconv gives an error if we give it 3 bytes to convert,
                     +			       and we are using ucs4, ditto for utf16, and 1 byte*/
                     +	inleft -= alignfix;
+                    +
                     +	if(!inleft && alignfix) {
                     +		/* EOF, and we have less than 4 bytes to convert: zero-pad them to 4 bytes */
                     +		memset(tmp4, 0, 4);
                     +		memcpy(tmp4, input, alignfix);
                     +		input = tmp4;
                     +		inleft = 4;
                     +	}
+                    +
                     +	while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */
                     +		assert(*iconv_struct != (iconv_t)-1);
                     +		rc = iconv(*iconv_struct, (char**) &input,  &inleft, (char**) &out, &outleft);
                     +		if(rc != (size_t)-1) {
                     +			/* the whole input was converted */
                     +			break;
                     +		}
                     +		if(errno == E2BIG) {
                     +			/* the next character does not fit in the remaining output. Calling iconv()
                     +			 * again cannot make progress, so stop here; retrying used to spin forever
                     +			 * when a character needed more than the outleft bytes still available. */
                     +			break;
                     +		}
                     +		/* size_t/ptrdiff_t are cast to match %lu/%ld on every platform */
                     +		cli_dbgmsg("iconv error:%s, silently resuming (%lu, %lu, %ld, %ld)\n",
                     +				strerror(errno), (unsigned long)inleft, (unsigned long)outleft,
                     +				(long)(input - (char*)in_m_area->buffer),
                     +				(long)(out - (char*)out_m_area->buffer));
                     +		/* output raw byte, and resume at next byte */
                     +		if(outleft < 2) break;
                     +		outleft -= 2;
                     +		*out++ = 0;
                     +		*out++ = *input++;
                     +		inleft--;
                     +	}
                     +	/* NOTE(review): the 1-3 bytes trimmed into alignfix are not added back to inleft,
                     +	 * so offset also moves past them - confirm they are meant to be discarded */
                     +	in_m_area->offset = in_m_area->length - inleft;
                     +	if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
                     +		/* space offered minus space left = bytes produced */
                     +		out_m_area->length -= (off_t)outleft;
                     +	} else {
                     +		cli_dbgmsg(MODULE_NAME "outleft overflown, ignoring\n");
                     +		out_m_area->length = 0;
                     +	}
                     +	out_m_area->offset  = 0;
                     +	return 0;
                     +}
                     +#if 0
                     +/* processes @in buffer, and fills @out. Modifies offset of @in on exit. */
                     +static int u16_normalize (struct entity_conv* conv, struct buffer* in_buff, struct buffer* out_buff)
                     +{
                     +	const unsigned char* in = in_buff->buffer;
                     +	unsigned char* out      = out_buff->buffer;
                     +	const unsigned char* out_end  = out + out_buff->length;
+                    +
                     +	do {
                     +		size_t i;
                     +		BUFFER_FILL(in_buff, in_iconv_u16(conv) );
+                    +
                     +		for(i = in_buff->offset; (i < in_buff->filled) && (out < out_end); i += 2) {
                     +			const uint16_t u16 = ( ((const uint16_t)in[i]) << 8 ) | in[i+1];
                     +			if(u16 >  0 && u16 < 0x80) {
                     +				assert((unsigned char)u16 != 0);
                     +				assert(out < out_end);
                     +				*out++ = (unsigned char)u16;
                     +			}
                     +			else if (u16 == 160)  {/*nbsp*/
                     +				assert(out < out_end);
                     +				*out++ = 0x20;
                     +			}
                     +			else {
                     +				const ssize_t max_num_length = 9;
                     +				ssize_t printed;
                     +				if((out_end - out) <=  max_num_length) {
                     +					/* prevent buffer overflow */
                     +					/* force exit out of while loop */
                     +					out_end = NULL;
                     +					break;
                     +				}
                     +				assert(out + max_num_length < out_end);
                     +				printed = snprintf((char*)out, max_num_length, "&#%d;", u16);
                     +				if(printed > 0) {
                     +					out += printed;
                     +				}
                     +			}
                     +		}
                     +		in_buff->offset = i;
                     +		out_buff->filled = out - out_buff->buffer;
                     +		out_buff->offset = 0;
                     +	} while (out < out_end);/* if out not full, try to fill it */
                     +}
                     +/*
                     + * We need a line-mode, which allows us to change the encoding, and
                     + * a block mode, that doesn't care about lines
                     + *
                     + *
                     + */
                     +/*
                     + * ASCII -> ascii_normalize
                     + * ANY -> iconv -> u16_normalize
                     + * UTF16 -> u16_normalize
                     + */
+                    +
                     +unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
                     +{
                     +	u16_normalize(conv,
                     +}
                      /* tmp_m_area and conv->out_area are of size maxlen */
                      unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
@@ -818,7 +1014,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
                      		const size_t tmp_available = conv->buffer_size - tmp_move;
                      		const size_t max_read = maxlen < tmp_available ? maxlen : tmp_available;
                      		unsigned char* tmpbuff = &conv->tmp_area.buffer[tmp_move];
+                    -
+                    +
                      		const size_t out_move = conv->out_area.length < conv->out_area.offset ? 0 : conv->out_area.length - conv->out_area.offset;
                      		size_t outleft = conv->buffer_size - out_move;
                      		unsigned char* out = &conv->out_area.buffer[out_move];
@@ -849,60 +1045,6 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
                      		tmpbuff = conv->tmp_area.buffer;
                      		inleft = conv->tmp_area.length;
                     -		if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */
                     -			memcpy( conv->bom, tmpbuff, 4);
                     -			process_bom(conv);
                     -			process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT);
                     -			output_first(conv,&out,&tmpbuff,&inleft);
                     -			conv->bom_cnt++;
                     -		}
+                    -
                     -		/* convert encoding conv->tmp_area. conv->out_area */
                     -		alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert,
                     -				       and we are using ucs4, ditto for utf16, and 1 byte*/
                     -		inleft -= alignfix;
+                    -
                     -		if(!inleft && alignfix) {
                     -			size_t k;
                     -			for(k=0;k+alignfix < 4;k++)
                     -				tmpbuff[alignfix+k] = '\0';
                     -			inleft = 4;
                     -			alignfix = -inleft;
                     -		}
+                    -
                     -		iconv_struct = iconv_open_cached(encoding_name(conv->encoding));
+                    -
                     -		if(iconv_struct == (iconv_t)-1) {
                     -			cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding));
                     -			/* message shown only once/file */
                     -			/* what can we do? just fall back for it being an ISO-8859-1 */
                     -		        free(conv->encoding);
                     -			conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
                     -			iconv_struct = iconv_open_cached(conv->encoding);
                     -			if(iconv_struct == (iconv_t)-1) {
                     -				cli_dbgmsg("fallback failed... bail out\n");
                     -				return cli_readline(NULL,&conv->tmp_area,maxlen);
                     -			}
                     -		}
+                    -
                     -		if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ {
                     -			rc = iconv(iconv_struct, (char**) &tmpbuff,  &inleft, (char**) &out, &outleft);
                     -		}
                     -		else
                     -			rc = 0;
+                    -
                     -#if 0
                     -		 iconv_close(iconv_struct);/* - don't close, we are using a cached instance */
                     -#endif
+                    -
                     -		if(rc==(size_t)-1 && errno != E2BIG) {
                     -				cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%lu,%lu)\n",strerror(errno),(long)(out-conv->out_area.buffer),(long)(tmpbuff-conv->tmp_area.buffer),(unsigned long)inleft,(unsigned long)outleft);
                     -				/* output raw byte, and resume at next byte */
                     -				*out++ = 0;
                     -				*out++ = *tmpbuff++;
                     -				inleft--;
                     -/*				return cli_readline(NULL, &conv->norm_area, maxlen);*/
                     -		}
                      		conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0);
                      		conv->out_area.length = out - conv->out_area.buffer - out_move;
@@ -980,4 +1122,186 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
                      		return cli_readline(NULL, &conv->norm_area, maxlen);
+                     	}
+                     }
                     +#endif
+                    +
                     +static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, const ssize_t limit)
                     +{	/* Writes the normalized form of one 16-bit character at out.
                     +	 * u16:   character value to write.
                     +	 * out:   write position in the output buffer.
                     +	 * limit: bytes available at out; must be positive (asserted).
                     +	 * Returns the position just past the bytes written, or NULL when a character
                     +	 * reference is needed but limit is 9 or less. */
                     +	assert(limit > 0 && "u16_normalize must be called with positive limit");
                     +	/* values 1..0xfe are copied through as a single byte.
                     +	 * NOTE(review): 0 fails this test and is written below as "&#0;", it is not dropped - confirm intent */
                     +	if(u16 > 0 && u16 < 0xff) {
                     +		assert((uint8_t)u16 != 0);
                     +		*out++ = (uint8_t)u16;
                     +	}
                     +	else {
                     +		/* 0 and values >= 0xff are written as a decimal character reference "&#N;" */
                     +		char buf[10];
                     +		const ssize_t max_num_length = sizeof(buf)-1;
                     +		int i = sizeof(buf)-1;
+                    +
                     +		if(limit <=  max_num_length) {
                     +			/* not enough space available */
                     +			return NULL;
                     +		}
                     +		/* inline version of
                     +		 * out += snprintf(out, max_num_length, "&#%d;", u16) */
                     +		buf[i] = '\0';
                     +		do { /* fill buf from the right with the decimal digits, least significant first */
                     +			buf[--i] = '0' + (u16 % 10);
                     +			u16 /= 10;
                     +		} while (u16 && i > 0);
                     +		*out++ = '&';
                     +		*out++ = '#';
                     +		while(buf[i]) *out++ = buf[i++]; /* copy the digits up to the terminator */
                     +		*out++ = ';';
                     +	}
                     +	assert(out);
                     +	return out;
                     +}
+                    +
                     +#define NORMALIZE_CHAR(c, out, limit, linemode) /* must expand inside a loop whose index is named i */ \
                     +{\
                     +	        if (linemode && c == '\n') {\
                     +			i++; /* step past the newline, which is not written to out */\
                     +			break; /* leaves the enclosing loop */\
                     +		} else {\
                     +			unsigned char* out_new = u16_normalize(c, out, limit);\
                     +			if(out_new) {\
                     +				limit -= out_new - out; /* charge the bytes just written against limit */\
                     +			}\
                     +			out = out_new; /* becomes NULL when u16_normalize() had no room */\
                     +		}\
                     +}
+                    +
                     +/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
                     + * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */
                     +#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit)) /* the smaller of the two values */
                     +#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length)) /* true when 0 <= offset < length */
+                    +
                     +/* EOF marker is m_area->length == 0 */
+                    +
                     +/* Returns the m_area_t holding the next input to process.
                     + *
                     + * @m_area non-NULL: the caller's area is used as is; if its offset is not inside
                     + *   [0, length) both offset and length are reset to 0, which is the EOF marker.
                     + * @m_area NULL: conv->tmp_area is used. If it still has unread bytes it is returned
                     + *   unchanged, otherwise it is refilled with up to conv->buffer_size bytes from
                     + *   @stream and its offset is reset to 0; a read of 0 bytes leaves length == 0.
                     + * A stream error is reported with cli_errmsg() but the area is still returned.
                     + * No path visible here returns NULL. */
                     +static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
                     +{
                     +	if(!m_area) {
                     +		size_t iread;
+                    +
                     +		m_area = &conv->tmp_area;
                     +		if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
                     +			return m_area;
                     +		}
                     +		/* offset out of bounds -> all the buffer was processed, fill it again */
                     +		iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
                     +		m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
                     +		m_area->offset = 0;
                     +		if(ferror(stream)) {
                     +			cli_errmsg("Error while reading HTML stream\n");
                     +		}
                     +	} else {
                     +		if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
                     +			cli_dbgmsg(MODULE_NAME "EOF reached\n");
                     +			m_area->length = m_area->offset = 0; /* EOF marker */
                     +		}
                     +	}
                     +	return m_area;
                     +}
+                    +
                     +static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
                     +{	/* Combines buf[i] (high byte) and buf[i+1] (low byte) into one 16-bit value.
                     +	 * Both bytes are read, so i+1 must be a valid index into buf. */
                     +	return ((uint16_t)buf[i] << 8) | buf[i+1];
                     +}
+                    +
                     +unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
                     +{	/* Produces the next chunk of normalized text in conv->out_area.buffer.
                     +	 *
                     +	 * conv:      converter state; out_area.buffer and tmp_area.buffer must be set.
                     +	 * stream_in: stream used by read_raw() when in_m_area is NULL.
                     +	 * in_m_area: caller-supplied input area, or NULL to read from stream_in.
                     +	 *
                     +	 * Returns conv->out_area.buffer, NUL-terminated, or NULL when conv or its buffers
                     +	 * are missing, when there is no more input, or (E_ICONV) when nothing was written. */
                     +	unsigned char* out;
                     +	/* check conv before touching any of its members */
                     +	if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer) {
                     +		return NULL;
                     +	}
                     +	out = conv->out_area.buffer;
                     +	if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
                     +		/* error encountered */
                     +		return NULL;
                     +	}
                     +	else {
                     +		const off_t input_limit  = in_m_area->length;
                     +		const unsigned char* input = in_m_area->buffer;
                     +		off_t input_offset = in_m_area->offset;
                     +		off_t limit = conv->out_area.length - 1; /* one byte is kept for the terminator */
                     +		off_t limit_prev = limit;
                     +		off_t i = 0;
+                    +
                     +		/* read_raw() ensures this condition */
                     +		assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset < input_limit));
+                    +
                     +		/* NOTE(review): the 4 bytes are copied from the start of the buffer, not from
                     +		 * input + input_offset - confirm offset is always 0 on the first call */
                     +		if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
                     +			size_t bom_len;
                     +			memcpy(conv->bom, input, 4);
                     +			process_bom(conv);
                     +			bom_len = bom_length(conv);
                     +			in_m_area->offset = input_offset = input_offset + bom_len;
                     +			conv->bom_cnt = 1;
                     +		}
+                    +
                     +		if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
                     +			cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
                     +			conv->linemode = 0;
                     +		}
+                    +
                     +		/* NOTE(review): in the loops below, when u16_normalize() returns NULL the loop
                     +		 * increment still runs before the `out` test ends the loop, so the stored offset
                     +		 * is past the character that was not written - confirm */
                     +		switch(conv->encoding_symbolic) {
                     +			case E_ICONV:/* only in block-mode */
                     +				/* normalize already converted characters from a previous pass
                     +				 * (output buffer was full, and we couldn't normalize more in previous pass) */
                     +				for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
                     +					const uint16_t c = get_u16(conv->norm_area.buffer, i);
                     +					NORMALIZE_CHAR(c, out, limit, 0);
                     +				}
                     +				conv->norm_area.offset = i;
                     +				if(limit > 0) {
                     +					conv->norm_area.length = conv->buffer_size;
                     +					in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
+                    +
                     +					/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
                     +					for(i = 0;i < conv->norm_area.length && limit >  0 && out; i += 2) {
                     +						const uint16_t c = get_u16(conv->norm_area.buffer, i);
                     +						NORMALIZE_CHAR(c, out, limit, 0);
                     +					}
                     +					if(i) {
                     +						conv->norm_area.offset = i;
                     +					}
                     +				}
                     +				if(limit == limit_prev) {
                     +					/* output pointer didn't move => EOF */
                     +					return NULL;
                     +				}
                     +				break;
                     +				/* out_area must have enough space to allow all bytes in norm_area normalized,
                     +				 * if we norm with &x;, then we need 7* space. */
                     +			default:
                     +				cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
                     +				conv->encoding_symbolic = E_OTHER;
                     +				/* fall through: handle it as E_OTHER */
                     +			case E_UNKNOWN:
                     +			case E_OTHER:
                     +				if(!input_limit) {
                     +					/* nothing to do, EOF */
                     +					return NULL;
                     +				}
                     +				/* each input byte is treated as one character; in line-mode the loop
                     +				 * stops after consuming a newline */
                     +				for(i = input_offset; i < input_limit && limit > 0 && out; i++) {
                     +					const uint16_t c = input[i];
                     +					NORMALIZE_CHAR(c, out, limit, conv->linemode);
                     +				}
                     +				in_m_area->offset = i;
                     +		}
+                    +
+                    +
                     +		if(conv->linemode) {
                     +			conv->linemode_processed += i - input_offset;
                     +		}
+                    +
                     +		if(limit < 0) limit = 0;
                     +/*		assert((unsigned)(conv->out_area.length - limit - 1) < conv->buffer_size);
                     +		assert(conv->out_area.length - limit - 1 >= 0); */
                     +		/* out_area.length - 1 - limit bytes were written; terminate right after them */
                     +		conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
                     +		return conv->out_area.buffer;
                     +	}
                     +}

libclamav/entconv.h

History View file @ 4e1127c

@@ -25,33 +25,33 @@
                      #include "hashtab.h"
                     -#define UCS4_1234 (const unsigned char*)"UCS-4LE"
                     -#define UCS4_4321 (const unsigned char*)"UCS-4BE"
                     -#define UCS4_2143 (const unsigned char*)"UCS4"
                     -#define UCS4_3412 (const unsigned char*)"UCS-4"
                     -#define UTF16_BE (const unsigned char*)"UTF-16BE"
                     -#define UTF16_LE (const unsigned char*)"UTF-16LE"
                     -#define UTF8     (const unsigned char*)"UTF-8"
                     +#define UCS4_1234 "UCS-4LE"
                     +#define UCS4_4321 "UCS-4BE"
                     +#define UCS4_2143 "UCS4"
                     +#define UCS4_3412 "UCS-4"
                     +#define UTF16_BE "UTF-16BE"
                     +#define UTF16_LE "UTF-16LE"
                     +#define UTF8     "UTF-8"
                      #define UNDECIDED_32_1234 UCS4_1234
                      #define UNDECIDED_32_4321 UCS4_4321
                      #define UNDECIDED_32_2143 UCS4_2143
                      #define UNDECIDED_32_3412 UCS4_3412
                      #define UNDECIDED_16_BE UTF16_BE
                      #define UNDECIDED_16_LE UTF16_LE
                     -#define UNDECIDED_8 (const unsigned char*)"ISO-8859-1"
                     -#define EBCDIC (const unsigned char*)"EBCDIC-US"
                     -#define UNKNOWN (const unsigned char*)"\0"
                     -#define OTHER   (const unsigned char*)"OTHER"
                     +#define UNDECIDED_8 "ISO-8859-1"
                     +#define EBCDIC "EBCDIC-US"
                     +#define UNKNOWN "\0"
                     +#define OTHER   "OTHER"
                     -enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META};
                     +enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};
                     -enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2134,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8,E_UNKNOWN,E_OTHER};
                     +enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
                      #define MAX_ENTITY_SIZE 22
                      struct entity_conv {
                      	unsigned char* encoding;
                     -	const unsigned char* autodetected;
                      	enum encoding_priority priority;
                     +	enum encodings encoding_symbolic;
                      	unsigned short int encoding_specific;/* sub-encoding, used for ISO*/
                      	const struct hashtable* ht;
                      	uint8_t has_bom;
@@ -60,26 +60,24 @@ struct entity_conv {
                      	uint8_t  bom_cnt;
                      	uint32_t partial;
                      	unsigned char bom[4];
                     -#if 0
                     -	char* buffer;
                     -	char* buffer2;
                     -#endif
                      	size_t buffer_size;
                      	size_t buffer_cnt;
                      	uint8_t entity_buffcnt;
                     +	void* iconv_struct;
                      	char entity_buff[MAX_ENTITY_SIZE+2];
                      	m_area_t tmp_area;
                      	m_area_t out_area;
                      	m_area_t norm_area;
                      	int      msg_zero_shown;
                     +	int      linemode;/* TODO:set */
                     +	int      linemode_processed;
                      };
+                    -
                     -int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size);
                     +int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
                      void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
                      int entity_norm_done(struct entity_conv* conv);
                     -unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen);
                     +unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
                      unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* entity);
                      int entitynorm_init(void);

libclamav/filetypes.c

History View file @ 4e1127c

@@ -185,37 +185,35 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
                      		    struct entity_conv conv;
                      		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
                     -		if(init_entity_converter(&conv,UNKNOWN,conv_size) == 0) {
                     -			int end = 0;
                     -			m_area_t area;
                     -			area.buffer = (unsigned char *) smallbuff;
                     -			area.length = bread;
                     -			area.offset = 0;
+                    -
                     -		    while(!end) {
                     -			if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
                     -			    return ret;
+                    -
                     -			decoded =  encoding_norm_readline(&conv, NULL, &area, bread);
+                    -
                     -			if(decoded) {
                     -			    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
                     -			    free(decoded);
                     -			    if(sret == CL_TYPE_HTML) {
                     -				ret = CL_TYPE_HTML;
                     -				end = 1;
                     +		    /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
                     +		    if(init_entity_converter(&conv, conv_size) == 0) {
                     +			    m_area_t area;
                     +			    area.buffer = (unsigned char *) smallbuff;
                     +			    area.length = bread;
                     +			    area.offset = 0;
+                    +
                     +			    /* switch to blockmode, so that we convert all the input buffer at once,
                     +			     * rather than line-by-line */
                     +			    process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
+                    +
                     +			    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
                     +				    return ret;
+                    +
                     +			    decoded =  encoding_norm_readline(&conv, NULL, &area);
+                    +
                     +			    if(decoded) {
                     +				    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
                     +				    if(sret == CL_TYPE_HTML) {
                     +					    ret = CL_TYPE_HTML;
                     +				    }
+                     			    }
                     -			} else
                     -			    end = 1;
                     -			cli_ac_freedata(&mdata);
                     -		    }
+                    -
                     -		    entity_norm_done(&conv);
                     +			    cli_ac_freedata(&mdata);
                     -		} else {
                     -		    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
                     -		}
                     +			    entity_norm_done(&conv);
                     +		    } else {
                     +			    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
                     +		    }
+                     	    }
+                     	}
+                         }

libclamav/htmlnorm.c

History View file @ 4e1127c

@@ -491,7 +491,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
+                     		}
+                     	}
                     -	if(dconf_entconv && (rc = init_entity_converter(&conv, UNKNOWN, 16384) )) {
                     +	if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
                      		if (!m_area) {
                      			fclose(stream_in);
+                     		}
@@ -502,7 +502,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      	tag_args.tag = NULL;
                      	tag_args.value = NULL;
                      	tag_args.contents = NULL;
+                    -
+                    +
                      	if (dirname) {
                      		snprintf(filename, 1024, "%s/rfc2397", dirname);
                      		if (mkdir(filename, 0700) && errno != EEXIST) {
@@ -514,14 +514,14 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
                      			goto abort;
+                     		}
+                    -
+                    +
                      		file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
                      		if (!file_buff_o2) {
                      			free(file_buff_o1);
                      			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
                      			goto abort;
+                     		}
+                    -
+                    +
                      		file_buff_script = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
                      		if (!file_buff_script) {
                      			free(file_buff_o1);
@@ -529,7 +529,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
                      			goto abort;
+                     		}
+                    -
+                    +
                      		snprintf(filename, 1024, "%s/comment.html", dirname);
                      		file_buff_o1->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
                      		if (!file_buff_o1->fd) {
@@ -574,12 +574,12 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      		file_buff_o2 = NULL;
                      		file_buff_script = NULL;
+                     	}
+                    -
+                    +
                      	binary = FALSE;
                      	if(dconf_entconv)
                     -		ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
                     -	else
                     +		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
                     +	else
                      		ptr = line = cli_readline(stream_in, m_area, 8192);
                      	while (line) {
@@ -766,7 +766,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					if (file_buff_o2 && (file_buff_o2->length > 0)) {
                      						file_buff_o2->length--;
+                     					}
+                    -
+                    +
                      					if (quoted != NOT_QUOTED) {
                      						html_output_c(file_buff_o1, file_buff_o2, '"');
+                     					}
@@ -783,7 +783,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					if (file_buff_o2 && (file_buff_o2->length > 0)) {
                      						file_buff_o2->length--;
+                     					}
+                    -
+                    +
                      					if (quoted != NOT_QUOTED) {
                      						html_output_c(file_buff_o1, file_buff_o2, '"');
+                     					}
@@ -832,7 +832,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
+                     						}
                      						ptr++;
                      					} else {
                     -						if (!escape && (quoted==DOUBLE_QUOTED)) {
                     +						if (!escape && (quoted==DOUBLE_QUOTED)) {
                      							html_output_c(file_buff_o1, file_buff_o2, '"');
                      							if (tag_val_length < HTML_STR_LENGTH) {
                      								tag_val[tag_val_length++] = '"';
@@ -880,7 +880,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
+                     					}
                      					ptr++;
+                     				}
+                    -
+                    +
                      				if (*ptr == '\\') {
                      					escape = TRUE;
                      				} else {
@@ -899,7 +899,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      				ptr++;
                      				break;
                      			case HTML_PROCESS_TAG:
+                    -
+                    +
                      				/* Default to no action for this tag */
                      				state = HTML_SKIP_WS;
                      				next_state = HTML_NORM;
@@ -938,6 +938,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      						in_script = TRUE;
+                     					}
                      					html_output_tag(file_buff_script, tag, &tag_args);
                     +				} else if (dconf_entconv && strcmp(tag, "body") == 0) {
                     +					/* no more charset changes accepted after body encountered */
                     +					process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
                      				} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
                      					const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
                      					const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
@@ -953,7 +956,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      							http_content2[i] = tolower(http_content[i]);
                      						http_content2[len] = '\0';
                      						charset = (unsigned char*) strstr((char*)http_content2,"charset");
                     -						if(charset) {
                     +						if(charset) {
                      							while(*charset && *charset != '=')
                      								charset++;
                      							if(*charset)
@@ -1011,8 +1014,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
                      						const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action");
                      						if (arg_action_value) {
                     -							if(in_form_action)
                     -								free(in_form_action);
                     +							if(in_form_action)
                     +								free(in_form_action);
                      							in_form_action = cli_strdup(arg_action_value);
+                     						}
                      					} else if (strcmp(tag, "img") == 0) {
@@ -1077,7 +1080,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      									in_form_action + strlen(in_form_action));
                      								html_tag_contents_done(hrefs,hrefs->count);
+                     							}
                     -						}
                     +						}
+                     					}
                      					/* TODO:imagemaps can have urls too */
+                     				}
@@ -1123,7 +1126,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      						html_output_c(file_buff_o1, file_buff_o2, '&');
                      						if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
                      								tag_val[tag_val_length++] = '&';
                     -						}
                     +						}
                      						for(i=0; i < entity_val_length; i++) {
                      							const char c = tolower(entity_val[i]);
                      							html_output_c(file_buff_o1, file_buff_o2, c);
@@ -1266,7 +1269,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      						case 0x24:
                      							html_output_c(file_buff_o1, file_buff_o2, 0x40);
                      							html_output_c(file_buff_script, NULL, 0x40);
                     -							break;
                     +							break;
                      						case 0x26:
                      							html_output_c(file_buff_o1, file_buff_o2, 0x0a);
                      							html_output_c(file_buff_script, NULL, 0x0a);
@@ -1285,7 +1288,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      				ptr++;
                      				length--;
                      				break;
+                    -
+                    +
                      			case HTML_RFC2397_TYPE:
                      				if (*ptr == '\'') {
                      					if (!escape && (quoted==SINGLE_QUOTED)) {
@@ -1340,7 +1343,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					escape = FALSE;
                      					next_state = HTML_BAD_STATE;
                      					ptr++;
+                    -
+                    +
                      				} else {
                      					if (tag_val_length < HTML_STR_LENGTH) {
                      						tag_val[tag_val_length++] = tolower(*ptr);
@@ -1370,7 +1373,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      						goto abort;
+                     					}
                      					file_tmp_o1->length = 0;
+                    -
+                    +
                      					html_output_str(file_tmp_o1, "From html-normalise\n", 20);
                      					html_output_str(file_tmp_o1, "Content-type: ", 14);
                      					if ((tag_val_length == 0) && (*tag_val == ';')) {
@@ -1455,7 +1458,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					html_output_c(file_tmp_o1, NULL, '%');
+                     				}
                      				state = HTML_RFC2397_DATA;
                     -				break;
                     +				break;
                      			case HTML_ESCAPE_CHAR:
                      				value *= 16;
                      				length++;
@@ -1472,22 +1475,23 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					state = next_state;
+                     				}
                      				ptr++;
                     -				break;
                     +				break;
+                     			}
+                     		}
                      		if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
                      			/* end of line, append contents now, resume on next line */
                      			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
                      		ptrend = NULL;
                     -		free(line);
                     - 		if(dconf_entconv)
                     - 			ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
                     - 		else
                     - 			ptr = line = cli_readline(stream_in, m_area, 8192);
                     +		if(dconf_entconv)
                     +			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
                     +		else {
                     +			free(line);
                     +			ptr = line = cli_readline(stream_in, m_area, 8192);
                     +		}
+                     	}
+                    -
                     - 	if(dconf_entconv) {
                     - 		/* handle "unfinished" entitites */
+                    +
                     +	if(dconf_entconv) {
                     +		/* handle "unfinished" entitites */
                      		size_t i;
                      		unsigned char* normalized;
                      		entity_val[entity_val_length] = '\0';