Browse code

Use entconv to detect UTF-16BE and UCS-4 variants; use only cli_readline(), since we don't need exact conversion; drop unused functions; simplify encoding_norm_readline() and rename it to encoding_normalize_toascii().

git-svn: trunk@3571

Török Edvin authored on 2008/02/02 04:38:52
Showing 6 changed files
... ...
@@ -1,3 +1,12 @@
1
+Fri Feb  1 21:19:58 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/filetypes.c: use entconv to detect UTF-16BE, and UCS-4 variants
4
+  * libclamav/htmlnorm.c: use only cli_readline() we don't need exact
5
+  conversion
6
+  * libclamav/entconv.c:
7
+	* drop unused functions,
8
+  	* simplify encoding_norm_readline(), and rename to encoding_normalize_toascii()
9
+
1 10
 Fri Feb  1 00:58:05 CET 2008 (tk)
2 11
 ---------------------------------
3 12
   * libclamav: ndb sigs: add new target type (7) for ASCII files; handle
... ...
@@ -127,67 +127,7 @@ const char* entity_norm(struct entity_conv* conv,const unsigned char* entity)
127 127
 	return NULL;
128 128
 }
129 129
 
130
-/* sane default, must be larger, than the longest possible return string,
131
- * which is
132
- * &#xxx;*/
133
-#define MIN_BUFFER_SIZE 32
134
-
135
-#define LINEMODE_LIMIT 16384
136
-
137
-int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
138
-{
139
-	if(buffer_size < MIN_BUFFER_SIZE) {
140
-		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
141
-		return CL_ENULLARG;
142
-	}
143
-	if(conv) {
144
-		conv->encoding = NULL;
145
-		conv->encoding_symbolic = E_UNKNOWN;
146
-		conv->bom_cnt = 0;
147
-		conv->buffer_size = buffer_size;
148
-		conv->priority = NOPRIO;
149
-		/* start in linemode */
150
-		conv->linemode = 1;
151
-		conv->linemode_processed = 0;
152
-
153
-		conv->tmp_area.offset = 0;
154
-		conv->tmp_area.length = 0;
155
-		conv->tmp_area.buffer  =  cli_malloc(buffer_size);
156
-		if(!conv->tmp_area.buffer) {
157
-			return CL_EMEM;
158
-		}
159
-
160
-		conv->out_area.offset = 0;
161
-		conv->out_area.length = buffer_size;
162
-		conv->out_area.buffer = cli_malloc(buffer_size);
163
-		if(!conv->out_area.buffer) {
164
-			free(conv->tmp_area.buffer);
165
-			return CL_EMEM;
166
-		}
167
-
168
-		conv->buffer_size = buffer_size;
169
-		conv->norm_area.offset = 0;
170
-		conv->norm_area.length = 0;
171
-		conv->norm_area.buffer = cli_malloc(buffer_size);
172
-		if(!conv->norm_area.buffer) {
173
-			free(conv->tmp_area.buffer);
174
-			free(conv->out_area.buffer);
175
-			return CL_EMEM;
176
-		}
177
-
178
-		conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
179
-		if(!conv->iconv_struct) {
180
-			free(conv->tmp_area.buffer);
181
-			free(conv->out_area.buffer);
182
-			free(conv->norm_area.buffer);
183
-			return CL_EMEM;
184
-		}
185
-		return 0;
186
-	}
187
-	else 
188
-		return CL_ENULLARG;
189
-}
190
-
130
+#ifndef HAVE_ICONV
191 131
 static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
192 132
 {
193 133
 	/* special case for these unusual byteorders */
... ...
@@ -217,7 +157,6 @@ static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
217 217
 	}
218 218
 }
219 219
 
220
-#ifndef HAVE_ICONV
221 220
 static iconv_t iconv_open(const char *tocode, const char* fromcode)
222 221
 {
223 222
 	iconv_t iconv = cli_malloc(sizeof(*iconv));
... ...
@@ -236,7 +175,6 @@ static int iconv_close(iconv_t cd)
236 236
 	return 0;
237 237
 }
238 238
 
239
-
240 239
 static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
241 240
 		char** outbuf, size_t *outbytesleft)
242 241
 {
... ...
@@ -426,14 +364,11 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
426 426
 
427 427
 #endif
428 428
 
429
-/* new iconv() version */
430
-static inline void process_bom(struct entity_conv* conv)
429
+static inline const char* detect_encoding(const unsigned char* bom, uint8_t* bom_found, uint8_t* enc_width)
431 430
 {
432
-	const unsigned char* bom = conv->bom;
433
-	const char* encoding = NULL;
431
+	const char* encoding;
434 432
 	int has_bom = 0;
435
-	uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
436
-
433
+	uint8_t enc_bytes = 1; /* default is UTF8, which has a minimum of 1 bytes */
437 434
 	/* undecided 32-bit encodings are treated as ucs4, and
438 435
 	 * 16 bit as utf16*/
439 436
 	switch(bom[0]) {
... ...
@@ -442,23 +377,28 @@ static inline void process_bom(struct entity_conv* conv)
442 442
 				if(bom[2] == 0xFE && bom[3] == 0xFF) {
443 443
 					encoding = UCS4_1234;/* UCS-4 big-endian*/
444 444
 					has_bom = 1;
445
+					enc_bytes = 4;
445 446
 				}
446 447
 				else if(bom[2] == 0xFF && bom[3] == 0xFE) {
447 448
 					encoding = UCS4_2143;/* UCS-4 unusual order 2143 */
448 449
 					has_bom = 1;
450
+					enc_bytes = 4;
449 451
 				}
450 452
 				else if(bom[2] == 0x00 && bom[3] == 0x3C) {
451 453
 					/* undecided, treat as ucs4 */
452 454
 					encoding = UCS4_1234;
455
+					enc_bytes = 4;
453 456
 				}
454 457
 				else if(bom[2] == 0x3C && bom[3] == 0x00) {
455 458
 					encoding = UCS4_2143;
459
+					enc_bytes = 4;
456 460
 				}
457 461
 			}/* 0x00 0x00 */
458 462
 			else if(bom[1] == 0x3C) {
459 463
 				if(bom[2] == 0x00) {
460 464
 					if(bom[3] == 0x00) {
461 465
 						encoding = UCS4_3412;
466
+						enc_bytes = 4;
462 467
 					}
463 468
 					else if(bom[3] == 0x3F) {
464 469
 						encoding = UTF16_BE;
... ...
@@ -471,6 +411,7 @@ static inline void process_bom(struct entity_conv* conv)
471 471
 			if(bom[1] == 0xFE) {
472 472
 				if(bom[2] == 0x00 && bom[3] == 0x00) {
473 473
 					encoding = UCS4_4321;
474
+					enc_bytes = 4;
474 475
 					has_bom = 1;
475 476
 				}
476 477
 				else {
... ...
@@ -484,6 +425,7 @@ static inline void process_bom(struct entity_conv* conv)
484 484
 			if(bom[1] == 0xFF) {
485 485
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
486 486
 						encoding = UCS4_3412;
487
+						enc_bytes = 4;
487 488
 						has_bom = 1;
488 489
 					}
489 490
 					else {
... ...
@@ -504,6 +446,7 @@ static inline void process_bom(struct entity_conv* conv)
504 504
 				if(bom[1] == 0x00) {
505 505
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
506 506
 						encoding = UCS4_4321;
507
+						enc_bytes = 4;
507 508
 					}
508 509
 					else if(bom[2] == 0x3F && bom[3] == 0x00) {
509 510
 						encoding = UTF16_LE;
... ...
@@ -523,12 +466,19 @@ static inline void process_bom(struct entity_conv* conv)
523 523
 				}/*4C 6F A7 94*/
524 524
 				break;
525 525
 	}/*switch*/
526
-	if(encoding) {
527
-		cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
528
-		process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
529
-	}
530
-	conv->enc_bytes = enc_bytes;
531
-	conv->has_bom = has_bom;
526
+	*enc_width = enc_bytes;
527
+	*bom_found = has_bom;
528
+	return encoding;
529
+}
530
+
531
+/* detects UTF-16(LE/BE), UCS-4(all 4 variants).
532
+ * UTF-8 and simple ASCII are ignored, because we can process those as text */
533
+const char* encoding_detect_bom(const unsigned char* bom)
534
+{
535
+	uint8_t has_bom;
536
+	uint8_t enc_width;
537
+	const char* encoding = detect_encoding(bom, &has_bom, &enc_width);
538
+	return enc_width > 1 ? encoding : NULL;
532 539
 }
533 540
 
534 541
 /*()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
... ...
@@ -575,53 +525,6 @@ static char* normalize_encoding(const unsigned char* enc)
575 575
 	return norm;
576 576
 }
577 577
 
578
-static int encoding_norm_done(struct entity_conv* conv)
579
-{
580
-	if(conv->encoding) {
581
-		free(conv->encoding);
582
-		conv->encoding = NULL;
583
-	}
584
-	conv->buffer_size = 0;
585
-	if(conv->tmp_area.buffer) {
586
-		free(conv->tmp_area.buffer);
587
-		conv->tmp_area.buffer = NULL;
588
-	}
589
-	if(conv->out_area.buffer) {
590
-		free(conv->out_area.buffer);
591
-		conv->out_area.buffer = NULL;
592
-	}
593
-	if(conv->norm_area.buffer) {
594
-		free(conv->norm_area.buffer);
595
-		conv->norm_area.buffer = NULL;
596
-	}
597
-	if(conv->iconv_struct) {
598
-		free(conv->iconv_struct);
599
-	}
600
-	return 0;
601
-}
602
-
603
-int entity_norm_done(struct entity_conv* conv)
604
-{
605
-	return encoding_norm_done(conv);
606
-}
607
-
608
-static unsigned short bom_length(struct entity_conv* conv)
609
-{
610
-	if(conv->has_bom) {
611
-		switch(conv->enc_bytes) {
612
-			case 1:
613
-				if(conv->encoding_symbolic == E_UTF8) {
614
-					return 3;
615
-				}
616
-				break;
617
-			case 2:
618
-				return 2;
619
-			case 4:
620
-				return 4;
621
-		}
622
-	}
623
-	return 0;
624
-}
625 578
 /* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times,
626 579
  * just keep on each thread its own pool of iconvs*/
627 580
 
... ...
@@ -774,99 +677,36 @@ static iconv_t iconv_open_cached(const char* fromcode)
774 774
 	cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
775 775
 	iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
776 776
 	if(iconv_struct != (iconv_t)-1) {
777
-	idx = cache->last++;
778
-	if(idx >= cache->len) {
779
-		cache->len += 16;
780
-		cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
781
-		if(!cache->tab) {
782
-			cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
783
-			errno = ENOMEM;
784
-			return (iconv_t)-1;
777
+		idx = cache->last++;
778
+		if(idx >= cache->len) {
779
+			cache->len += 16;
780
+			cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
781
+			if(!cache->tab) {
782
+				cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
783
+				errno = ENOMEM;
784
+				return (iconv_t)-1;
785
+			}
785 786
 		}
786
-	}
787 787
 
788
-	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
788
+		hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
789 789
 		cache->tab[idx] = iconv_struct;
790
-	cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
791
-	return cache->tab[idx];
792
-}
793
-	return (iconv_t)-1;
794
-}
795
-
796
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
797
-{
798
-	char *tmp_encoding;
799
-	enum encodings tmp;
800
-	size_t new_size,old_size;
801
-
802
-	if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
803
-		if(conv->linemode) {
804
-			cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
805
-			conv->linemode = 0;
806
-		}
807
-		return;
808
-	}
809
-
810
-	cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio);
811
-
812
-	if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
813
-		cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
814
-		return;
815
-		/* Content-type in header is highest priority, no overrides possible.
816
-		 * Also no overrides after an encoding has been set.*/
817
-	}
818
-
819
-	/* validate encoding name, and normalize to uppercase */
820
-	if(!(tmp_encoding = normalize_encoding(encoding))) {
821
-		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
822
-		return;
823
-	}
824
-
825
-	/* don't allow to change between unicode encodings that have different byte-size */
826
-	if(prio == META) {
827
-		/* need to consider minimum size of an encoding here */
828
-		old_size =  conv->enc_bytes;
829
-		new_size = encoding_bytes(tmp_encoding,&tmp);
830
-		if(old_size != new_size)  {
831
-			/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
832
-			cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
833
-			free(tmp_encoding);
834
-			return;
835
-		}
836
-	}
837
-
838
-	conv->encoding = tmp_encoding;
839
-	cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
840
-	*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
841
-	if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
842
-		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
843
-		/* message shown only once/file */
844
-		/* what can we do? short-circuit iconv */
845
-		free(conv->encoding);
846
-		conv->encoding = NULL;
847
-		/* we will process using whatever we currently have for encoding_symbolic.
848
-		 * If encoding was already set to iconv, we shouldn't be here.*/
849
-		assert(conv->encoding_symbolic != E_ICONV);
850
-	} else {
851
-		cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
852
-		conv->encoding_symbolic = E_ICONV;
853
-		conv->priority = prio;
854
-		conv->linemode = 0;
790
+		cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
791
+		return cache->tab[idx];
855 792
 	}
793
+	return (iconv_t)-1;
856 794
 }
857 795
 
858
-static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
796
+static int in_iconv_u16(const m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
859 797
 {
860 798
 	char   tmp4[4];
861 799
 	size_t inleft = in_m_area->length - in_m_area->offset;
862 800
 	size_t rc, alignfix;
863 801
 	char*  input   = (char*)in_m_area->buffer + in_m_area->offset;
864
-	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
802
+	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;
865 803
 	char* out      = (char*)out_m_area->buffer;
866 804
 
805
+	out_m_area->offset = 0;
867 806
 	if(!inleft) {
868
-		/* EOF */
869
-		out_m_area->offset = out_m_area->length = 0;
870 807
 		return 0;
871 808
 	}
872 809
 	/* convert encoding conv->tmp_area. conv->out_area */
... ...
@@ -886,7 +726,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
886 886
 	while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */
887 887
 		const size_t outleft_last = outleft;
888 888
 		assert(*iconv_struct != (iconv_t)-1);
889
-		rc = iconv(*iconv_struct, (char**) &input,  &inleft, (char**) &out, &outleft);
889
+		rc = iconv(*iconv_struct, &input,  &inleft, &out, &outleft);
890 890
 		if(rc == (size_t)-1) {
891 891
 			if(errno == E2BIG) {
892 892
 				/* not enough space in output buffer */
... ...
@@ -909,9 +749,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
909 909
 		*out++ = *input++;
910 910
 		inleft--;
911 911
 	}
912
-	/* length - offset - alignfix is original value of inleft, new value is inleft, 
913
-	 * difference tells how much it moved. */
914
-	in_m_area->offset = in_m_area->length - alignfix - inleft;
912
+	cli_dbgmsg("in_iconv_u16: unprocessed bytes: %lu\n", (unsigned long)inleft);
915 913
 	if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
916 914
 		out_m_area->length -= (off_t)outleft;
917 915
 	} else {
... ...
@@ -922,156 +760,36 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou
922 922
 	return 0;
923 923
 }
924 924
 
925
-
926
-#define NORMALIZE_CHAR(c, out, limit, linemode) \
927
-{\
928
-	        if (linemode && c == '\n') {\
929
-			i++;\
930
-			break;\
931
-		} else {\
932
-			unsigned char* out_new = u16_normalize(c, out, limit);\
933
-			if(out_new) {\
934
-				limit -= out_new - out;\
935
-			}\
936
-			out = out_new;\
937
-		}\
938
-}
939
-
940
-/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
941
- * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */
942
-#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit))
943
-#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length))
944
-
945
-/* EOF marker is m_area->length == 0 */
946
-
947
-/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read.
948
- * When we can't read anything due to EOF ->length will be set to 0.
949
- * bounds checks offset and length*/
950
-static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
925
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area)
951 926
 {
952
-	if(!m_area) {
953
-		size_t iread;
927
+	iconv_t iconv_struct;
928
+	off_t i, j;
929
+	char *encoding;
954 930
 
955
-		m_area = &conv->tmp_area;
956
-		if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
957
-			return m_area;
958
-		}
959
-		/* offset out of bounds -> all the buffer was processed, fill it again */
960
-		iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
961
-		m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
962
-		m_area->offset = 0;
963
-		if(ferror(stream)) {
964
-			cli_errmsg("Error while reading HTML stream\n");
965
-		}
966
-	} else {
967
-		if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
968
-			cli_dbgmsg(MODULE_NAME "EOF reached\n");
969
-			m_area->offset = m_area->length; /* EOF marker */
970
-		}
931
+	if(!initial_encoding || !in_m_area || !out_m_area) {
932
+		return CL_ENULLARG;
971 933
 	}
972
-	return m_area;
973
-}
974
-
975
-static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
976
-{
977
-	return ((uint16_t)buf[i] << 8) | buf[i+1];
978
-}
979 934
 
980
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
981
-{
982
-	unsigned char* out = conv->out_area.buffer;
983
-	if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) {
984
-		return NULL;
985
-	}
986
-	if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
987
-		/* error encountered */
988
-		return NULL;
935
+	encoding = normalize_encoding((const unsigned char*)initial_encoding);
936
+	if(!encoding) {
937
+		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
938
+		return -1;
989 939
 	}
990
-	else {
991
-		const off_t input_limit  = in_m_area->length;
992
-		const unsigned char* input = in_m_area->buffer;
993
-		off_t input_offset = in_m_area->offset;
994
-		off_t limit = conv->out_area.length - 1;
995
-		off_t limit_prev = limit;
996
-		off_t i = 0;
997
-
998
-		/* read_raw() ensures this condition */
999
-		assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit));
1000
-
1001
-		if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
1002
-			size_t bom_len;
1003
-			memcpy(conv->bom, input, 4);
1004
-			process_bom(conv);
1005
-			bom_len = bom_length(conv);
1006
-			in_m_area->offset = input_offset = input_offset + bom_len;
1007
-			conv->bom_cnt = 1;
1008
-		}
1009
-
1010
-		if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
1011
-			cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
1012
-			conv->linemode = 0;
1013
-		}
1014
-
1015
-		switch(conv->encoding_symbolic) {
1016
-			case E_ICONV:/* only in block-mode */
1017
-				/* normalize already converted characters from a previous pass
1018
-				 * (output buffer was full, and we couldn't normalize more in previous pass) */
1019
-				for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
1020
-					const uint16_t c = get_u16(conv->norm_area.buffer, i);
1021
-					NORMALIZE_CHAR(c, out, limit, 0);
1022
-				}
1023
-				conv->norm_area.offset = i;
1024
-			        if(limit > 0) {
1025
-					conv->norm_area.length = conv->buffer_size;
1026
-					in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
1027
-
1028
-					/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
1029
-					for(i = 0;i < conv->norm_area.length && limit >  0 && out; i += 2) {
1030
-						const uint16_t c = get_u16(conv->norm_area.buffer, i);
1031
-						NORMALIZE_CHAR(c, out, limit, 0);
1032
-					}
1033
-					if(i) {
1034
-						conv->norm_area.offset = i;
1035
-					}
1036
-				}
1037
-				if(limit == limit_prev) {
1038
-					/* output pointer didn't move => EOF */
1039
-					return NULL;
1040
-				}
1041
-				break;
1042
-				/* out_area must have enough space to allow all bytes in norm_area normalized,
1043
-				 * if we norm with &x;, then we need 7* space. */
1044
-			default:
1045
-				cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
1046
-				conv->encoding_symbolic = E_OTHER;
1047
-			case E_UNKNOWN:
1048
-			case E_OTHER:
1049
-				if(!input_limit || input_offset == input_limit) {
1050
-					/* nothing to do, EOF */
1051
-					return NULL;
1052
-				}
1053
-				for(i = input_offset; i < input_limit && limit > 0; i++) {
1054
-					const unsigned char c = input[i];
1055
-					if(conv->linemode && c == '\n') {
1056
-						i++;
1057
-						break;
1058
-					}
1059
-					if(c) {
1060
-						*out++ = c;
1061
-						limit--;
1062
-					}
1063
-				}
1064
-				in_m_area->offset = i;
1065
-		}
1066 940
 
1067
-
1068
-		if(conv->linemode) {
1069
-			conv->linemode_processed += i - input_offset;
941
+	cli_dbgmsg(MODULE_NAME "Encoding %s\n", encoding);
942
+	iconv_struct = iconv_open_cached( encoding );
943
+	if(iconv_struct == (iconv_t)-1) {
944
+		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open(): %s\n", encoding);
945
+		free(encoding);
946
+		return -1;
947
+	}
948
+	in_iconv_u16(in_m_area, &iconv_struct, out_m_area);
949
+	for(i = 0, j = 0; i < out_m_area->length ; i += 2) {
950
+		const unsigned char c = (out_m_area->buffer[i] << 4) + out_m_area->buffer[i+1];
951
+		if(c) {
952
+			out_m_area->buffer[j++] = c;
1070 953
 		}
1071
-
1072
-		if(limit < 0) limit = 0;
1073
-		conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
1074
-		return conv->out_area.buffer;
1075 954
 	}
955
+	out_m_area->length = j;
956
+	return 0;
1076 957
 }
1077
-
... ...
@@ -44,37 +44,20 @@
44 44
 #define UNKNOWN "\0"
45 45
 #define OTHER   "OTHER"
46 46
 
47
+
47 48
 enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};
48 49
 
49 50
 enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
50 51
 #define MAX_ENTITY_SIZE 22
51 52
 
52 53
 struct entity_conv {
53
-	char* encoding;
54
-	enum encoding_priority priority;
55
-	enum encodings encoding_symbolic;
56
-	size_t buffer_size;
57
-	void* iconv_struct;
58 54
 	unsigned char entity_buff[MAX_ENTITY_SIZE+2];
59
-	m_area_t tmp_area;
60
-	m_area_t out_area;
61
-	m_area_t norm_area;
62
-	int      linemode;/* TODO:set */
63
-	int      linemode_processed;
64
-	unsigned char bom[4];
65
-	uint8_t has_bom;
66
-	uint8_t enc_bytes;
67
-	uint8_t  bom_cnt;
68 55
 };
69 56
 
70
-int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
71
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
72
-int entity_norm_done(struct entity_conv* conv);
73
-
74 57
 unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size);
75
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
76 58
 const char* entity_norm(struct entity_conv* conv,const unsigned char* entity);
77
-int entitynorm_init(void);
59
+const char* encoding_detect_bom(const unsigned char* bom);
60
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area);
78 61
 
79 62
 #endif
80 63
 
... ...
@@ -182,37 +182,42 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
182 182
 	    cli_ac_freedata(&mdata);
183 183
 
184 184
 	    if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
185
-		    struct entity_conv conv;
186
-		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
187
-
188
-		    /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
189
-		    if(init_entity_converter(&conv, conv_size) == 0) {
190
-			    m_area_t area;
191
-			    area.buffer = (unsigned char *) smallbuff;
192
-			    area.length = bread;
193
-			    area.offset = 0;
194
-
195
-			    /* switch to blockmode, so that we convert all the input buffer at once,
196
-			     * rather than line-by-line */
197
-			    process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
198
-
199
-			    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
200
-				    return ret;
201
-
202
-			    decoded =  encoding_norm_readline(&conv, NULL, &area);
203
-
204
-			    if(decoded) {
205
-				    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
206
-				    if(sret == CL_TYPE_HTML) {
207
-					    ret = CL_TYPE_HTML;
185
+		    const char* encoding;
186
+
187
+		    /* check if we can autodetect this encoding.
188
+		     * If we can't don't try to detect HTML sig, since
189
+		     * we just tried that above, and failed */
190
+		    if((encoding = encoding_detect_bom(smallbuff))) {
191
+			    unsigned char decodedbuff[sizeof(smallbuff)*2];
192
+			    m_area_t in_area, out_area;
193
+
194
+			    in_area.buffer = (unsigned char *) smallbuff;
195
+			    in_area.length = bread;
196
+			    in_area.offset = 0;
197
+			    out_area.buffer = decodedbuff;
198
+			    out_area.length = sizeof(decodedbuff);
199
+			    out_area.offset = 0;
200
+
201
+			    /* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode 
202
+			     * (multibyte characters will not be exactly handled, but that is not a problem).
203
+			     * However when detecting whether a file is HTML or not, we need exact conversion.
204
+			     * (just eliminating zeros and matching would introduce false positives */
205
+			    if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
206
+				    out_area.buffer[out_area.length] = '\0';
207
+				    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
208
+					    return ret;
209
+
210
+				    if(out_area.length > 0) {
211
+					    sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
212
+					    if(sret == CL_TYPE_HTML) {
213
+						    cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
214
+						    /* htmlnorm is able to handle any unicode now, since it skips null chars */
215
+						    ret = CL_TYPE_HTML;
216
+					    }
208 217
 				    }
209
-			    }
210 218
 
211
-			    cli_ac_freedata(&mdata);
212
-
213
-			    entity_norm_done(&conv);
214
-		    } else {
215
-			    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
219
+				    cli_ac_freedata(&mdata);
220
+			    }
216 221
 		    }
217 222
 	    }
218 223
 	}
... ...
@@ -542,13 +542,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
542 542
 		}
543 543
 	}
544 544
 
545
-	if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
546
-		if (!m_area) {
547
-			fclose(stream_in);
548
-		}
549
-		return rc;
550
-	}
551
-
552 545
 	tag_args.count = 0;
553 546
 	tag_args.tag = NULL;
554 547
 	tag_args.value = NULL;
... ...
@@ -628,10 +621,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
628 628
 
629 629
 	binary = FALSE;
630 630
 
631
-	if(dconf_entconv)
632
-		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
633
-	else
634
-		ptr = line = cli_readchunk(stream_in, m_area, 8192);
631
+	ptr = line = cli_readchunk(stream_in, m_area, 8192);
635 632
 
636 633
 	while (line) {
637 634
 		if(href_contents_begin)
... ...
@@ -989,37 +979,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
989 989
 						in_script = TRUE;
990 990
 					}
991 991
 					html_output_tag(file_buff_script, tag, &tag_args);
992
-				} else if (dconf_entconv && strcmp(tag, "body") == 0) {
993
-					/* no more charset changes accepted after body encountered */
994
-					process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
995
-				} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
996
-					const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
997
-					const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
998
-					if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) {
999
-						size_t len = strlen((const char*)http_content);
1000
-						unsigned char* http_content2 = cli_malloc( len + 1);
1001
-						unsigned char* charset;
1002
-						size_t i;
1003
-
1004
-						if(!http_content2)
1005
-							return CL_EMEM;
1006
-						for(i = 0; i < len; i++)
1007
-							http_content2[i] = tolower(http_content[i]);
1008
-						http_content2[len] = '\0';
1009
-						charset = (unsigned char*) strstr((char*)http_content2,"charset");
1010
-						if(charset) {
1011
-							while(*charset && *charset != '=')
1012
-								charset++;
1013
-							if(*charset)
1014
-								charset++;/* skip = */
1015
-							len = strcspn((const char*)charset," \"'");
1016
-							charset[len] = '\0';
1017
-							if(len) {
1018
-								process_encoding_set(&conv, charset, META);
1019
-							}
1020
-						}
1021
-						free(http_content2);
1022
-					}
1023 992
 				} else if (hrefs) {
1024 993
 					if(in_ahref && !href_contents_begin)
1025 994
 						href_contents_begin=ptr;
... ...
@@ -1533,12 +1492,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1533 1533
 			/* end of line, append contents now, resume on next line */
1534 1534
 			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
1535 1535
 		ptrend = NULL;
1536
-		if(dconf_entconv)
1537
-			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
1538
-		else {
1539
-			free(line);
1540
-			ptr = line = cli_readchunk(stream_in, m_area, 8192);
1541
-		}
1536
+		free(line);
1537
+		ptr = line = cli_readchunk(stream_in, m_area, 8192);
1542 1538
 	}
1543 1539
 
1544 1540
 	if(dconf_entconv) {
... ...
@@ -1566,8 +1521,6 @@ abort:
1566 1566
 	if (in_ahref) /* tag not closed, force closing */
1567 1567
 		html_tag_contents_done(hrefs,in_ahref);
1568 1568
 
1569
-	if(dconf_entconv)
1570
-		entity_norm_done(&conv);
1571 1569
 	html_tag_arg_free(&tag_args);
1572 1570
 	if (!m_area) {
1573 1571
 		fclose(stream_in);
... ...
@@ -1593,11 +1546,11 @@ abort:
1593 1593
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
1594 1594
 {
1595 1595
 	m_area_t m_area;
1596
-	
1596
+
1597 1597
 	m_area.buffer = in_buff;
1598 1598
 	m_area.length = in_size;
1599 1599
 	m_area.offset = 0;
1600
-	
1600
+
1601 1601
 	return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
1602 1602
 }
1603 1603
 
... ...
@@ -1607,7 +1560,7 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const
1607 1607
 	int retval=FALSE;
1608 1608
 	m_area_t m_area;
1609 1609
 	struct stat statbuf;
1610
-	
1610
+
1611 1611
 	if (fstat(fd, &statbuf) == 0) {
1612 1612
 		m_area.length = statbuf.st_size;
1613 1613
 		m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0);
... ...
@@ -36,7 +36,7 @@ typedef struct m_area_tag {
36 36
 } m_area_t;
37 37
 
38 38
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
39
-int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
39
+int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf);
40 40
 void html_tag_arg_free(tag_arguments_t *tags);
41 41
 int html_screnc_decode(int fd, const char *dirname);
42 42