Browse code

AC_TRY_LINK already adds a main(), remove duplicate main(). entconv improvements to improve security and performance, Part I for (bb #686, #386). TODO: * optimize entity_norm * create testfiles for unicode encoding variants * create a regression test * check for memory leaks

git-svn: trunk@3511

Török Edvin authored on 2008/01/21 07:18:14
Showing 7 changed files
... ...
@@ -1,3 +1,13 @@
1
+Sun Jan 20 23:49:41 EET 2008 (edwin)
2
+------------------------------------
3
+  * configure: AC_TRY_LINK already adds a main(), remove duplicate main()
4
+  * libclamav: entconv improvements to improve security and performance
5
+		Part I for  (bb #686, #386)
6
+	       TODO: * optimize entity_norm
7
+	             * create testfiles for unicode encoding variants
8
+		     * create a regression test
9
+		     * check for memory leaks
10
+
1 11
 Sat Jan 19 14:41:50 CET 2008 (acab)
2 12
 -----------------------------------
3 13
   * test: using splitted instead of byteswapped files
... ...
@@ -11397,15 +11397,12 @@ int
11397 11397
 main ()
11398 11398
 {
11399 11399
 
11400
-int main(int argc, char** argv) {
11401 11400
 	char** xin,**xout;
11402 11401
 	unsigned il,ol;
11403 11402
 	int rc;
11404 11403
 	iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8");
11405 11404
 	rc = iconv(iconv_struct,xin,&il,xout,&ol);
11406 11405
 	iconv_close(iconv_struct);
11407
-	return 0;
11408
-}
11409 11406
 
11410 11407
   ;
11411 11408
   return 0;
... ...
@@ -230,15 +230,12 @@ if test "X$wiconv" != "Xno"; then
230 230
 	AC_TRY_LINK([
231 231
 		     #include <iconv.h>
232 232
 	],[
233
-int main(int argc, char** argv) {
234 233
 	char** xin,**xout;
235 234
 	unsigned il,ol;
236 235
 	int rc;
237 236
 	iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8");
238 237
 	rc = iconv(iconv_struct,xin,&il,xout,&ol);
239 238
 	iconv_close(iconv_struct);
240
-	return 0;
241
-}
242 239
 ],[
243 240
    AC_MSG_RESULT(yes)
244 241
    AC_DEFINE(HAVE_ICONV, 1, [iconv() available])
... ...
@@ -32,6 +32,12 @@
32 32
 #include <pthread.h>
33 33
 #endif
34 34
 
35
+#ifndef CL_DEBUG
36
+#define NDEBUG
37
+#endif
38
+
39
+#include <assert.h>
40
+
35 41
 #include "clamav.h"
36 42
 #include "others.h"
37 43
 #include "htmlnorm.h"
... ...
@@ -46,6 +52,7 @@
46 46
 
47 47
 #include "encoding_aliases.h"
48 48
 
49
+#define MODULE_NAME "entconv: "
49 50
 
50 51
 #define MAX_LINE 1024
51 52
 
... ...
@@ -58,11 +65,12 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
58 58
 	struct element* e = hashtab_find(conv->ht,entity,strlen((const char*)entity));
59 59
 	if(e && e->key) {
60 60
 		const int val = e->data;
61
+		/* TODO: don't allocate memory here, but use a buffer in struct entity_conv */
61 62
 		if(val == '<')/* this was an escaped <, so output it escaped*/
62 63
 			return (unsigned char*)cli_strdup("&lt;");
63 64
 		else if(val == '>')/* see above */
64 65
 			return (unsigned char*)cli_strdup("&gt;");
65
-		else if(val<127) {
66
+		else if(val >= 0 && val <= 0xff) {
66 67
 			unsigned char *e_out = cli_malloc(2);
67 68
 
68 69
 			if(!e_out)
... ...
@@ -75,6 +83,7 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
75 75
 		else if(val==160)
76 76
 			return (unsigned char*)cli_strdup(" ");
77 77
 		else {
78
+			/* TODO: use optimized version from u16_normalize */
78 79
 			unsigned char *ent_out = cli_malloc(10);
79 80
 
80 81
 			if(!ent_out)
... ...
@@ -88,21 +97,22 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e
88 88
 	else
89 89
 		return NULL;
90 90
 }
91
-
92 91
 /* sane default: must be larger than the longest possible return string,
93 92
  * which is
94 93
  * &#xxx;*/
95 94
 #define MIN_BUFFER_SIZE 32
96 95
 
97
-int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size)
96
+#define LINEMODE_LIMIT 16384
97
+
98
+int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
98 99
 {
99 100
 	if(buffer_size < MIN_BUFFER_SIZE) {
100 101
 		cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
101 102
 		return CL_ENULLARG;
102 103
 	}
103 104
 	if(conv) {
104
-		conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
105
-		conv->autodetected = OTHER;
105
+		conv->encoding = NULL;
106
+		conv->encoding_symbolic = E_UNKNOWN;
106 107
 		conv->bom_cnt = 0;
107 108
 		conv->buffer_cnt = 0;
108 109
 		conv->bytes_read = 0;
... ...
@@ -110,6 +120,9 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
110 110
 		conv->entity_buffcnt = 0;
111 111
 		conv->buffer_size = buffer_size;
112 112
 		conv->priority = NOPRIO;
113
+		/* start in linemode */
114
+		conv->linemode = 1;
115
+		conv->linemode_processed = 0;
113 116
 
114 117
 		conv->tmp_area.offset = 0;
115 118
 		conv->tmp_area.length = 0;
... ...
@@ -119,13 +132,14 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
119 119
 		}
120 120
 
121 121
 		conv->out_area.offset = 0;
122
-		conv->out_area.length = 0;
122
+		conv->out_area.length = buffer_size;
123 123
 		conv->out_area.buffer = cli_malloc(buffer_size);
124 124
 		if(!conv->out_area.buffer) {
125 125
 			free(conv->tmp_area.buffer);
126 126
 			return CL_EMEM;
127 127
 		}
128 128
 
129
+		conv->buffer_size = buffer_size;
129 130
 		conv->norm_area.offset = 0;
130 131
 		conv->norm_area.length = 0;
131 132
 		conv->norm_area.buffer = cli_malloc(buffer_size);
... ...
@@ -138,6 +152,13 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
138 138
 		conv->ht = &entities_htable;
139 139
 		conv->msg_zero_shown = 0;
140 140
 
141
+		conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
142
+		if(!conv->iconv_struct) {
143
+			free(conv->tmp_area.buffer);
144
+			free(conv->out_area.buffer);
145
+			free(conv->norm_area.buffer);
146
+			return CL_EMEM;
147
+		}
141 148
 		return 0;
142 149
 	}
143 150
 	else 
... ...
@@ -148,23 +169,18 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco
148 148
 {
149 149
 	const unsigned char* from = (const unsigned char*) fromcode;
150 150
 	/* special case for these unusual byteorders */
151
-	*encoding=E_OTHER;
152
-	if(from == UCS4_2143)
153
-		*encoding = E_UCS4_2134;
154
-	else if (from == UCS4_3412)
155
-		*encoding = E_UCS4_3412;
156
-	else {
157
-		struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
158
-		if(e && e->key) {
159
-			*encoding = e->data;
160
-		}
151
+	struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
152
+	if(e && e->key) {
153
+		*encoding = e->data;
154
+	} else {
155
+		*encoding = E_OTHER;
161 156
 	}
162 157
 
163 158
 	switch(*encoding) {
164 159
 		case E_UCS4:
165 160
 		case E_UCS4_1234:
166 161
 		case E_UCS4_4321:
167
-		case E_UCS4_2134:
162
+		case E_UCS4_2143:
168 163
 		case E_UCS4_3412:
169 164
 			return 4;
170 165
 		case E_UTF16:
... ...
@@ -177,7 +193,7 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco
177 177
 		default:
178 178
 			return 1;
179 179
 	}
180
-	}
180
+}
181 181
 
182 182
 #ifndef HAVE_ICONV
183 183
 typedef struct {
... ...
@@ -187,10 +203,10 @@ typedef struct {
187 187
 
188 188
 static iconv_t iconv_open(const char *tocode, const char* fromcode)
189 189
 {
190
-	cli_dbgmsg("Internal iconv\n");
191 190
 	iconv_t iconv = cli_malloc(sizeof(*iconv));
192 191
 	if(!iconv)
193 192
 		return NULL;
193
+	cli_dbgmsg(MODULE_NAME "Internal iconv\n");
194 194
 	/* TODO: check that tocode is UTF16BE */
195 195
 	iconv->size = encoding_bytes(fromcode,&iconv->encoding);
196 196
 	return iconv;
... ...
@@ -216,7 +232,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
216 216
 	/* output is always utf16be !*/
217 217
 	switch(iconv_struct->encoding) {
218 218
 		case E_UCS4:
219
-		case E_UCS4_1234:			
219
+		case E_UCS4_1234:
220 220
 			{
221 221
 				for(i=0;i < maxcopy; i += 4) {
222 222
 					if(!input[i+2] && !input[i+3]) {
... ...
@@ -224,7 +240,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
224 224
 						output[i/2+1] = input[i];
225 225
 					}
226 226
 					else {
227
-						cli_dbgmsg("Warning: unicode character out of utf16 range!\n");
227
+						cli_dbgmsg(MODULE_NAME "Warning: unicode character out of utf16 range!\n");
228 228
 						output[i/2] = 0xff;
229 229
 						output[i/2+1] = 0xff;
230 230
 					}
... ...
@@ -316,7 +332,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
316 316
 							output[j++] = ((input[i] & 0x1F) << 6) | (input[i+1] & 0x3F);
317 317
 						}
318 318
 						else {
319
-							cli_dbgmsg("invalid UTF8 character encountered\n");
319
+							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
320 320
 							break;
321 321
 						}
322 322
 						i+=2;
... ...
@@ -328,7 +344,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
328 328
 							output[j++] = (input[i+1] << 6) | (input[i+2] & 0x3F);
329 329
 						}
330 330
 						else {
331
-							cli_dbgmsg("invalid UTF8 character encountered\n");
331
+							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
332 332
 							break;
333 333
 						}
334 334
 						i+=3;
... ...
@@ -336,7 +352,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
336 336
 					else if( (input[i]&0xF8) == 0xF0) {
337 337
 						if((input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80 && (input[i+3]&0xC0) == 0x80) {
338 338
 							/* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/
339
-							cli_dbgmsg("UTF8 character out of UTF16 range encountered");
339
+							cli_dbgmsg(MODULE_NAME "UTF8 character out of UTF16 range encountered");
340 340
 							output[j++] = 0xff;
341 341
 							output[j++] = 0xff;
342 342
 
... ...
@@ -345,13 +361,13 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
345 345
 							out[j++] = (input[i+2] << 6) | (input[i+2] & 0x3F);*/
346 346
 						}
347 347
 						else {
348
-							cli_dbgmsg("invalid UTF8 character encountered\n");
348
+							cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
349 349
 							break;
350 350
 						}
351 351
 						i+=4;
352 352
 					}
353 353
 					else {
354
-						cli_dbgmsg("invalid UTF8 character encountered\n");
354
+						cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
355 355
 						break;
356 356
 					}							
357 357
 				}
... ...
@@ -392,10 +408,12 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
392 392
 static inline void process_bom(struct entity_conv* conv)
393 393
 {
394 394
 	const unsigned char* bom = conv->bom;
395
-	const unsigned char* encoding = OTHER;
395
+	const char* encoding = NULL;
396 396
 	int has_bom = 0;
397
-	uint8_t enc_bytes = 4;/* default is UTF8, which has a maximum of 4 bytes*/
397
+	uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/
398 398
 
399
+	/* undecided 32-bit encodings are treated as ucs4, and
400
+	 * 16 bit as utf16*/
399 401
 	switch(bom[0]) {
400 402
 		case 0x00:
401 403
 			if(bom[1] == 0x00) {
... ...
@@ -408,19 +426,20 @@ static inline void process_bom(struct entity_conv* conv)
408 408
 					has_bom = 1;
409 409
 				}
410 410
 				else if(bom[2] == 0x00 && bom[3] == 0x3C) {
411
-					encoding = UNDECIDED_32_1234;
412
-				} 
411
+					/* undecided, treat as ucs4 */
412
+					encoding = UCS4_1234;
413
+				}
413 414
 				else if(bom[2] == 0x3C && bom[3] == 0x00) {
414
-					encoding = UNDECIDED_32_2143;
415
+					encoding = UCS4_2143;
415 416
 				}
416 417
 			}/* 0x00 0x00 */
417 418
 			else if(bom[1] == 0x3C) {
418 419
 				if(bom[2] == 0x00) {
419 420
 					if(bom[3] == 0x00) {
420
-						encoding = UNDECIDED_32_3412;
421
+						encoding = UCS4_3412;
421 422
 					}
422 423
 					else if(bom[3] == 0x3F) {
423
-						encoding = UNDECIDED_16_BE;
424
+						encoding = UTF16_BE;
424 425
 						enc_bytes = 2;
425 426
 					}
426 427
 				}/*0x00 0x3C 0x00*/
... ...
@@ -439,7 +458,7 @@ static inline void process_bom(struct entity_conv* conv)
439 439
 				}
440 440
 			}/*0xFF 0xFE*/
441 441
 			break;
442
-		case 0xFE: 
442
+		case 0xFE:
443 443
 			if(bom[1] == 0xFF) {
444 444
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
445 445
 						encoding = UCS4_3412;
... ...
@@ -449,98 +468,91 @@ static inline void process_bom(struct entity_conv* conv)
449 449
 						encoding = UTF16_BE;
450 450
 						has_bom = 1;
451 451
 						enc_bytes = 2;
452
-					}					
452
+					}
453 453
 			}/*0xFE 0xFF*/
454 454
 			break;
455
-		case 0xEF: 
455
+		case 0xEF:
456 456
 			if(bom[1] == 0xBB && bom[2] == 0xBF)  {
457 457
 					encoding = UTF8;
458 458
 					has_bom = 1;
459 459
 					/*enc_bytes = 4;- default, maximum 4 bytes*/
460
-			}/*0xEF 0xBB 0xBF*/				
460
+			}/*0xEF 0xBB 0xBF*/
461 461
 			break;
462
-		case 0x3C: 
462
+		case 0x3C:
463 463
 				if(bom[1] == 0x00) {
464 464
 					if(bom[2] == 0x00 && bom[3] == 0x00) {
465
-						encoding = UNDECIDED_32_4321;
465
+						encoding = UCS4_4321;
466 466
 					}
467 467
 					else if(bom[2] == 0x3F && bom[3] == 0x00) {
468
-						encoding = UNDECIDED_16_LE;
468
+						encoding = UTF16_LE;
469 469
 						enc_bytes = 2;
470 470
 					}
471 471
 				}/*0x3C 0x00*/
472 472
 				else if(bom[1] == 0x3F && bom[2] == 0x78 && bom[3]==0x6D) {
473
-					encoding = UNDECIDED_8;
473
+					encoding = NULL;
474 474
 					enc_bytes = 1;
475 475
 				}/*0x3C 3F 78 6D*/
476 476
 				break;
477
-		case 0x4C: 
477
+		case 0x4C:
478 478
 				if(bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) {
479
-					encoding = EBCDIC;
479
+					cli_dbgmsg(MODULE_NAME "EBCDIC encoding is not supported in line mode\n");
480
+					encoding = NULL;
480 481
 					enc_bytes = 1;
481 482
 				}/*4C 6F A7 94*/
482 483
 				break;
483 484
 	}/*switch*/
484
-	conv->autodetected = encoding;
485
+	if(encoding) {
486
+		cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
487
+		process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
488
+	}
485 489
 	conv->enc_bytes = enc_bytes;
486 490
 	conv->has_bom = has_bom;
487 491
 }
488 492
 
493
+/*()-./012345678:ABCDEFGHIJKLMNOPQRSTUVWXY_abcdefghijklmnopqrstuvwxy*/
494
+static const uint8_t encname_chars[256] = {
495
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
496
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
497
+        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
498
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
499
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
500
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
501
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
503
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
504
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
505
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
506
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
507
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
508
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
509
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
510
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
511
+};
512
+
513
+/* checks that encoding is sane, and normalizes to uppercase */
489 514
 static unsigned char* normalize_encoding(const unsigned char* enc)
490 515
 {
491
-	unsigned char* norm; 
492
-	size_t i;
493
-	const size_t len = strlen((const char*)enc);
494
-	norm = cli_malloc( len+1);
516
+	unsigned char* norm;
517
+	size_t i, len;
518
+
519
+	if(!enc)
520
+		return NULL;
521
+	len = strlen((const char*)enc);
522
+	if(len > 32)
523
+		return NULL;
524
+	for(i=0;i<len;i++) {
525
+		if(!encname_chars[enc[i]])
526
+			return NULL;
527
+	}
528
+	norm = cli_malloc( len+1 );
495 529
 	if(!norm)
496 530
 		return NULL;
497
-	if(enc == OTHER)
498
-		enc = (const unsigned char*)"ISO-8859-1";
499 531
 	for(i=0;i < strlen((const char*)enc); i++)
500 532
 		norm[i] = toupper(enc[i]);
501 533
 	norm[len]='\0';
502 534
 	return norm;
503 535
 }
504 536
 
505
-static const unsigned char* encoding_name(unsigned char* encoding)
506
-{
507
-	if(!encoding)
508
-		return (const unsigned char*)"ISO-8859-1";
509
-	else
510
-		return encoding;
511
-}
512
-
513
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
514
-{
515
-	unsigned char *tmp_encoding;
516
-	enum encodings tmp;
517
-	size_t new_size,old_size;
518
-
519
-	cli_dbgmsg("Setting encoding for %p  to %s, priority: %d\n",(void*)conv, encoding, prio);
520
-	if(encoding == OTHER)
521
-		return;
522
-	if(conv->priority == CONTENT_TYPE)
523
-		return;/* Content-type in header is highest priority, no overrides possible*/
524
-	if(conv->priority ==  BOM && prio == NOBOM_AUTODETECT)
525
-		return;
526
-
527
-	tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
528
-	if(prio == META) {
529
-	old_size = encoding_bytes(conv->encoding,&tmp);
530
-	new_size = encoding_bytes(tmp_encoding,&tmp);
531
-	if(old_size != new_size)  {
532
-		/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
533
-		cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n",conv->encoding,(unsigned long)old_size,tmp_encoding,(unsigned long)new_size);
534
-		free(tmp_encoding);
535
-		return;
536
-	}
537
-	}
538
-	free(conv->encoding);
539
-	conv->encoding = tmp_encoding;
540
-	cli_dbgmsg("New encoding for %p:%s\n",(void*)conv,conv->encoding);
541
-	/* reset stream */
542
-}
543
-
544 537
 static int encoding_norm_done(struct entity_conv* conv)
545 538
 {
546 539
 	if(conv->encoding) {
... ...
@@ -567,7 +579,7 @@ int entity_norm_done(struct entity_conv* conv)
567 567
 {
568 568
 	return encoding_norm_done(conv);
569 569
 }
570
-
570
+#if 0
571 571
 static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned char* outbuff)
572 572
 {
573 573
 
... ...
@@ -615,29 +627,25 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned cha
615 615
 		}
616 616
 	}
617 617
 }
618
+#endif
618 619
 
619
-static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft)
620
+static unsigned short bom_length(struct entity_conv* conv)
620 621
 {
621 622
 	if(conv->has_bom) {
622 623
 		switch(conv->enc_bytes) {
623 624
 			case 1:
624
-				if(conv->autodetected == UTF8) {
625
-					*in += 3;
626
-					*inleft -= 3;
625
+				if(conv->encoding_symbolic == E_UTF8) {
626
+					return 3;
627 627
 				}
628 628
 				break;
629 629
 			case 2:
630
-				*in += 2;
631
-				*inleft -= 2;
632
-				break;
630
+				return 2;
633 631
 			case 4:
634
-				*in += 4;
635
-				*inleft -= 4;
636
-				break;
632
+				return 4;
637 633
 		}
638 634
 	}
635
+	return 0;
639 636
 }
640
-
641 637
 /* sarge leaks on iconv_open/iconv_close, so let's not open/close so many times,
642 638
  * just keep on each thread its own pool of iconvs*/
643 639
 
... ...
@@ -653,16 +661,16 @@ static void iconv_cache_init(struct iconv_cache* cache)
653 653
 /*	cache->tab = NULL;
654 654
 	cache->len = 0;
655 655
 	cache->used = 0; - already done by memset*/
656
-	cli_dbgmsg("Initializing iconv pool:%p\n",(void*)cache);
656
+	cli_dbgmsg(MODULE_NAME "Initializing iconv pool:%p\n",(void*)cache);
657 657
 	hashtab_init(&cache->hashtab, 32);
658 658
 }
659 659
 
660 660
 static void iconv_cache_destroy(struct iconv_cache* cache)
661 661
 {
662 662
 	size_t i;
663
-	cli_dbgmsg("Destroying iconv pool:%p\n",(void*)cache);
663
+	cli_dbgmsg(MODULE_NAME "Destroying iconv pool:%p\n",(void*)cache);
664 664
 	for(i=0;i < cache->last;i++) {
665
-		cli_dbgmsg("closing iconv:%p\n",cache->tab[i]);
665
+		cli_dbgmsg(MODULE_NAME "closing iconv:%p\n",cache->tab[i]);
666 666
 		iconv_close(cache->tab[i]);
667 667
 	}
668 668
 	hashtab_clear(&cache->hashtab);
... ...
@@ -702,9 +710,9 @@ static void iconv_pool_tls_key_alloc(void)
702 702
 {
703 703
 	pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy);
704 704
 	if(!cache_atexit_registered) {
705
-		cli_dbgmsg("iconv:registering atexit\n");
705
+		cli_dbgmsg(MODULE_NAME "iconv:registering atexit\n");
706 706
 		if(atexit(iconv_cache_cleanup_main)) {
707
-			cli_dbgmsg("failed to register atexit\n");
707
+			cli_dbgmsg(MODULE_NAME "failed to register atexit\n");
708 708
 		}
709 709
 		cache_atexit_registered = 1;
710 710
 	}
... ...
@@ -721,7 +729,7 @@ static inline struct iconv_cache* cache_get_tls_instance(void)
721 721
 	if(!cache) {
722 722
 		cache = cli_calloc(1,sizeof(*cache));
723 723
 		if(!cache) {
724
-			cli_dbgmsg("!Out of memory allocating TLS iconv instance\n");
724
+			cli_dbgmsg(MODULE_NAME "!Out of memory allocating TLS iconv instance\n");
725 725
 			return NULL;
726 726
 		}
727 727
 		iconv_cache_init(cache);
... ...
@@ -772,7 +780,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
772 772
 	init_iconv_pool_ifneeded();
773 773
 	cache = cache_get_tls_instance();/* gets TLS iconv pool */
774 774
 	if(!cache) {
775
-		cli_dbgmsg("!Unable to get TLS iconv cache!\n");
775
+		cli_dbgmsg(MODULE_NAME "!Unable to get TLS iconv cache!\n");
776 776
 		errno = EINVAL;
777 777
 		return (iconv_t)-1;
778 778
 	}
... ...
@@ -784,7 +792,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
784 784
 	if(e) {
785 785
 		return cache->tab[e->data];
786 786
 	}
787
-	cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode);
787
+	cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode);
788 788
 	iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode);
789 789
 	if(iconv_struct != (iconv_t)-1) {
790 790
 	idx = cache->last++;
... ...
@@ -792,7 +800,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
792 792
 		cache->len += 16;
793 793
 		cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0]));
794 794
 		if(!cache->tab) {
795
-			cli_dbgmsg("!Out of mem in iconv-pool\n");
795
+			cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n");
796 796
 			errno = ENOMEM;
797 797
 			return (iconv_t)-1;
798 798
 		}
... ...
@@ -800,12 +808,200 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode)
800 800
 
801 801
 	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
802 802
 		cache->tab[idx] = iconv_struct;
803
-	cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
803
+	cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
804 804
 	return cache->tab[idx];
805 805
 }
806 806
 	return (iconv_t)-1;
807 807
 }
808
+#if 0
809
+struct buffer {
810
+	unsigned char *buffer;
811
+	size_t length;
812
+	size_t offset;
813
+	size_t filled;
814
+};
815
+
816
+#define BUFFER_FILL(b, fill_func) \
817
+	if((b)->offset >= (b)->filled) {\
818
+		/* buffer empty, attempt to fill it*/\
819
+		if((fill_func) == -1) return -1;/* error encountered */\
820
+		if((b)->filled == 0) return 0;/* EOF */\
821
+		(b)->offset = 0;\
822
+	}
823
+#endif
824
+
825
+void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
826
+{
827
+	unsigned char *tmp_encoding;
828
+	enum encodings tmp;
829
+	size_t new_size,old_size;
830
+
831
+	if(!encoding && prio == SWITCH_TO_BLOCKMODE) {
832
+		if(conv->linemode) {
833
+			cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
834
+			conv->linemode = 0;
835
+		}
836
+		return;
837
+	}
838
+
839
+	cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio);
840
+
841
+	if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) {
842
+		cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n");
843
+		return;
844
+		/* Content-type in header is highest priority, no overrides possible.
845
+		 * Also no overrides after an encoding has been set.*/
846
+	}
847
+
848
+	/* validate encoding name, and normalize to uppercase */
849
+	if(!(tmp_encoding = normalize_encoding(encoding))) {
850
+		cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n");
851
+		return;
852
+	}
853
+
854
+	/* don't allow to change between unicode encodings that have different byte-size */
855
+	if(prio == META) {
856
+		/* need to consider minimum size of an encoding here */
857
+		old_size =  conv->enc_bytes;
858
+		new_size = encoding_bytes(tmp_encoding,&tmp);
859
+		if(old_size != new_size)  {
860
+			/* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */
861
+			cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size);
862
+			free(tmp_encoding);
863
+			return;
864
+		}
865
+	}
866
+
867
+	conv->encoding = tmp_encoding;
868
+	cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding);
869
+	*(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding );
870
+	if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) {
871
+		cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding);
872
+		/* message shown only once/file */
873
+		/* what can we do? short-circuit iconv */
874
+		free(conv->encoding);
875
+		conv->encoding = NULL;
876
+		/* we will process using whatever we currently have for encoding_symbolic.
877
+		 * If encoding was already set to iconv, we shouldn't be here.*/
878
+		assert(conv->encoding_symbolic != E_ICONV);
879
+	} else {
880
+		cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed);
881
+		conv->encoding_symbolic = E_ICONV;
882
+		conv->priority = prio;
883
+		conv->linemode = 0;
884
+	}
885
+}
886
+
887
+static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area)
888
+{
889
+	char   tmp4[4];
890
+	size_t inleft = in_m_area->length - in_m_area->offset;
891
+	size_t rc, alignfix;
892
+	char*  input   = (char*)in_m_area->buffer + in_m_area->offset;
893
+	size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/
894
+	char* out      = (char*)out_m_area->buffer;
895
+
896
+
897
+	/* convert encoding conv->tmp_area. conv->out_area */
898
+	alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, 
899
+			       and we are using ucs4, ditto for utf16, and 1 byte*/
900
+	inleft -= alignfix;
901
+
902
+	if(!inleft && alignfix) {
903
+		/* EOF, and we have less than 4 bytes to convert */
904
+		memset(tmp4, 0, 4);
905
+		memcpy(tmp4, input, alignfix);
906
+		input = tmp4;
907
+		inleft = 4;
908
+	}
909
+
910
+	rc = (size_t)-1;
911
+	while (inleft && (outleft >= 2) && rc == (size_t)-1) { /* iconv doesn't like inleft to be 0 */
912
+		assert(*iconv_struct != (iconv_t)-1);
913
+		rc = iconv(*iconv_struct, (char**) &input,  &inleft, (char**) &out, &outleft);
914
+		if(rc == (size_t)-1 && errno != E2BIG) {
915
+			cli_dbgmsg("iconv error:%s, silently resuming (%lu, %lu, %ld, %ld)\n",
916
+					strerror(errno), inleft, outleft, input - (char*)in_m_area->buffer,
917
+					out - (char*)out_m_area->buffer);
918
+			/* output raw byte, and resume at next byte */
919
+			if(outleft < 2) break;
920
+			outleft -= 2;
921
+			*out++ = 0;
922
+			*out++ = *input++;
923
+			inleft--;
924
+		}
925
+	}
926
+	in_m_area->offset = in_m_area->length - inleft;
927
+	if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) {
928
+		out_m_area->length -= (off_t)outleft;
929
+	} else {
930
+		cli_dbgmsg(MODULE_NAME "outleft overflown, ignoring\n");
931
+		out_m_area->length = 0;
932
+	}
933
+	out_m_area->offset  = 0;
934
+	return 0;
935
+}
936
+#if 0
937
+/* processes @in buffer, and fills @out. Modifies offset of @in on exit. */
938
+static int u16_normalize (struct entity_conv* conv, struct buffer* in_buff, struct buffer* out_buff)
939
+{
940
+	const unsigned char* in = in_buff->buffer;
941
+	unsigned char* out      = out_buff->buffer;
942
+	const unsigned char* out_end  = out + out_buff->length;
943
+
944
+	do {
945
+		size_t i;
946
+		BUFFER_FILL(in_buff, in_iconv_u16(conv) );
947
+
948
+		for(i = in_buff->offset; (i < in_buff->filled) && (out < out_end); i += 2) {
949
+			const uint16_t u16 = ( ((const uint16_t)in[i]) << 8 ) | in[i+1];
950
+			if(u16 >  0 && u16 < 0x80) {
951
+				assert((unsigned char)u16 != 0);
952
+				assert(out < out_end);
953
+				*out++ = (unsigned char)u16;
954
+			}
955
+			else if (u16 == 160)  {/*nbsp*/
956
+				assert(out < out_end);
957
+				*out++ = 0x20;
958
+			}
959
+			else {
960
+				const ssize_t max_num_length = 9;
961
+				ssize_t printed;
962
+				if((out_end - out) <=  max_num_length) {
963
+					/* prevent buffer overflow */
964
+					/* force exit out of while loop */
965
+					out_end = NULL;
966
+					break;
967
+				}
968
+				assert(out + max_num_length < out_end);
808 969
 
970
+				printed = snprintf((char*)out, max_num_length, "&#%d;", u16);
971
+				if(printed > 0) {
972
+					out += printed;
973
+				}
974
+			}
975
+		}
976
+		in_buff->offset = i;
977
+		out_buff->filled = out - out_buff->buffer;
978
+		out_buff->offset = 0;
979
+	} while (out < out_end);/* if out not full, try to fill it */
980
+}
981
+/*
982
+ * We need a line-mode, which allows us to change the encoding, and 
983
+ * a block mode, that doesn't care about lines
984
+ *
985
+ *
986
+ */
987
+/*
988
+ * ASCII -> ascii_normalize
989
+ * ANY -> iconv -> u16_normalize
990
+ * UTF16 -> u16_normalize
991
+ */
992
+
993
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
994
+{
995
+	u16_normalize(conv, 
996
+}
809 997
 
810 998
 /* tmp_m_area and conv->out_area are of size maxlen */
811 999
 unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
... ...
@@ -818,7 +1014,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
818 818
 		const size_t tmp_available = conv->buffer_size - tmp_move;
819 819
 		const size_t max_read = maxlen < tmp_available ? maxlen : tmp_available;
820 820
 		unsigned char* tmpbuff = &conv->tmp_area.buffer[tmp_move];
821
-	
821
+
822 822
 		const size_t out_move = conv->out_area.length < conv->out_area.offset ? 0 : conv->out_area.length - conv->out_area.offset;
823 823
 		size_t outleft = conv->buffer_size - out_move;
824 824
 		unsigned char* out = &conv->out_area.buffer[out_move];
... ...
@@ -849,60 +1045,6 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
849 849
 
850 850
 		tmpbuff = conv->tmp_area.buffer;
851 851
 		inleft = conv->tmp_area.length;
852
-		if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */
853
-			memcpy( conv->bom, tmpbuff, 4);
854
-			process_bom(conv);
855
-			process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT);
856
-			output_first(conv,&out,&tmpbuff,&inleft);
857
-			conv->bom_cnt++;
858
-		}
859
-
860
-		/* convert encoding conv->tmp_area. conv->out_area */
861
-		alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, 
862
-				       and we are using ucs4, ditto for utf16, and 1 byte*/
863
-		inleft -= alignfix;
864
-
865
-		if(!inleft && alignfix) {
866
-			size_t k;
867
-			for(k=0;k+alignfix < 4;k++)
868
-				tmpbuff[alignfix+k] = '\0';
869
-			inleft = 4;
870
-			alignfix = -inleft;
871
-		}
872
-
873
-		iconv_struct = iconv_open_cached(encoding_name(conv->encoding));
874
-
875
-		if(iconv_struct == (iconv_t)-1) {
876
-			cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding));
877
-			/* message shown only once/file */
878
-			/* what can we do? just fall back for it being an ISO-8859-1 */
879
-		        free(conv->encoding);
880
-			conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
881
-			iconv_struct = iconv_open_cached(conv->encoding);
882
-			if(iconv_struct == (iconv_t)-1) {
883
-				cli_dbgmsg("fallback failed... bail out\n");
884
-				return cli_readline(NULL,&conv->tmp_area,maxlen);
885
-			}
886
-		}
887
-
888
-		if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ {
889
-			rc = iconv(iconv_struct, (char**) &tmpbuff,  &inleft, (char**) &out, &outleft);	
890
-		}
891
-		else
892
-			rc = 0;
893
-
894
-#if 0
895
-		 iconv_close(iconv_struct);/* - don't close, we are using a cached instance */
896
-#endif
897
-
898
-		if(rc==(size_t)-1 && errno != E2BIG) {
899
-				cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%lu,%lu)\n",strerror(errno),(long)(out-conv->out_area.buffer),(long)(tmpbuff-conv->tmp_area.buffer),(unsigned long)inleft,(unsigned long)outleft);
900
-				/* output raw byte, and resume at next byte */
901
-				*out++ = 0;
902
-				*out++ = *tmpbuff++;
903
-				inleft--;
904
-/*				return cli_readline(NULL, &conv->norm_area, maxlen);*/
905
-		}
906 852
 
907 853
 		conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0);
908 854
 		conv->out_area.length = out - conv->out_area.buffer - out_move;
... ...
@@ -980,4 +1122,186 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
980 980
 		return cli_readline(NULL, &conv->norm_area, maxlen);
981 981
 	}
982 982
 }
983
+#endif
984
+
/*
 * Appends the normalized form of one 16-bit character to @out.
 *
 *  - 0 is ignored: nothing is written and @out is returned unchanged.
 *  - 1..0xfe are copied through as a single raw byte.
 *  - everything else (0xff and above) is written as a decimal numeric
 *    character reference "&#NNNNN;", which takes at most 8 bytes.
 *
 * @u16:   character to normalize
 * @out:   current write position in the output buffer
 * @limit: number of bytes that may still be written at @out, must be > 0
 *
 * Returns the new write position (one past the last byte written), or NULL
 * when a character reference is needed but @limit is not larger than 9.
 * No terminating NUL is written.
 */
static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, const ssize_t limit)
{
	assert(limit > 0 && "u16_normalize must be called with positive limit");

	if(u16 == 0) {
		/* \0 is just ignored; previously this fell through to the
		 * entity branch and produced "&#0;" */
		return out;
	}

	if(u16 < 0xff) {
		*out++ = (uint8_t)u16;
	}
	else {
		/* digits are generated right-to-left into buf, then copied out */
		char buf[10];
		const ssize_t max_num_length = sizeof(buf)-1;
		size_t i = sizeof(buf)-1;

		if(limit <= max_num_length) {
			/* not enough space available */
			return NULL;
		}
		/* inline version of
		 * out += snprintf(out, max_num_length, "&#%d;", u16) */
		buf[i] = '\0';
		do {
			buf[--i] = (char)('0' + (u16 % 10));
			u16 /= 10;
		} while (u16 && i > 0);
		*out++ = '&';
		*out++ = '#';
		while(buf[i]) {
			*out++ = (unsigned char)buf[i++];
		}
		*out++ = ';';
	}
	assert(out);
	return out;
}
1018
+
/*
 * Normalizes character @c via u16_normalize() and advances the output
 * cursor @out, decreasing the remaining space @limit by the bytes written.
 * If u16_normalize() returns NULL (not enough space), @out becomes NULL and
 * @limit is left untouched, so callers must test @out in their loop condition.
 *
 * When @linemode is non-zero and @c is a newline, the character is consumed
 * (the enclosing loop's index `i` is incremented) and the enclosing loop is
 * left with `break`.
 *
 * NOTE: this macro must be expanded directly inside the body of a loop that
 * has a variable named `i` in scope. Because of the `break` it cannot be
 * wrapped in the usual do { } while(0).
 * @out and @limit must be modifiable lvalues.
 */
#define NORMALIZE_CHAR(c, out, limit, linemode) \
{\
	if ((linemode) && (c) == '\n') {\
		i++;\
		break;\
	} else {\
		unsigned char* out_new = u16_normalize((c), (out), (limit));\
		if(out_new) {\
			(limit) -= out_new - (out);\
		}\
		(out) = out_new;\
	}\
}
1032
+
/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3
 * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */

/* Evaluates to @siz clamped to at most @siz_limit.
 * Arguments may be evaluated more than once: pass side-effect free expressions. */
#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit))

/* Non-zero when both values are non-negative and @offset is strictly less than @length.
 * Arguments may be evaluated more than once: pass side-effect free expressions. */
#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length))
1037
+
1038
+/* EOF marker is m_area->length == 0 */
1039
+
1040
+/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read.
1041
+ * When we can't read anything due to EOF ->length will be set to 0.
1042
+ * bounds checks offset and length*/
1043
+static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream)
1044
+{
1045
+	if(!m_area) {
1046
+		size_t iread;
1047
+
1048
+		m_area = &conv->tmp_area;
1049
+		if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
1050
+			return m_area;
1051
+		}
1052
+		/* offset out of bounds -> all the buffer was processed, fill it again */
1053
+		iread = fread(m_area->buffer, 1, conv->buffer_size, stream);
1054
+		m_area->length = LIMIT_LENGTH(iread, conv->buffer_size);
1055
+		m_area->offset = 0;
1056
+		if(ferror(stream)) {
1057
+			cli_errmsg("Error while reading HTML stream\n");
1058
+		}
1059
+	} else {
1060
+		if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) {
1061
+			cli_dbgmsg(MODULE_NAME "EOF reached\n");
1062
+			m_area->length = m_area->offset = 0; /* EOF marker */
1063
+		}
1064
+	}
1065
+	return m_area;
1066
+}
1067
+
/*
 * Assembles a 16-bit value from two consecutive bytes of @buf, most
 * significant byte first: buf[i] is the high byte, buf[i+1] the low byte.
 * The caller must guarantee that both buf[i] and buf[i+1] are readable.
 */
static inline uint16_t get_u16(const unsigned char* buf, const size_t i)
{
	/* the shift/or is done in int, narrow the result explicitly */
	return (uint16_t)(((uint16_t)buf[i] << 8) | buf[i+1]);
}
1072
+
1073
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area)
1074
+{
1075
+	unsigned char* out = conv->out_area.buffer;
1076
+	if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) {
1077
+		return NULL;
1078
+	}
1079
+	if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) {
1080
+		/* error encountered */
1081
+		return NULL;
1082
+	}
1083
+	else {
1084
+		const off_t input_limit  = in_m_area->length;
1085
+		const unsigned char* input = in_m_area->buffer;
1086
+		off_t input_offset = in_m_area->offset;
1087
+		off_t limit = conv->out_area.length - 1;
1088
+		off_t limit_prev = limit;
1089
+		off_t i = 0;
1090
+
1091
+		/* read_raw() ensures this condition */
1092
+		assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset < input_limit));
1093
+
1094
+		if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */
1095
+			size_t bom_len;
1096
+			memcpy(conv->bom, input, 4);
1097
+			process_bom(conv);
1098
+			bom_len = bom_length(conv);
1099
+			in_m_area->offset = input_offset = input_offset + bom_len;
1100
+			conv->bom_cnt = 1;
1101
+		}
1102
+
1103
+		if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) {
1104
+			cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed);
1105
+			conv->linemode = 0;
1106
+		}
1107
+
1108
+		switch(conv->encoding_symbolic) {
1109
+			case E_ICONV:/* only in block-mode */
1110
+				/* normalize already converted characters from a previous pass
1111
+				 * (output buffer was full, and we couldn't normalize more in previous pass) */
1112
+				for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) {
1113
+					const uint16_t c = get_u16(conv->norm_area.buffer, i);
1114
+					NORMALIZE_CHAR(c, out, limit, 0);
1115
+				}
1116
+				conv->norm_area.offset = i;
1117
+			        if(limit > 0) {
1118
+					conv->norm_area.length = conv->buffer_size;
1119
+					in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area);
1120
+
1121
+					/*in_iconv_u16 always fills entire norm_area buffer starting from 0. */
1122
+					for(i = 0;i < conv->norm_area.length && limit >  0 && out; i += 2) {
1123
+						const uint16_t c = get_u16(conv->norm_area.buffer, i);
1124
+						NORMALIZE_CHAR(c, out, limit, 0);
1125
+					}
1126
+					if(i) {
1127
+						conv->norm_area.offset = i;
1128
+					}
1129
+				}
1130
+				if(limit == limit_prev) {
1131
+					/* output pointer didn't move => EOF */
1132
+					return NULL;
1133
+				}
1134
+				break;
1135
+				/* out_area must have enough space to allow all bytes in norm_area normalized,
1136
+				 * if we norm with &x;, then we need 7* space. */
1137
+			default:
1138
+				cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic);
1139
+				conv->encoding_symbolic = E_OTHER;
1140
+			case E_UNKNOWN:
1141
+			case E_OTHER:
1142
+				if(!input_limit) {
1143
+					/* nothing to do, EOF */
1144
+					return NULL;
1145
+				}
1146
+				for(i = input_offset; i < input_limit && limit > 0 && out; i++) {
1147
+					const uint16_t c = input[i];
1148
+					NORMALIZE_CHAR(c, out, limit, conv->linemode);
1149
+				}
1150
+				in_m_area->offset = i;
1151
+		}
1152
+
1153
+
1154
+		if(conv->linemode) {
1155
+			conv->linemode_processed += i - input_offset;
1156
+		}
1157
+
1158
+		if(limit < 0) limit = 0;
1159
+/*		assert((unsigned)(conv->out_area.length - limit - 1) < conv->buffer_size);
1160
+		assert(conv->out_area.length - limit - 1 >= 0); */
1161
+		conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0';
1162
+		return conv->out_area.buffer;
1163
+	}
1164
+}
983 1165
 
... ...
@@ -25,33 +25,33 @@
25 25
 
26 26
 #include "hashtab.h"
27 27
 
28
-#define UCS4_1234 (const unsigned char*)"UCS-4LE"
29
-#define UCS4_4321 (const unsigned char*)"UCS-4BE"
30
-#define UCS4_2143 (const unsigned char*)"UCS4"
31
-#define UCS4_3412 (const unsigned char*)"UCS-4"
32
-#define UTF16_BE (const unsigned char*)"UTF-16BE"
33
-#define UTF16_LE (const unsigned char*)"UTF-16LE"
34
-#define UTF8     (const unsigned char*)"UTF-8"
28
+#define UCS4_1234 "UCS-4LE"
29
+#define UCS4_4321 "UCS-4BE"
30
+#define UCS4_2143 "UCS4"
31
+#define UCS4_3412 "UCS-4"
32
+#define UTF16_BE "UTF-16BE"
33
+#define UTF16_LE "UTF-16LE"
34
+#define UTF8     "UTF-8"
35 35
 #define UNDECIDED_32_1234 UCS4_1234
36 36
 #define UNDECIDED_32_4321 UCS4_4321
37 37
 #define UNDECIDED_32_2143 UCS4_2143
38 38
 #define UNDECIDED_32_3412 UCS4_3412
39 39
 #define UNDECIDED_16_BE UTF16_BE
40 40
 #define UNDECIDED_16_LE UTF16_LE
41
-#define UNDECIDED_8 (const unsigned char*)"ISO-8859-1"
42
-#define EBCDIC (const unsigned char*)"EBCDIC-US"
43
-#define UNKNOWN (const unsigned char*)"\0"
44
-#define OTHER   (const unsigned char*)"OTHER"
41
+#define UNDECIDED_8 "ISO-8859-1"
42
+#define EBCDIC "EBCDIC-US"
43
+#define UNKNOWN "\0"
44
+#define OTHER   "OTHER"
45 45
 
46
-enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META};
46
+enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE};
47 47
 
48
-enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2134,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8,E_UNKNOWN,E_OTHER};
48
+enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV};
49 49
 #define MAX_ENTITY_SIZE 22
50 50
 
51 51
 struct entity_conv {
52 52
 	unsigned char* encoding;
53
-	const unsigned char* autodetected;
54 53
 	enum encoding_priority priority;
54
+	enum encodings encoding_symbolic;
55 55
 	unsigned short int encoding_specific;/* sub-encoding, used for ISO*/
56 56
 	const struct hashtable* ht;
57 57
 	uint8_t has_bom;
... ...
@@ -60,26 +60,24 @@ struct entity_conv {
60 60
 	uint8_t  bom_cnt;
61 61
 	uint32_t partial;
62 62
 	unsigned char bom[4];
63
-#if 0	
64
-	char* buffer;
65
-	char* buffer2;
66
-#endif	
67 63
 	size_t buffer_size;
68 64
 	size_t buffer_cnt;
69 65
 	uint8_t entity_buffcnt;
66
+	void* iconv_struct;
70 67
 	char entity_buff[MAX_ENTITY_SIZE+2];
71 68
 	m_area_t tmp_area;
72 69
 	m_area_t out_area;
73 70
 	m_area_t norm_area;
74 71
 	int      msg_zero_shown;
72
+	int      linemode;/* TODO:set */
73
+	int      linemode_processed;
75 74
 };
76 75
 
77
-
78
-int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size);
76
+int init_entity_converter(struct entity_conv* conv, size_t buffer_size);
79 77
 void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority);
80 78
 int entity_norm_done(struct entity_conv* conv);
81 79
 
82
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen);
80
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area);
83 81
 unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* entity);
84 82
 int entitynorm_init(void);
85 83
 
... ...
@@ -185,37 +185,35 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
185 185
 		    struct entity_conv conv;
186 186
 		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
187 187
 
188
-		if(init_entity_converter(&conv,UNKNOWN,conv_size) == 0) {
189
-			int end = 0;
190
-			m_area_t area;
191
-			area.buffer = (unsigned char *) smallbuff;
192
-			area.length = bread;
193
-			area.offset = 0;
194
-
195
-		    while(!end) {
196
-			if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
197
-			    return ret;
198
-
199
-			decoded =  encoding_norm_readline(&conv, NULL, &area, bread);
200
-
201
-			if(decoded) {
202
-			    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
203
-			    free(decoded);
204
-			    if(sret == CL_TYPE_HTML) {
205
-				ret = CL_TYPE_HTML;
206
-				end = 1;
188
+		    /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/
189
+		    if(init_entity_converter(&conv, conv_size) == 0) {
190
+			    m_area_t area;
191
+			    area.buffer = (unsigned char *) smallbuff;
192
+			    area.length = bread;
193
+			    area.offset = 0;
194
+
195
+			    /* switch to blockmode, so that we convert all the input buffer at once,
196
+			     * rather than line-by-line */
197
+			    process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
198
+
199
+			    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
200
+				    return ret;
201
+
202
+			    decoded =  encoding_norm_readline(&conv, NULL, &area);
203
+
204
+			    if(decoded) {
205
+				    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
206
+				    if(sret == CL_TYPE_HTML) {
207
+					    ret = CL_TYPE_HTML;
208
+				    }
207 209
 			    }
208
-			} else
209
-			    end = 1;
210 210
 
211
-			cli_ac_freedata(&mdata);
212
-		    }
213
-
214
-		    entity_norm_done(&conv);
211
+			    cli_ac_freedata(&mdata);
215 212
 
216
-		} else {
217
-		    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
218
-		}
213
+			    entity_norm_done(&conv);
214
+		    } else {
215
+			    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
216
+		    }
219 217
 	    }
220 218
 	}
221 219
     }
... ...
@@ -491,7 +491,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
491 491
 		}
492 492
 	}
493 493
 
494
-	if(dconf_entconv && (rc = init_entity_converter(&conv, UNKNOWN, 16384) )) {
494
+	if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) {
495 495
 		if (!m_area) {
496 496
 			fclose(stream_in);
497 497
 		}
... ...
@@ -502,7 +502,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
502 502
 	tag_args.tag = NULL;
503 503
 	tag_args.value = NULL;
504 504
 	tag_args.contents = NULL;
505
-	
505
+
506 506
 	if (dirname) {
507 507
 		snprintf(filename, 1024, "%s/rfc2397", dirname);
508 508
 		if (mkdir(filename, 0700) && errno != EEXIST) {
... ...
@@ -514,14 +514,14 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
514 514
 			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
515 515
 			goto abort;
516 516
 		}
517
-		
517
+
518 518
 		file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
519 519
 		if (!file_buff_o2) {
520 520
 			free(file_buff_o1);
521 521
 			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
522 522
 			goto abort;
523 523
 		}
524
-		
524
+
525 525
 		file_buff_script = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
526 526
 		if (!file_buff_script) {
527 527
 			free(file_buff_o1);
... ...
@@ -529,7 +529,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
529 529
 			file_buff_o1 = file_buff_o2 = file_buff_script = NULL;
530 530
 			goto abort;
531 531
 		}
532
-		
532
+
533 533
 		snprintf(filename, 1024, "%s/comment.html", dirname);
534 534
 		file_buff_o1->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
535 535
 		if (!file_buff_o1->fd) {
... ...
@@ -574,12 +574,12 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
574 574
 		file_buff_o2 = NULL;
575 575
 		file_buff_script = NULL;
576 576
 	}
577
-	
577
+
578 578
 	binary = FALSE;
579 579
 
580 580
 	if(dconf_entconv)
581
-		ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
582
-	else   
581
+		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
582
+	else
583 583
 		ptr = line = cli_readline(stream_in, m_area, 8192);
584 584
 
585 585
 	while (line) {
... ...
@@ -766,7 +766,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
766 766
 					if (file_buff_o2 && (file_buff_o2->length > 0)) {
767 767
 						file_buff_o2->length--;
768 768
 					}
769
-					
769
+
770 770
 					if (quoted != NOT_QUOTED) {
771 771
 						html_output_c(file_buff_o1, file_buff_o2, '"');
772 772
 					}
... ...
@@ -783,7 +783,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
783 783
 					if (file_buff_o2 && (file_buff_o2->length > 0)) {
784 784
 						file_buff_o2->length--;
785 785
 					}
786
-					
786
+
787 787
 					if (quoted != NOT_QUOTED) {
788 788
 						html_output_c(file_buff_o1, file_buff_o2, '"');
789 789
 					}
... ...
@@ -832,7 +832,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
832 832
 						}
833 833
 						ptr++;
834 834
 					} else {
835
-						if (!escape && (quoted==DOUBLE_QUOTED)) {					
835
+						if (!escape && (quoted==DOUBLE_QUOTED)) {
836 836
 							html_output_c(file_buff_o1, file_buff_o2, '"');
837 837
 							if (tag_val_length < HTML_STR_LENGTH) {
838 838
 								tag_val[tag_val_length++] = '"';
... ...
@@ -880,7 +880,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
880 880
 					}
881 881
 					ptr++;
882 882
 				}
883
-				
883
+
884 884
 				if (*ptr == '\\') {
885 885
 					escape = TRUE;
886 886
 				} else {
... ...
@@ -899,7 +899,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
899 899
 				ptr++;
900 900
 				break;
901 901
 			case HTML_PROCESS_TAG:
902
-				
902
+
903 903
 				/* Default to no action for this tag */
904 904
 				state = HTML_SKIP_WS;
905 905
 				next_state = HTML_NORM;
... ...
@@ -938,6 +938,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
938 938
 						in_script = TRUE;
939 939
 					}
940 940
 					html_output_tag(file_buff_script, tag, &tag_args);
941
+				} else if (dconf_entconv && strcmp(tag, "body") == 0) {
942
+					/* no more charset changes accepted after body encountered */
943
+					process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE);
941 944
 				} else if (dconf_entconv && strcmp(tag, "meta") == 0) {
942 945
 					const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv");
943 946
 					const unsigned char* http_content = html_tag_arg_value(&tag_args, "content");
... ...
@@ -953,7 +956,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
953 953
 							http_content2[i] = tolower(http_content[i]);
954 954
 						http_content2[len] = '\0';
955 955
 						charset = (unsigned char*) strstr((char*)http_content2,"charset");
956
-						if(charset) {							
956
+						if(charset) {
957 957
 							while(*charset && *charset != '=')
958 958
 								charset++;
959 959
 							if(*charset)
... ...
@@ -1011,8 +1014,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1011 1011
 					} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
1012 1012
 						const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action");
1013 1013
 						if (arg_action_value) {
1014
-							if(in_form_action) 
1015
-								free(in_form_action);							
1014
+							if(in_form_action)
1015
+								free(in_form_action);
1016 1016
 							in_form_action = cli_strdup(arg_action_value);
1017 1017
 						}
1018 1018
 					} else if (strcmp(tag, "img") == 0) {
... ...
@@ -1077,7 +1080,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1077 1077
 									in_form_action + strlen(in_form_action));
1078 1078
 								html_tag_contents_done(hrefs,hrefs->count);
1079 1079
 							}
1080
-						}						
1080
+						}
1081 1081
 					}
1082 1082
 					/* TODO:imagemaps can have urls too */
1083 1083
 				}
... ...
@@ -1123,7 +1126,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1123 1123
 						html_output_c(file_buff_o1, file_buff_o2, '&');
1124 1124
 						if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1125 1125
 								tag_val[tag_val_length++] = '&';
1126
-						}						
1126
+						}
1127 1127
 						for(i=0; i < entity_val_length; i++) {
1128 1128
 							const char c = tolower(entity_val[i]);
1129 1129
 							html_output_c(file_buff_o1, file_buff_o2, c);
... ...
@@ -1266,7 +1269,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1266 1266
 						case 0x24:
1267 1267
 							html_output_c(file_buff_o1, file_buff_o2, 0x40);
1268 1268
 							html_output_c(file_buff_script, NULL, 0x40);
1269
-							break;				
1269
+							break;
1270 1270
 						case 0x26:
1271 1271
 							html_output_c(file_buff_o1, file_buff_o2, 0x0a);
1272 1272
 							html_output_c(file_buff_script, NULL, 0x0a);
... ...
@@ -1285,7 +1288,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1285 1285
 				ptr++;
1286 1286
 				length--;
1287 1287
 				break;
1288
-				
1288
+
1289 1289
 			case HTML_RFC2397_TYPE:
1290 1290
 				if (*ptr == '\'') {
1291 1291
 					if (!escape && (quoted==SINGLE_QUOTED)) {
... ...
@@ -1340,7 +1343,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1340 1340
 					escape = FALSE;
1341 1341
 					next_state = HTML_BAD_STATE;
1342 1342
 					ptr++;
1343
-				
1343
+
1344 1344
 				} else {
1345 1345
 					if (tag_val_length < HTML_STR_LENGTH) {
1346 1346
 						tag_val[tag_val_length++] = tolower(*ptr);
... ...
@@ -1370,7 +1373,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1370 1370
 						goto abort;
1371 1371
 					}
1372 1372
 					file_tmp_o1->length = 0;
1373
-				
1373
+
1374 1374
 					html_output_str(file_tmp_o1, "From html-normalise\n", 20);
1375 1375
 					html_output_str(file_tmp_o1, "Content-type: ", 14);
1376 1376
 					if ((tag_val_length == 0) && (*tag_val == ';')) {
... ...
@@ -1455,7 +1458,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1455 1455
 					html_output_c(file_tmp_o1, NULL, '%');
1456 1456
 				}
1457 1457
 				state = HTML_RFC2397_DATA;
1458
-				break;		
1458
+				break;
1459 1459
 			case HTML_ESCAPE_CHAR:
1460 1460
 				value *= 16;
1461 1461
 				length++;
... ...
@@ -1472,22 +1475,23 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1472 1472
 					state = next_state;
1473 1473
 				}
1474 1474
 				ptr++;
1475
-				break;	
1475
+				break;
1476 1476
 			}
1477 1477
 		}
1478 1478
 		if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
1479 1479
 			/* end of line, append contents now, resume on next line */
1480 1480
 			html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr);
1481 1481
 		ptrend = NULL;
1482
-		free(line);
1483
- 		if(dconf_entconv)
1484
- 			ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192);
1485
- 		else
1486
- 			ptr = line = cli_readline(stream_in, m_area, 8192);
1482
+		if(dconf_entconv)
1483
+			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
1484
+		else {
1485
+			free(line);
1486
+			ptr = line = cli_readline(stream_in, m_area, 8192);
1487
+		}
1487 1488
 	}
1488
-	
1489
- 	if(dconf_entconv) {
1490
- 		/* handle "unfinished" entitites */
1489
+
1490
+	if(dconf_entconv) {
1491
+		/* handle "unfinished" entitites */
1491 1492
 		size_t i;
1492 1493
 		unsigned char* normalized;
1493 1494
 		entity_val[entity_val_length] = '\0';