git-svn: trunk@3511
Török Edvin authored on 2008/01/21 07:18:14... | ... |
@@ -1,3 +1,13 @@ |
1 |
+Sun Jan 20 23:49:41 EET 2008 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * configure: AC_TRY_LINK already adds a main(), remove duplicate main() |
|
4 |
+ * libclamav: entconv improvements to improve security and performance |
|
5 |
+ Part I for (bb #686, #386) |
|
6 |
+ TODO: * optimize entity_norm |
|
7 |
+ * create testfiles for unicode encoding variants |
|
8 |
+ * create a regression test |
|
9 |
+ * check for memory leaks |
|
10 |
+ |
|
1 | 11 |
Sat Jan 19 14:41:50 CET 2008 (acab) |
2 | 12 |
----------------------------------- |
3 | 13 |
* test: using split instead of byteswapped files |
... | ... |
@@ -11397,15 +11397,12 @@ int |
11397 | 11397 |
main () |
11398 | 11398 |
{ |
11399 | 11399 |
|
11400 |
-int main(int argc, char** argv) { |
|
11401 | 11400 |
char** xin,**xout; |
11402 | 11401 |
unsigned il,ol; |
11403 | 11402 |
int rc; |
11404 | 11403 |
iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8"); |
11405 | 11404 |
rc = iconv(iconv_struct,xin,&il,xout,&ol); |
11406 | 11405 |
iconv_close(iconv_struct); |
11407 |
- return 0; |
|
11408 |
-} |
|
11409 | 11406 |
|
11410 | 11407 |
; |
11411 | 11408 |
return 0; |
... | ... |
@@ -230,15 +230,12 @@ if test "X$wiconv" != "Xno"; then |
230 | 230 |
AC_TRY_LINK([ |
231 | 231 |
#include <iconv.h> |
232 | 232 |
],[ |
233 |
-int main(int argc, char** argv) { |
|
234 | 233 |
char** xin,**xout; |
235 | 234 |
unsigned il,ol; |
236 | 235 |
int rc; |
237 | 236 |
iconv_t iconv_struct = iconv_open("UTF-16BE","UTF-8"); |
238 | 237 |
rc = iconv(iconv_struct,xin,&il,xout,&ol); |
239 | 238 |
iconv_close(iconv_struct); |
240 |
- return 0; |
|
241 |
-} |
|
242 | 239 |
],[ |
243 | 240 |
AC_MSG_RESULT(yes) |
244 | 241 |
AC_DEFINE(HAVE_ICONV, 1, [iconv() available]) |
... | ... |
@@ -32,6 +32,12 @@ |
32 | 32 |
#include <pthread.h> |
33 | 33 |
#endif |
34 | 34 |
|
35 |
+#ifndef CL_DEBUG |
|
36 |
+#define NDEBUG |
|
37 |
+#endif |
|
38 |
+ |
|
39 |
+#include <assert.h> |
|
40 |
+ |
|
35 | 41 |
#include "clamav.h" |
36 | 42 |
#include "others.h" |
37 | 43 |
#include "htmlnorm.h" |
... | ... |
@@ -46,6 +52,7 @@ |
46 | 46 |
|
47 | 47 |
#include "encoding_aliases.h" |
48 | 48 |
|
49 |
+#define MODULE_NAME "entconv: " |
|
49 | 50 |
|
50 | 51 |
#define MAX_LINE 1024 |
51 | 52 |
|
... | ... |
@@ -58,11 +65,12 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e |
58 | 58 |
struct element* e = hashtab_find(conv->ht,entity,strlen((const char*)entity)); |
59 | 59 |
if(e && e->key) { |
60 | 60 |
const int val = e->data; |
61 |
+ /* TODO: don't allocate memory here, but use a buffer in struct entity_conv */ |
|
61 | 62 |
if(val == '<')/* this was an escaped <, so output it escaped*/ |
62 | 63 |
return (unsigned char*)cli_strdup("<"); |
63 | 64 |
else if(val == '>')/* see above */ |
64 | 65 |
return (unsigned char*)cli_strdup(">"); |
65 |
- else if(val<127) { |
|
66 |
+ else if(val >= 0 && val <= 0xff) { |
|
66 | 67 |
unsigned char *e_out = cli_malloc(2); |
67 | 68 |
|
68 | 69 |
if(!e_out) |
... | ... |
@@ -75,6 +83,7 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e |
75 | 75 |
else if(val==160) |
76 | 76 |
return (unsigned char*)cli_strdup(" "); |
77 | 77 |
else { |
78 |
+ /* TODO: use optimized version from u16_normalize */ |
|
78 | 79 |
unsigned char *ent_out = cli_malloc(10); |
79 | 80 |
|
80 | 81 |
if(!ent_out) |
... | ... |
@@ -88,21 +97,22 @@ unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* e |
88 | 88 |
else |
89 | 89 |
return NULL; |
90 | 90 |
} |
91 |
- |
|
92 | 91 |
/* sane default, must be larger than the longest possible return string, |
93 | 92 |
* which is |
94 | 93 |
* &#xxx;*/ |
95 | 94 |
#define MIN_BUFFER_SIZE 32 |
96 | 95 |
|
97 |
-int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size) |
|
96 |
+#define LINEMODE_LIMIT 16384 |
|
97 |
+ |
|
98 |
+int init_entity_converter(struct entity_conv* conv, size_t buffer_size) |
|
98 | 99 |
{ |
99 | 100 |
if(buffer_size < MIN_BUFFER_SIZE) { |
100 | 101 |
cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE); |
101 | 102 |
return CL_ENULLARG; |
102 | 103 |
} |
103 | 104 |
if(conv) { |
104 |
- conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1"); |
|
105 |
- conv->autodetected = OTHER; |
|
105 |
+ conv->encoding = NULL; |
|
106 |
+ conv->encoding_symbolic = E_UNKNOWN; |
|
106 | 107 |
conv->bom_cnt = 0; |
107 | 108 |
conv->buffer_cnt = 0; |
108 | 109 |
conv->bytes_read = 0; |
... | ... |
@@ -110,6 +120,9 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding |
110 | 110 |
conv->entity_buffcnt = 0; |
111 | 111 |
conv->buffer_size = buffer_size; |
112 | 112 |
conv->priority = NOPRIO; |
113 |
+ /* start in linemode */ |
|
114 |
+ conv->linemode = 1; |
|
115 |
+ conv->linemode_processed = 0; |
|
113 | 116 |
|
114 | 117 |
conv->tmp_area.offset = 0; |
115 | 118 |
conv->tmp_area.length = 0; |
... | ... |
@@ -119,13 +132,14 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding |
119 | 119 |
} |
120 | 120 |
|
121 | 121 |
conv->out_area.offset = 0; |
122 |
- conv->out_area.length = 0; |
|
122 |
+ conv->out_area.length = buffer_size; |
|
123 | 123 |
conv->out_area.buffer = cli_malloc(buffer_size); |
124 | 124 |
if(!conv->out_area.buffer) { |
125 | 125 |
free(conv->tmp_area.buffer); |
126 | 126 |
return CL_EMEM; |
127 | 127 |
} |
128 | 128 |
|
129 |
+ conv->buffer_size = buffer_size; |
|
129 | 130 |
conv->norm_area.offset = 0; |
130 | 131 |
conv->norm_area.length = 0; |
131 | 132 |
conv->norm_area.buffer = cli_malloc(buffer_size); |
... | ... |
@@ -138,6 +152,13 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding |
138 | 138 |
conv->ht = &entities_htable; |
139 | 139 |
conv->msg_zero_shown = 0; |
140 | 140 |
|
141 |
+ conv->iconv_struct = cli_calloc(1, sizeof(iconv_t)); |
|
142 |
+ if(!conv->iconv_struct) { |
|
143 |
+ free(conv->tmp_area.buffer); |
|
144 |
+ free(conv->out_area.buffer); |
|
145 |
+ free(conv->norm_area.buffer); |
|
146 |
+ return CL_EMEM; |
|
147 |
+ } |
|
141 | 148 |
return 0; |
142 | 149 |
} |
143 | 150 |
else |
... | ... |
@@ -148,23 +169,18 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco |
148 | 148 |
{ |
149 | 149 |
const unsigned char* from = (const unsigned char*) fromcode; |
150 | 150 |
/* special case for these unusual byteorders */ |
151 |
- *encoding=E_OTHER; |
|
152 |
- if(from == UCS4_2143) |
|
153 |
- *encoding = E_UCS4_2134; |
|
154 |
- else if (from == UCS4_3412) |
|
155 |
- *encoding = E_UCS4_3412; |
|
156 |
- else { |
|
157 |
- struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode)); |
|
158 |
- if(e && e->key) { |
|
159 |
- *encoding = e->data; |
|
160 |
- } |
|
151 |
+ struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode)); |
|
152 |
+ if(e && e->key) { |
|
153 |
+ *encoding = e->data; |
|
154 |
+ } else { |
|
155 |
+ *encoding = E_OTHER; |
|
161 | 156 |
} |
162 | 157 |
|
163 | 158 |
switch(*encoding) { |
164 | 159 |
case E_UCS4: |
165 | 160 |
case E_UCS4_1234: |
166 | 161 |
case E_UCS4_4321: |
167 |
- case E_UCS4_2134: |
|
162 |
+ case E_UCS4_2143: |
|
168 | 163 |
case E_UCS4_3412: |
169 | 164 |
return 4; |
170 | 165 |
case E_UTF16: |
... | ... |
@@ -177,7 +193,7 @@ static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* enco |
177 | 177 |
default: |
178 | 178 |
return 1; |
179 | 179 |
} |
180 |
- } |
|
180 |
+} |
|
181 | 181 |
|
182 | 182 |
#ifndef HAVE_ICONV |
183 | 183 |
typedef struct { |
... | ... |
@@ -187,10 +203,10 @@ typedef struct { |
187 | 187 |
|
188 | 188 |
static iconv_t iconv_open(const char *tocode, const char* fromcode) |
189 | 189 |
{ |
190 |
- cli_dbgmsg("Internal iconv\n"); |
|
191 | 190 |
iconv_t iconv = cli_malloc(sizeof(*iconv)); |
192 | 191 |
if(!iconv) |
193 | 192 |
return NULL; |
193 |
+ cli_dbgmsg(MODULE_NAME "Internal iconv\n"); |
|
194 | 194 |
/* TODO: check that tocode is UTF16BE */ |
195 | 195 |
iconv->size = encoding_bytes(fromcode,&iconv->encoding); |
196 | 196 |
return iconv; |
... | ... |
@@ -216,7 +232,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
216 | 216 |
/* output is always utf16be !*/ |
217 | 217 |
switch(iconv_struct->encoding) { |
218 | 218 |
case E_UCS4: |
219 |
- case E_UCS4_1234: |
|
219 |
+ case E_UCS4_1234: |
|
220 | 220 |
{ |
221 | 221 |
for(i=0;i < maxcopy; i += 4) { |
222 | 222 |
if(!input[i+2] && !input[i+3]) { |
... | ... |
@@ -224,7 +240,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
224 | 224 |
output[i/2+1] = input[i]; |
225 | 225 |
} |
226 | 226 |
else { |
227 |
- cli_dbgmsg("Warning: unicode character out of utf16 range!\n"); |
|
227 |
+ cli_dbgmsg(MODULE_NAME "Warning: unicode character out of utf16 range!\n"); |
|
228 | 228 |
output[i/2] = 0xff; |
229 | 229 |
output[i/2+1] = 0xff; |
230 | 230 |
} |
... | ... |
@@ -316,7 +332,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
316 | 316 |
output[j++] = ((input[i] & 0x1F) << 6) | (input[i+1] & 0x3F); |
317 | 317 |
} |
318 | 318 |
else { |
319 |
- cli_dbgmsg("invalid UTF8 character encountered\n"); |
|
319 |
+ cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n"); |
|
320 | 320 |
break; |
321 | 321 |
} |
322 | 322 |
i+=2; |
... | ... |
@@ -328,7 +344,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
328 | 328 |
output[j++] = (input[i+1] << 6) | (input[i+2] & 0x3F); |
329 | 329 |
} |
330 | 330 |
else { |
331 |
- cli_dbgmsg("invalid UTF8 character encountered\n"); |
|
331 |
+ cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n"); |
|
332 | 332 |
break; |
333 | 333 |
} |
334 | 334 |
i+=3; |
... | ... |
@@ -336,7 +352,7 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
336 | 336 |
else if( (input[i]&0xF8) == 0xF0) { |
337 | 337 |
if((input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80 && (input[i+3]&0xC0) == 0x80) { |
338 | 338 |
/* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/ |
339 |
- cli_dbgmsg("UTF8 character out of UTF16 range encountered"); |
|
339 |
+ cli_dbgmsg(MODULE_NAME "UTF8 character out of UTF16 range encountered"); |
|
340 | 340 |
output[j++] = 0xff; |
341 | 341 |
output[j++] = 0xff; |
342 | 342 |
|
... | ... |
@@ -345,13 +361,13 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
345 | 345 |
out[j++] = (input[i+2] << 6) | (input[i+2] & 0x3F);*/ |
346 | 346 |
} |
347 | 347 |
else { |
348 |
- cli_dbgmsg("invalid UTF8 character encountered\n"); |
|
348 |
+ cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n"); |
|
349 | 349 |
break; |
350 | 350 |
} |
351 | 351 |
i+=4; |
352 | 352 |
} |
353 | 353 |
else { |
354 |
- cli_dbgmsg("invalid UTF8 character encountered\n"); |
|
354 |
+ cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n"); |
|
355 | 355 |
break; |
356 | 356 |
} |
357 | 357 |
} |
... | ... |
@@ -392,10 +408,12 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
392 | 392 |
static inline void process_bom(struct entity_conv* conv) |
393 | 393 |
{ |
394 | 394 |
const unsigned char* bom = conv->bom; |
395 |
- const unsigned char* encoding = OTHER; |
|
395 |
+ const char* encoding = NULL; |
|
396 | 396 |
int has_bom = 0; |
397 |
- uint8_t enc_bytes = 4;/* default is UTF8, which has a maximum of 4 bytes*/ |
|
397 |
+ uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/ |
|
398 | 398 |
|
399 |
+ /* undecided 32-bit encodings are treated as ucs4, and |
|
400 |
+ * 16 bit as utf16*/ |
|
399 | 401 |
switch(bom[0]) { |
400 | 402 |
case 0x00: |
401 | 403 |
if(bom[1] == 0x00) { |
... | ... |
@@ -408,19 +426,20 @@ static inline void process_bom(struct entity_conv* conv) |
408 | 408 |
has_bom = 1; |
409 | 409 |
} |
410 | 410 |
else if(bom[2] == 0x00 && bom[3] == 0x3C) { |
411 |
- encoding = UNDECIDED_32_1234; |
|
412 |
- } |
|
411 |
+ /* undecided, treat as ucs4 */ |
|
412 |
+ encoding = UCS4_1234; |
|
413 |
+ } |
|
413 | 414 |
else if(bom[2] == 0x3C && bom[3] == 0x00) { |
414 |
- encoding = UNDECIDED_32_2143; |
|
415 |
+ encoding = UCS4_2143; |
|
415 | 416 |
} |
416 | 417 |
}/* 0x00 0x00 */ |
417 | 418 |
else if(bom[1] == 0x3C) { |
418 | 419 |
if(bom[2] == 0x00) { |
419 | 420 |
if(bom[3] == 0x00) { |
420 |
- encoding = UNDECIDED_32_3412; |
|
421 |
+ encoding = UCS4_3412; |
|
421 | 422 |
} |
422 | 423 |
else if(bom[3] == 0x3F) { |
423 |
- encoding = UNDECIDED_16_BE; |
|
424 |
+ encoding = UTF16_BE; |
|
424 | 425 |
enc_bytes = 2; |
425 | 426 |
} |
426 | 427 |
}/*0x00 0x3C 0x00*/ |
... | ... |
@@ -439,7 +458,7 @@ static inline void process_bom(struct entity_conv* conv) |
439 | 439 |
} |
440 | 440 |
}/*0xFF 0xFE*/ |
441 | 441 |
break; |
442 |
- case 0xFE: |
|
442 |
+ case 0xFE: |
|
443 | 443 |
if(bom[1] == 0xFF) { |
444 | 444 |
if(bom[2] == 0x00 && bom[3] == 0x00) { |
445 | 445 |
encoding = UCS4_3412; |
... | ... |
@@ -449,98 +468,91 @@ static inline void process_bom(struct entity_conv* conv) |
449 | 449 |
encoding = UTF16_BE; |
450 | 450 |
has_bom = 1; |
451 | 451 |
enc_bytes = 2; |
452 |
- } |
|
452 |
+ } |
|
453 | 453 |
}/*0xFE 0xFF*/ |
454 | 454 |
break; |
455 |
- case 0xEF: |
|
455 |
+ case 0xEF: |
|
456 | 456 |
if(bom[1] == 0xBB && bom[2] == 0xBF) { |
457 | 457 |
encoding = UTF8; |
458 | 458 |
has_bom = 1; |
459 | 459 |
/*enc_bytes = 4;- default, maximum 4 bytes*/ |
460 |
- }/*0xEF 0xBB 0xBF*/ |
|
460 |
+ }/*0xEF 0xBB 0xBF*/ |
|
461 | 461 |
break; |
462 |
- case 0x3C: |
|
462 |
+ case 0x3C: |
|
463 | 463 |
if(bom[1] == 0x00) { |
464 | 464 |
if(bom[2] == 0x00 && bom[3] == 0x00) { |
465 |
- encoding = UNDECIDED_32_4321; |
|
465 |
+ encoding = UCS4_4321; |
|
466 | 466 |
} |
467 | 467 |
else if(bom[2] == 0x3F && bom[3] == 0x00) { |
468 |
- encoding = UNDECIDED_16_LE; |
|
468 |
+ encoding = UTF16_LE; |
|
469 | 469 |
enc_bytes = 2; |
470 | 470 |
} |
471 | 471 |
}/*0x3C 0x00*/ |
472 | 472 |
else if(bom[1] == 0x3F && bom[2] == 0x78 && bom[3]==0x6D) { |
473 |
- encoding = UNDECIDED_8; |
|
473 |
+ encoding = NULL; |
|
474 | 474 |
enc_bytes = 1; |
475 | 475 |
}/*0x3C 3F 78 6D*/ |
476 | 476 |
break; |
477 |
- case 0x4C: |
|
477 |
+ case 0x4C: |
|
478 | 478 |
if(bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) { |
479 |
- encoding = EBCDIC; |
|
479 |
+ cli_dbgmsg(MODULE_NAME "EBCDIC encoding is not supported in line mode\n"); |
|
480 |
+ encoding = NULL; |
|
480 | 481 |
enc_bytes = 1; |
481 | 482 |
}/*4C 6F A7 94*/ |
482 | 483 |
break; |
483 | 484 |
}/*switch*/ |
484 |
- conv->autodetected = encoding; |
|
485 |
+ if(encoding) { |
|
486 |
+ cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding); |
|
487 |
+ process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT); |
|
488 |
+ } |
|
485 | 489 |
conv->enc_bytes = enc_bytes; |
486 | 490 |
conv->has_bom = has_bom; |
487 | 491 |
} |
488 | 492 |
|
493 |
+/*()-./012345678:ABCDEFGHIJKLMNOPQRSTUVWXY_abcdefghijklmnopqrstuvwxy*/ |
|
494 |
+static const uint8_t encname_chars[256] = { |
|
495 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
496 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
497 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, |
|
498 |
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, |
|
499 |
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
500 |
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, |
|
501 |
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
502 |
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, |
|
503 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
504 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
505 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
506 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
507 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
508 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
509 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
510 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
|
511 |
+}; |
|
512 |
+ |
|
513 |
+/* checks that encoding is sane, and normalizes to uppercase */ |
|
489 | 514 |
static unsigned char* normalize_encoding(const unsigned char* enc) |
490 | 515 |
{ |
491 |
- unsigned char* norm; |
|
492 |
- size_t i; |
|
493 |
- const size_t len = strlen((const char*)enc); |
|
494 |
- norm = cli_malloc( len+1); |
|
516 |
+ unsigned char* norm; |
|
517 |
+ size_t i, len; |
|
518 |
+ |
|
519 |
+ if(!enc) |
|
520 |
+ return NULL; |
|
521 |
+ len = strlen((const char*)enc); |
|
522 |
+ if(len > 32) |
|
523 |
+ return NULL; |
|
524 |
+ for(i=0;i<len;i++) { |
|
525 |
+ if(!encname_chars[enc[i]]) |
|
526 |
+ return NULL; |
|
527 |
+ } |
|
528 |
+ norm = cli_malloc( len+1 ); |
|
495 | 529 |
if(!norm) |
496 | 530 |
return NULL; |
497 |
- if(enc == OTHER) |
|
498 |
- enc = (const unsigned char*)"ISO-8859-1"; |
|
499 | 531 |
for(i=0;i < strlen((const char*)enc); i++) |
500 | 532 |
norm[i] = toupper(enc[i]); |
501 | 533 |
norm[len]='\0'; |
502 | 534 |
return norm; |
503 | 535 |
} |
504 | 536 |
|
505 |
-static const unsigned char* encoding_name(unsigned char* encoding) |
|
506 |
-{ |
|
507 |
- if(!encoding) |
|
508 |
- return (const unsigned char*)"ISO-8859-1"; |
|
509 |
- else |
|
510 |
- return encoding; |
|
511 |
-} |
|
512 |
- |
|
513 |
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) |
|
514 |
-{ |
|
515 |
- unsigned char *tmp_encoding; |
|
516 |
- enum encodings tmp; |
|
517 |
- size_t new_size,old_size; |
|
518 |
- |
|
519 |
- cli_dbgmsg("Setting encoding for %p to %s, priority: %d\n",(void*)conv, encoding, prio); |
|
520 |
- if(encoding == OTHER) |
|
521 |
- return; |
|
522 |
- if(conv->priority == CONTENT_TYPE) |
|
523 |
- return;/* Content-type in header is highest priority, no overrides possible*/ |
|
524 |
- if(conv->priority == BOM && prio == NOBOM_AUTODETECT) |
|
525 |
- return; |
|
526 |
- |
|
527 |
- tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/ |
|
528 |
- if(prio == META) { |
|
529 |
- old_size = encoding_bytes(conv->encoding,&tmp); |
|
530 |
- new_size = encoding_bytes(tmp_encoding,&tmp); |
|
531 |
- if(old_size != new_size) { |
|
532 |
- /* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */ |
|
533 |
- cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n",conv->encoding,(unsigned long)old_size,tmp_encoding,(unsigned long)new_size); |
|
534 |
- free(tmp_encoding); |
|
535 |
- return; |
|
536 |
- } |
|
537 |
- } |
|
538 |
- free(conv->encoding); |
|
539 |
- conv->encoding = tmp_encoding; |
|
540 |
- cli_dbgmsg("New encoding for %p:%s\n",(void*)conv,conv->encoding); |
|
541 |
- /* reset stream */ |
|
542 |
-} |
|
543 |
- |
|
544 | 537 |
static int encoding_norm_done(struct entity_conv* conv) |
545 | 538 |
{ |
546 | 539 |
if(conv->encoding) { |
... | ... |
@@ -567,7 +579,7 @@ int entity_norm_done(struct entity_conv* conv) |
567 | 567 |
{ |
568 | 568 |
return encoding_norm_done(conv); |
569 | 569 |
} |
570 |
- |
|
570 |
+#if 0 |
|
571 | 571 |
static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned char* outbuff) |
572 | 572 |
{ |
573 | 573 |
|
... | ... |
@@ -615,29 +627,25 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, int max_len, unsigned cha |
615 | 615 |
} |
616 | 616 |
} |
617 | 617 |
} |
618 |
+#endif |
|
618 | 619 |
|
619 |
-static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft) |
|
620 |
+static unsigned short bom_length(struct entity_conv* conv) |
|
620 | 621 |
{ |
621 | 622 |
if(conv->has_bom) { |
622 | 623 |
switch(conv->enc_bytes) { |
623 | 624 |
case 1: |
624 |
- if(conv->autodetected == UTF8) { |
|
625 |
- *in += 3; |
|
626 |
- *inleft -= 3; |
|
625 |
+ if(conv->encoding_symbolic == E_UTF8) { |
|
626 |
+ return 3; |
|
627 | 627 |
} |
628 | 628 |
break; |
629 | 629 |
case 2: |
630 |
- *in += 2; |
|
631 |
- *inleft -= 2; |
|
632 |
- break; |
|
630 |
+ return 2; |
|
633 | 631 |
case 4: |
634 |
- *in += 4; |
|
635 |
- *inleft -= 4; |
|
636 |
- break; |
|
632 |
+ return 4; |
|
637 | 633 |
} |
638 | 634 |
} |
635 |
+ return 0; |
|
639 | 636 |
} |
640 |
- |
|
641 | 637 |
/* sarge leaks on iconv_open/iconv_close, so let's not open/close so many times, |
642 | 638 |
* just keep on each thread its own pool of iconvs*/ |
643 | 639 |
|
... | ... |
@@ -653,16 +661,16 @@ static void iconv_cache_init(struct iconv_cache* cache) |
653 | 653 |
/* cache->tab = NULL; |
654 | 654 |
cache->len = 0; |
655 | 655 |
cache->used = 0; - already done by memset*/ |
656 |
- cli_dbgmsg("Initializing iconv pool:%p\n",(void*)cache); |
|
656 |
+ cli_dbgmsg(MODULE_NAME "Initializing iconv pool:%p\n",(void*)cache); |
|
657 | 657 |
hashtab_init(&cache->hashtab, 32); |
658 | 658 |
} |
659 | 659 |
|
660 | 660 |
static void iconv_cache_destroy(struct iconv_cache* cache) |
661 | 661 |
{ |
662 | 662 |
size_t i; |
663 |
- cli_dbgmsg("Destroying iconv pool:%p\n",(void*)cache); |
|
663 |
+ cli_dbgmsg(MODULE_NAME "Destroying iconv pool:%p\n",(void*)cache); |
|
664 | 664 |
for(i=0;i < cache->last;i++) { |
665 |
- cli_dbgmsg("closing iconv:%p\n",cache->tab[i]); |
|
665 |
+ cli_dbgmsg(MODULE_NAME "closing iconv:%p\n",cache->tab[i]); |
|
666 | 666 |
iconv_close(cache->tab[i]); |
667 | 667 |
} |
668 | 668 |
hashtab_clear(&cache->hashtab); |
... | ... |
@@ -702,9 +710,9 @@ static void iconv_pool_tls_key_alloc(void) |
702 | 702 |
{ |
703 | 703 |
pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy); |
704 | 704 |
if(!cache_atexit_registered) { |
705 |
- cli_dbgmsg("iconv:registering atexit\n"); |
|
705 |
+ cli_dbgmsg(MODULE_NAME "iconv:registering atexit\n"); |
|
706 | 706 |
if(atexit(iconv_cache_cleanup_main)) { |
707 |
- cli_dbgmsg("failed to register atexit\n"); |
|
707 |
+ cli_dbgmsg(MODULE_NAME "failed to register atexit\n"); |
|
708 | 708 |
} |
709 | 709 |
cache_atexit_registered = 1; |
710 | 710 |
} |
... | ... |
@@ -721,7 +729,7 @@ static inline struct iconv_cache* cache_get_tls_instance(void) |
721 | 721 |
if(!cache) { |
722 | 722 |
cache = cli_calloc(1,sizeof(*cache)); |
723 | 723 |
if(!cache) { |
724 |
- cli_dbgmsg("!Out of memory allocating TLS iconv instance\n"); |
|
724 |
+ cli_dbgmsg(MODULE_NAME "!Out of memory allocating TLS iconv instance\n"); |
|
725 | 725 |
return NULL; |
726 | 726 |
} |
727 | 727 |
iconv_cache_init(cache); |
... | ... |
@@ -772,7 +780,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode) |
772 | 772 |
init_iconv_pool_ifneeded(); |
773 | 773 |
cache = cache_get_tls_instance();/* gets TLS iconv pool */ |
774 | 774 |
if(!cache) { |
775 |
- cli_dbgmsg("!Unable to get TLS iconv cache!\n"); |
|
775 |
+ cli_dbgmsg(MODULE_NAME "!Unable to get TLS iconv cache!\n"); |
|
776 | 776 |
errno = EINVAL; |
777 | 777 |
return (iconv_t)-1; |
778 | 778 |
} |
... | ... |
@@ -784,7 +792,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode) |
784 | 784 |
if(e) { |
785 | 785 |
return cache->tab[e->data]; |
786 | 786 |
} |
787 |
- cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode); |
|
787 |
+ cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode); |
|
788 | 788 |
iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode); |
789 | 789 |
if(iconv_struct != (iconv_t)-1) { |
790 | 790 |
idx = cache->last++; |
... | ... |
@@ -792,7 +800,7 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode) |
792 | 792 |
cache->len += 16; |
793 | 793 |
cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0])); |
794 | 794 |
if(!cache->tab) { |
795 |
- cli_dbgmsg("!Out of mem in iconv-pool\n"); |
|
795 |
+ cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n"); |
|
796 | 796 |
errno = ENOMEM; |
797 | 797 |
return (iconv_t)-1; |
798 | 798 |
} |
... | ... |
@@ -800,12 +808,200 @@ static iconv_t iconv_open_cached(const unsigned char* fromcode) |
800 | 800 |
|
801 | 801 |
hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx); |
802 | 802 |
cache->tab[idx] = iconv_struct; |
803 |
- cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); |
|
803 |
+ cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); |
|
804 | 804 |
return cache->tab[idx]; |
805 | 805 |
} |
806 | 806 |
return (iconv_t)-1; |
807 | 807 |
} |
808 |
+#if 0 |
|
809 |
+struct buffer { |
|
810 |
+ unsigned char *buffer; |
|
811 |
+ size_t length; |
|
812 |
+ size_t offset; |
|
813 |
+ size_t filled; |
|
814 |
+}; |
|
815 |
+ |
|
816 |
+#define BUFFER_FILL(b, fill_func) \ |
|
817 |
+ if((b)->offset >= (b)->filled) {\ |
|
818 |
+ /* buffer empty, attempt to fill it*/\ |
|
819 |
+ if((fill_func) == -1) return -1;/* error encountered */\ |
|
820 |
+ if((b)->filled == 0) return 0;/* EOF */\ |
|
821 |
+ (b)->offset = 0;\ |
|
822 |
+ } |
|
823 |
+#endif |
|
824 |
+ |
|
825 |
+void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) |
|
826 |
+{ |
|
827 |
+ unsigned char *tmp_encoding; |
|
828 |
+ enum encodings tmp; |
|
829 |
+ size_t new_size,old_size; |
|
830 |
+ |
|
831 |
+ if(!encoding && prio == SWITCH_TO_BLOCKMODE) { |
|
832 |
+ if(conv->linemode) { |
|
833 |
+ cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); |
|
834 |
+ conv->linemode = 0; |
|
835 |
+ } |
|
836 |
+ return; |
|
837 |
+ } |
|
838 |
+ |
|
839 |
+ cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio); |
|
840 |
+ |
|
841 |
+ if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) { |
|
842 |
+ cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n"); |
|
843 |
+ return; |
|
844 |
+ /* Content-type in header is highest priority, no overrides possible. |
|
845 |
+ * Also no overrides after an encoding has been set.*/ |
|
846 |
+ } |
|
847 |
+ |
|
848 |
+ /* validate encoding name, and normalize to uppercase */ |
|
849 |
+ if(!(tmp_encoding = normalize_encoding(encoding))) { |
|
850 |
+ cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n"); |
|
851 |
+ return; |
|
852 |
+ } |
|
853 |
+ |
|
854 |
+ /* don't allow switching between unicode encodings that have different byte sizes */ |
|
855 |
+ if(prio == META) { |
|
856 |
+ /* need to consider minimum size of an encoding here */ |
|
857 |
+ old_size = conv->enc_bytes; |
|
858 |
+ new_size = encoding_bytes(tmp_encoding,&tmp); |
|
859 |
+ if(old_size != new_size) { |
|
860 |
+ /* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */ |
|
861 |
+ cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size); |
|
862 |
+ free(tmp_encoding); |
|
863 |
+ return; |
|
864 |
+ } |
|
865 |
+ } |
|
866 |
+ |
|
867 |
+ conv->encoding = tmp_encoding; |
|
868 |
+ cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding); |
|
869 |
+ *(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding ); |
|
870 |
+ if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) { |
|
871 |
+ cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding); |
|
872 |
+ /* message shown only once/file */ |
|
873 |
+ /* what can we do? short-circuit iconv */ |
|
874 |
+ free(conv->encoding); |
|
875 |
+ conv->encoding = NULL; |
|
876 |
+ /* we will process using whatever we currently have for encoding_symbolic. |
|
877 |
+ * If encoding was already set to iconv, we shouldn't be here.*/ |
|
878 |
+ assert(conv->encoding_symbolic != E_ICONV); |
|
879 |
+ } else { |
|
880 |
+ cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); |
|
881 |
+ conv->encoding_symbolic = E_ICONV; |
|
882 |
+ conv->priority = prio; |
|
883 |
+ conv->linemode = 0; |
|
884 |
+ } |
|
885 |
+} |
|
886 |
+ |
|
887 |
+static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area) |
|
888 |
+{ |
|
889 |
+ char tmp4[4]; |
|
890 |
+ size_t inleft = in_m_area->length - in_m_area->offset; |
|
891 |
+ size_t rc, alignfix; |
|
892 |
+ char* input = (char*)in_m_area->buffer + in_m_area->offset; |
|
893 |
+ size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/ |
|
894 |
+ char* out = (char*)out_m_area->buffer; |
|
895 |
+ |
|
896 |
+ |
|
897 |
+ /* convert encoding: in_m_area -> out_m_area */ |
|
898 |
+ alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, |
|
899 |
+ and we are using ucs4, ditto for utf16, and 1 byte*/ |
|
900 |
+ inleft -= alignfix; |
|
901 |
+ |
|
902 |
+ if(!inleft && alignfix) { |
|
903 |
+ /* EOF, and we have less than 4 bytes to convert */ |
|
904 |
+ memset(tmp4, 0, 4); |
|
905 |
+ memcpy(tmp4, input, alignfix); |
|
906 |
+ input = tmp4; |
|
907 |
+ inleft = 4; |
|
908 |
+ } |
|
909 |
+ |
|
910 |
+ rc = (size_t)-1; |
|
911 |
+ while (inleft && (outleft >= 2) && rc == (size_t)-1) { /* iconv doesn't like inleft to be 0 */ |
|
912 |
+ assert(*iconv_struct != (iconv_t)-1); |
|
913 |
+ rc = iconv(*iconv_struct, (char**) &input, &inleft, (char**) &out, &outleft); |
|
914 |
+ if(rc == (size_t)-1 && errno != E2BIG) { |
|
915 |
+ cli_dbgmsg("iconv error:%s, silently resuming (%lu, %lu, %ld, %ld)\n", |
|
916 |
+ strerror(errno), inleft, outleft, input - (char*)in_m_area->buffer, |
|
917 |
+ out - (char*)out_m_area->buffer); |
|
918 |
+ /* output raw byte, and resume at next byte */ |
|
919 |
+ if(outleft < 2) break; |
|
920 |
+ outleft -= 2; |
|
921 |
+ *out++ = 0; |
|
922 |
+ *out++ = *input++; |
|
923 |
+ inleft--; |
|
924 |
+ } |
|
925 |
+ } |
|
926 |
+ in_m_area->offset = in_m_area->length - inleft; |
|
927 |
+ if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) { |
|
928 |
+ out_m_area->length -= (off_t)outleft; |
|
929 |
+ } else { |
|
930 |
+ cli_dbgmsg(MODULE_NAME "outleft overflown, ignoring\n"); |
|
931 |
+ out_m_area->length = 0; |
|
932 |
+ } |
|
933 |
+ out_m_area->offset = 0; |
|
934 |
+ return 0; |
|
935 |
+} |
|
936 |
+#if 0 |
|
937 |
+/* processes @in buffer, and fills @out. Modifies offset of @in on exit. */ |
|
938 |
+static int u16_normalize (struct entity_conv* conv, struct buffer* in_buff, struct buffer* out_buff) |
|
939 |
+{ |
|
940 |
+ const unsigned char* in = in_buff->buffer; |
|
941 |
+ unsigned char* out = out_buff->buffer; |
|
942 |
+ const unsigned char* out_end = out + out_buff->length; |
|
943 |
+ |
|
944 |
+ do { |
|
945 |
+ size_t i; |
|
946 |
+ BUFFER_FILL(in_buff, in_iconv_u16(conv) ); |
|
947 |
+ |
|
948 |
+ for(i = in_buff->offset; (i < in_buff->filled) && (out < out_end); i += 2) { |
|
949 |
+ const uint16_t u16 = ( ((const uint16_t)in[i]) << 8 ) | in[i+1]; |
|
950 |
+ if(u16 > 0 && u16 < 0x80) { |
|
951 |
+ assert((unsigned char)u16 != 0); |
|
952 |
+ assert(out < out_end); |
|
953 |
+ *out++ = (unsigned char)u16; |
|
954 |
+ } |
|
955 |
+ else if (u16 == 160) {/*nbsp*/ |
|
956 |
+ assert(out < out_end); |
|
957 |
+ *out++ = 0x20; |
|
958 |
+ } |
|
959 |
+ else { |
|
960 |
+ const ssize_t max_num_length = 9; |
|
961 |
+ ssize_t printed; |
|
962 |
+ if((out_end - out) <= max_num_length) { |
|
963 |
+ /* prevent buffer overflow */ |
|
964 |
+ /* force exit out of while loop */ |
|
965 |
+ out_end = NULL; |
|
966 |
+ break; |
|
967 |
+ } |
|
968 |
+ assert(out + max_num_length < out_end); |
|
808 | 969 |
|
970 |
+ printed = snprintf((char*)out, max_num_length, "&#%d;", u16); |
|
971 |
+ if(printed > 0) { |
|
972 |
+ out += printed; |
|
973 |
+ } |
|
974 |
+ } |
|
975 |
+ } |
|
976 |
+ in_buff->offset = i; |
|
977 |
+ out_buff->filled = out - out_buff->buffer; |
|
978 |
+ out_buff->offset = 0; |
|
979 |
+ } while (out < out_end);/* if out not full, try to fill it */ |
|
980 |
+} |
|
981 |
+/* |
|
982 |
+ * We need a line-mode, which allows us to change the encoding, and |
|
983 |
+ * a block mode, that doesn't care about lines |
|
984 |
+ * |
|
985 |
+ * |
|
986 |
+ */ |
|
987 |
+/* |
|
988 |
+ * ASCII -> ascii_normalize |
|
989 |
+ * ANY -> iconv -> u16_normalize |
|
990 |
+ * UTF16 -> u16_normalize |
|
991 |
+ */ |
|
992 |
+ |
|
993 |
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen) |
|
994 |
+{ |
|
995 |
+ u16_normalize(conv, |
|
996 |
+} |
|
809 | 997 |
|
810 | 998 |
/* tmp_m_area and conv->out_area are of size maxlen */ |
811 | 999 |
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen) |
... | ... |
@@ -818,7 +1014,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
818 | 818 |
const size_t tmp_available = conv->buffer_size - tmp_move; |
819 | 819 |
const size_t max_read = maxlen < tmp_available ? maxlen : tmp_available; |
820 | 820 |
unsigned char* tmpbuff = &conv->tmp_area.buffer[tmp_move]; |
821 |
- |
|
821 |
+ |
|
822 | 822 |
const size_t out_move = conv->out_area.length < conv->out_area.offset ? 0 : conv->out_area.length - conv->out_area.offset; |
823 | 823 |
size_t outleft = conv->buffer_size - out_move; |
824 | 824 |
unsigned char* out = &conv->out_area.buffer[out_move]; |
... | ... |
@@ -849,60 +1045,6 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
849 | 849 |
|
850 | 850 |
tmpbuff = conv->tmp_area.buffer; |
851 | 851 |
inleft = conv->tmp_area.length; |
852 |
- if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */ |
|
853 |
- memcpy( conv->bom, tmpbuff, 4); |
|
854 |
- process_bom(conv); |
|
855 |
- process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT); |
|
856 |
- output_first(conv,&out,&tmpbuff,&inleft); |
|
857 |
- conv->bom_cnt++; |
|
858 |
- } |
|
859 |
- |
|
860 |
- /* convert encoding conv->tmp_area. conv->out_area */ |
|
861 |
- alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, |
|
862 |
- and we are using ucs4, ditto for utf16, and 1 byte*/ |
|
863 |
- inleft -= alignfix; |
|
864 |
- |
|
865 |
- if(!inleft && alignfix) { |
|
866 |
- size_t k; |
|
867 |
- for(k=0;k+alignfix < 4;k++) |
|
868 |
- tmpbuff[alignfix+k] = '\0'; |
|
869 |
- inleft = 4; |
|
870 |
- alignfix = -inleft; |
|
871 |
- } |
|
872 |
- |
|
873 |
- iconv_struct = iconv_open_cached(encoding_name(conv->encoding)); |
|
874 |
- |
|
875 |
- if(iconv_struct == (iconv_t)-1) { |
|
876 |
- cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding)); |
|
877 |
- /* message shown only once/file */ |
|
878 |
- /* what can we do? just fall back for it being an ISO-8859-1 */ |
|
879 |
- free(conv->encoding); |
|
880 |
- conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1"); |
|
881 |
- iconv_struct = iconv_open_cached(conv->encoding); |
|
882 |
- if(iconv_struct == (iconv_t)-1) { |
|
883 |
- cli_dbgmsg("fallback failed... bail out\n"); |
|
884 |
- return cli_readline(NULL,&conv->tmp_area,maxlen); |
|
885 |
- } |
|
886 |
- } |
|
887 |
- |
|
888 |
- if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ { |
|
889 |
- rc = iconv(iconv_struct, (char**) &tmpbuff, &inleft, (char**) &out, &outleft); |
|
890 |
- } |
|
891 |
- else |
|
892 |
- rc = 0; |
|
893 |
- |
|
894 |
-#if 0 |
|
895 |
- iconv_close(iconv_struct);/* - don't close, we are using a cached instance */ |
|
896 |
-#endif |
|
897 |
- |
|
898 |
- if(rc==(size_t)-1 && errno != E2BIG) { |
|
899 |
- cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%lu,%lu)\n",strerror(errno),(long)(out-conv->out_area.buffer),(long)(tmpbuff-conv->tmp_area.buffer),(unsigned long)inleft,(unsigned long)outleft); |
|
900 |
- /* output raw byte, and resume at next byte */ |
|
901 |
- *out++ = 0; |
|
902 |
- *out++ = *tmpbuff++; |
|
903 |
- inleft--; |
|
904 |
-/* return cli_readline(NULL, &conv->norm_area, maxlen);*/ |
|
905 |
- } |
|
906 | 852 |
|
907 | 853 |
conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0); |
908 | 854 |
conv->out_area.length = out - conv->out_area.buffer - out_move; |
... | ... |
@@ -980,4 +1122,186 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
980 | 980 |
return cli_readline(NULL, &conv->norm_area, maxlen); |
981 | 981 |
} |
982 | 982 |
} |
983 |
+#endif |
|
984 |
+ |
|
985 |
+static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, const ssize_t limit) |
|
986 |
+{ |
|
987 |
+ assert(limit > 0 && "u16_normalize must be called with positive limit"); |
|
988 |
+ /* \0 is just ignored */ |
|
989 |
+ if(u16 > 0 && u16 < 0xff) { |
|
990 |
+ assert((uint8_t)u16 != 0); |
|
991 |
+ *out++ = (uint8_t)u16; |
|
992 |
+ } |
|
993 |
+ else { |
|
994 |
+ /* normalize only >255 to speed up */ |
|
995 |
+ char buf[10]; |
|
996 |
+ const ssize_t max_num_length = sizeof(buf)-1; |
|
997 |
+ int i = sizeof(buf)-1; |
|
998 |
+ |
|
999 |
+ if(limit <= max_num_length) { |
|
1000 |
+ /* not enough space available */ |
|
1001 |
+ return NULL; |
|
1002 |
+ } |
|
1003 |
+ /* inline version of |
|
1004 |
+ * out += snprintf(out, max_num_length, "&#%d;", u16) */ |
|
1005 |
+ buf[i] = '\0'; |
|
1006 |
+ do { |
|
1007 |
+ buf[--i] = '0' + (u16 % 10); |
|
1008 |
+ u16 /= 10; |
|
1009 |
+ } while (u16 && i > 0); |
|
1010 |
+ *out++ = '&'; |
|
1011 |
+ *out++ = '#'; |
|
1012 |
+ while(buf[i]) *out++ = buf[i++]; |
|
1013 |
+ *out++ = ';'; |
|
1014 |
+ } |
|
1015 |
+ assert(out); |
|
1016 |
+ return out; |
|
1017 |
+} |
|
1018 |
+ |
|
1019 |
+#define NORMALIZE_CHAR(c, out, limit, linemode) \ |
|
1020 |
+{\ |
|
1021 |
+ if (linemode && c == '\n') {\ |
|
1022 |
+ i++;\ |
|
1023 |
+ break;\ |
|
1024 |
+ } else {\ |
|
1025 |
+ unsigned char* out_new = u16_normalize(c, out, limit);\ |
|
1026 |
+ if(out_new) {\ |
|
1027 |
+ limit -= out_new - out;\ |
|
1028 |
+ }\ |
|
1029 |
+ out = out_new;\ |
|
1030 |
+ }\ |
|
1031 |
+} |
|
1032 |
+ |
|
1033 |
+/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3 |
|
1034 |
+ * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */ |
|
1035 |
+#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit)) |
|
1036 |
+#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length)) |
|
1037 |
+ |
|
1038 |
+/* EOF marker is m_area->length == 0 */ |
|
1039 |
+ |
|
1040 |
+/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read. |
|
1041 |
+ * When we can't read anything due to EOF ->length will be set to 0. |
|
1042 |
+ * bounds checks offset and length*/ |
|
1043 |
+static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream) |
|
1044 |
+{ |
|
1045 |
+ if(!m_area) { |
|
1046 |
+ size_t iread; |
|
1047 |
+ |
|
1048 |
+ m_area = &conv->tmp_area; |
|
1049 |
+ if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) { |
|
1050 |
+ return m_area; |
|
1051 |
+ } |
|
1052 |
+ /* offset out of bounds -> all the buffer was processed, fill it again */ |
|
1053 |
+ iread = fread(m_area->buffer, 1, conv->buffer_size, stream); |
|
1054 |
+ m_area->length = LIMIT_LENGTH(iread, conv->buffer_size); |
|
1055 |
+ m_area->offset = 0; |
|
1056 |
+ if(ferror(stream)) { |
|
1057 |
+ cli_errmsg("Error while reading HTML stream\n"); |
|
1058 |
+ } |
|
1059 |
+ } else { |
|
1060 |
+ if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) { |
|
1061 |
+ cli_dbgmsg(MODULE_NAME "EOF reached\n"); |
|
1062 |
+ m_area->length = m_area->offset = 0; /* EOF marker */ |
|
1063 |
+ } |
|
1064 |
+ } |
|
1065 |
+ return m_area; |
|
1066 |
+} |
|
1067 |
+ |
|
1068 |
+static inline uint16_t get_u16(const unsigned char* buf, const size_t i) |
|
1069 |
+{ |
|
1070 |
+ return ((uint16_t)buf[i] << 8) | buf[i+1]; |
|
1071 |
+} |
|
1072 |
+ |
|
1073 |
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area) |
|
1074 |
+{ |
|
1075 |
+ unsigned char* out = conv->out_area.buffer; |
|
1076 |
+ if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) { |
|
1077 |
+ return NULL; |
|
1078 |
+ } |
|
1079 |
+ if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) { |
|
1080 |
+ /* error encountered */ |
|
1081 |
+ return NULL; |
|
1082 |
+ } |
|
1083 |
+ else { |
|
1084 |
+ const off_t input_limit = in_m_area->length; |
|
1085 |
+ const unsigned char* input = in_m_area->buffer; |
|
1086 |
+ off_t input_offset = in_m_area->offset; |
|
1087 |
+ off_t limit = conv->out_area.length - 1; |
|
1088 |
+ off_t limit_prev = limit; |
|
1089 |
+ off_t i = 0; |
|
1090 |
+ |
|
1091 |
+ /* read_raw() ensures this condition */ |
|
1092 |
+ assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset < input_limit)); |
|
1093 |
+ |
|
1094 |
+ if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */ |
|
1095 |
+ size_t bom_len; |
|
1096 |
+ memcpy(conv->bom, input, 4); |
|
1097 |
+ process_bom(conv); |
|
1098 |
+ bom_len = bom_length(conv); |
|
1099 |
+ in_m_area->offset = input_offset = input_offset + bom_len; |
|
1100 |
+ conv->bom_cnt = 1; |
|
1101 |
+ } |
|
1102 |
+ |
|
1103 |
+ if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) { |
|
1104 |
+ cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed); |
|
1105 |
+ conv->linemode = 0; |
|
1106 |
+ } |
|
1107 |
+ |
|
1108 |
+ switch(conv->encoding_symbolic) { |
|
1109 |
+ case E_ICONV:/* only in block-mode */ |
|
1110 |
+ /* normalize already converted characters from a previous pass |
|
1111 |
+ * (output buffer was full, and we couldn't normalize more in previous pass) */ |
|
1112 |
+ for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) { |
|
1113 |
+ const uint16_t c = get_u16(conv->norm_area.buffer, i); |
|
1114 |
+ NORMALIZE_CHAR(c, out, limit, 0); |
|
1115 |
+ } |
|
1116 |
+ conv->norm_area.offset = i; |
|
1117 |
+ if(limit > 0) { |
|
1118 |
+ conv->norm_area.length = conv->buffer_size; |
|
1119 |
+ in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area); |
|
1120 |
+ |
|
1121 |
+ /*in_iconv_u16 always fills entire norm_area buffer starting from 0. */ |
|
1122 |
+ for(i = 0;i < conv->norm_area.length && limit > 0 && out; i += 2) { |
|
1123 |
+ const uint16_t c = get_u16(conv->norm_area.buffer, i); |
|
1124 |
+ NORMALIZE_CHAR(c, out, limit, 0); |
|
1125 |
+ } |
|
1126 |
+ if(i) { |
|
1127 |
+ conv->norm_area.offset = i; |
|
1128 |
+ } |
|
1129 |
+ } |
|
1130 |
+ if(limit == limit_prev) { |
|
1131 |
+ /* output pointer didn't move => EOF */ |
|
1132 |
+ return NULL; |
|
1133 |
+ } |
|
1134 |
+ break; |
|
1135 |
+ /* out_area must have enough space to allow all bytes in norm_area normalized, |
|
1136 |
+ * if we norm with &x;, then we need 7* space. */ |
|
1137 |
+ default: |
|
1138 |
+ cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic); |
|
1139 |
+ conv->encoding_symbolic = E_OTHER; |
|
1140 |
+ case E_UNKNOWN: |
|
1141 |
+ case E_OTHER: |
|
1142 |
+ if(!input_limit) { |
|
1143 |
+ /* nothing to do, EOF */ |
|
1144 |
+ return NULL; |
|
1145 |
+ } |
|
1146 |
+ for(i = input_offset; i < input_limit && limit > 0 && out; i++) { |
|
1147 |
+ const uint16_t c = input[i]; |
|
1148 |
+ NORMALIZE_CHAR(c, out, limit, conv->linemode); |
|
1149 |
+ } |
|
1150 |
+ in_m_area->offset = i; |
|
1151 |
+ } |
|
1152 |
+ |
|
1153 |
+ |
|
1154 |
+ if(conv->linemode) { |
|
1155 |
+ conv->linemode_processed += i - input_offset; |
|
1156 |
+ } |
|
1157 |
+ |
|
1158 |
+ if(limit < 0) limit = 0; |
|
1159 |
+/* assert((unsigned)(conv->out_area.length - limit - 1) < conv->buffer_size); |
|
1160 |
+ assert(conv->out_area.length - limit - 1 >= 0); */ |
|
1161 |
+ conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0'; |
|
1162 |
+ return conv->out_area.buffer; |
|
1163 |
+ } |
|
1164 |
+} |
|
983 | 1165 |
|
... | ... |
@@ -25,33 +25,33 @@ |
25 | 25 |
|
26 | 26 |
#include "hashtab.h" |
27 | 27 |
|
28 |
-#define UCS4_1234 (const unsigned char*)"UCS-4LE" |
|
29 |
-#define UCS4_4321 (const unsigned char*)"UCS-4BE" |
|
30 |
-#define UCS4_2143 (const unsigned char*)"UCS4" |
|
31 |
-#define UCS4_3412 (const unsigned char*)"UCS-4" |
|
32 |
-#define UTF16_BE (const unsigned char*)"UTF-16BE" |
|
33 |
-#define UTF16_LE (const unsigned char*)"UTF-16LE" |
|
34 |
-#define UTF8 (const unsigned char*)"UTF-8" |
|
28 |
+#define UCS4_1234 "UCS-4LE" |
|
29 |
+#define UCS4_4321 "UCS-4BE" |
|
30 |
+#define UCS4_2143 "UCS4" |
|
31 |
+#define UCS4_3412 "UCS-4" |
|
32 |
+#define UTF16_BE "UTF-16BE" |
|
33 |
+#define UTF16_LE "UTF-16LE" |
|
34 |
+#define UTF8 "UTF-8" |
|
35 | 35 |
#define UNDECIDED_32_1234 UCS4_1234 |
36 | 36 |
#define UNDECIDED_32_4321 UCS4_4321 |
37 | 37 |
#define UNDECIDED_32_2143 UCS4_2143 |
38 | 38 |
#define UNDECIDED_32_3412 UCS4_3412 |
39 | 39 |
#define UNDECIDED_16_BE UTF16_BE |
40 | 40 |
#define UNDECIDED_16_LE UTF16_LE |
41 |
-#define UNDECIDED_8 (const unsigned char*)"ISO-8859-1" |
|
42 |
-#define EBCDIC (const unsigned char*)"EBCDIC-US" |
|
43 |
-#define UNKNOWN (const unsigned char*)"\0" |
|
44 |
-#define OTHER (const unsigned char*)"OTHER" |
|
41 |
+#define UNDECIDED_8 "ISO-8859-1" |
|
42 |
+#define EBCDIC "EBCDIC-US" |
|
43 |
+#define UNKNOWN "\0" |
|
44 |
+#define OTHER "OTHER" |
|
45 | 45 |
|
46 |
-enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META}; |
|
46 |
+enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE}; |
|
47 | 47 |
|
48 |
-enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2134,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8,E_UNKNOWN,E_OTHER}; |
|
48 |
+enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV}; |
|
49 | 49 |
#define MAX_ENTITY_SIZE 22 |
50 | 50 |
|
51 | 51 |
struct entity_conv { |
52 | 52 |
unsigned char* encoding; |
53 |
- const unsigned char* autodetected; |
|
54 | 53 |
enum encoding_priority priority; |
54 |
+ enum encodings encoding_symbolic; |
|
55 | 55 |
unsigned short int encoding_specific;/* sub-encoding, used for ISO*/ |
56 | 56 |
const struct hashtable* ht; |
57 | 57 |
uint8_t has_bom; |
... | ... |
@@ -60,26 +60,24 @@ struct entity_conv { |
60 | 60 |
uint8_t bom_cnt; |
61 | 61 |
uint32_t partial; |
62 | 62 |
unsigned char bom[4]; |
63 |
-#if 0 |
|
64 |
- char* buffer; |
|
65 |
- char* buffer2; |
|
66 |
-#endif |
|
67 | 63 |
size_t buffer_size; |
68 | 64 |
size_t buffer_cnt; |
69 | 65 |
uint8_t entity_buffcnt; |
66 |
+ void* iconv_struct; |
|
70 | 67 |
char entity_buff[MAX_ENTITY_SIZE+2]; |
71 | 68 |
m_area_t tmp_area; |
72 | 69 |
m_area_t out_area; |
73 | 70 |
m_area_t norm_area; |
74 | 71 |
int msg_zero_shown; |
72 |
+ int linemode;/* TODO:set */ |
|
73 |
+ int linemode_processed; |
|
75 | 74 |
}; |
76 | 75 |
|
77 |
- |
|
78 |
-int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size); |
|
76 |
+int init_entity_converter(struct entity_conv* conv, size_t buffer_size); |
|
79 | 77 |
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority); |
80 | 78 |
int entity_norm_done(struct entity_conv* conv); |
81 | 79 |
|
82 |
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen); |
|
80 |
+unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area); |
|
83 | 81 |
unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* entity); |
84 | 82 |
int entitynorm_init(void); |
85 | 83 |
|
... | ... |
@@ -185,37 +185,35 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
185 | 185 |
struct entity_conv conv; |
186 | 186 |
const size_t conv_size = 2*bread < 256 ? 256 : 2*bread; |
187 | 187 |
|
188 |
- if(init_entity_converter(&conv,UNKNOWN,conv_size) == 0) { |
|
189 |
- int end = 0; |
|
190 |
- m_area_t area; |
|
191 |
- area.buffer = (unsigned char *) smallbuff; |
|
192 |
- area.length = bread; |
|
193 |
- area.offset = 0; |
|
194 |
- |
|
195 |
- while(!end) { |
|
196 |
- if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN)) |
|
197 |
- return ret; |
|
198 |
- |
|
199 |
- decoded = encoding_norm_readline(&conv, NULL, &area, bread); |
|
200 |
- |
|
201 |
- if(decoded) { |
|
202 |
- sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL); |
|
203 |
- free(decoded); |
|
204 |
- if(sret == CL_TYPE_HTML) { |
|
205 |
- ret = CL_TYPE_HTML; |
|
206 |
- end = 1; |
|
188 |
+ /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/ |
|
189 |
+ if(init_entity_converter(&conv, conv_size) == 0) { |
|
190 |
+ m_area_t area; |
|
191 |
+ area.buffer = (unsigned char *) smallbuff; |
|
192 |
+ area.length = bread; |
|
193 |
+ area.offset = 0; |
|
194 |
+ |
|
195 |
+ /* switch to blockmode, so that we convert all the input buffer at once, |
|
196 |
+ * rather than line-by-line */ |
|
197 |
+ process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE); |
|
198 |
+ |
|
199 |
+ if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN)) |
|
200 |
+ return ret; |
|
201 |
+ |
|
202 |
+ decoded = encoding_norm_readline(&conv, NULL, &area); |
|
203 |
+ |
|
204 |
+ if(decoded) { |
|
205 |
+ sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL); |
|
206 |
+ if(sret == CL_TYPE_HTML) { |
|
207 |
+ ret = CL_TYPE_HTML; |
|
208 |
+ } |
|
207 | 209 |
} |
208 |
- } else |
|
209 |
- end = 1; |
|
210 | 210 |
|
211 |
- cli_ac_freedata(&mdata); |
|
212 |
- } |
|
213 |
- |
|
214 |
- entity_norm_done(&conv); |
|
211 |
+ cli_ac_freedata(&mdata); |
|
215 | 212 |
|
216 |
- } else { |
|
217 |
- cli_warnmsg("cli_filetype2: Error initializing entity converter\n"); |
|
218 |
- } |
|
213 |
+ entity_norm_done(&conv); |
|
214 |
+ } else { |
|
215 |
+ cli_warnmsg("cli_filetype2: Error initializing entity converter\n"); |
|
216 |
+ } |
|
219 | 217 |
} |
220 | 218 |
} |
221 | 219 |
} |
... | ... |
@@ -491,7 +491,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
491 | 491 |
} |
492 | 492 |
} |
493 | 493 |
|
494 |
- if(dconf_entconv && (rc = init_entity_converter(&conv, UNKNOWN, 16384) )) { |
|
494 |
+ if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) { |
|
495 | 495 |
if (!m_area) { |
496 | 496 |
fclose(stream_in); |
497 | 497 |
} |
... | ... |
@@ -502,7 +502,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
502 | 502 |
tag_args.tag = NULL; |
503 | 503 |
tag_args.value = NULL; |
504 | 504 |
tag_args.contents = NULL; |
505 |
- |
|
505 |
+ |
|
506 | 506 |
if (dirname) { |
507 | 507 |
snprintf(filename, 1024, "%s/rfc2397", dirname); |
508 | 508 |
if (mkdir(filename, 0700) && errno != EEXIST) { |
... | ... |
@@ -514,14 +514,14 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
514 | 514 |
file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
515 | 515 |
goto abort; |
516 | 516 |
} |
517 |
- |
|
517 |
+ |
|
518 | 518 |
file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t)); |
519 | 519 |
if (!file_buff_o2) { |
520 | 520 |
free(file_buff_o1); |
521 | 521 |
file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
522 | 522 |
goto abort; |
523 | 523 |
} |
524 |
- |
|
524 |
+ |
|
525 | 525 |
file_buff_script = (file_buff_t *) cli_malloc(sizeof(file_buff_t)); |
526 | 526 |
if (!file_buff_script) { |
527 | 527 |
free(file_buff_o1); |
... | ... |
@@ -529,7 +529,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
529 | 529 |
file_buff_o1 = file_buff_o2 = file_buff_script = NULL; |
530 | 530 |
goto abort; |
531 | 531 |
} |
532 |
- |
|
532 |
+ |
|
533 | 533 |
snprintf(filename, 1024, "%s/comment.html", dirname); |
534 | 534 |
file_buff_o1->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR); |
535 | 535 |
if (!file_buff_o1->fd) { |
... | ... |
@@ -574,12 +574,12 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
574 | 574 |
file_buff_o2 = NULL; |
575 | 575 |
file_buff_script = NULL; |
576 | 576 |
} |
577 |
- |
|
577 |
+ |
|
578 | 578 |
binary = FALSE; |
579 | 579 |
|
580 | 580 |
if(dconf_entconv) |
581 |
- ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192); |
|
582 |
- else |
|
581 |
+ ptr = line = encoding_norm_readline(&conv, stream_in, m_area); |
|
582 |
+ else |
|
583 | 583 |
ptr = line = cli_readline(stream_in, m_area, 8192); |
584 | 584 |
|
585 | 585 |
while (line) { |
... | ... |
@@ -766,7 +766,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
766 | 766 |
if (file_buff_o2 && (file_buff_o2->length > 0)) { |
767 | 767 |
file_buff_o2->length--; |
768 | 768 |
} |
769 |
- |
|
769 |
+ |
|
770 | 770 |
if (quoted != NOT_QUOTED) { |
771 | 771 |
html_output_c(file_buff_o1, file_buff_o2, '"'); |
772 | 772 |
} |
... | ... |
@@ -783,7 +783,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
783 | 783 |
if (file_buff_o2 && (file_buff_o2->length > 0)) { |
784 | 784 |
file_buff_o2->length--; |
785 | 785 |
} |
786 |
- |
|
786 |
+ |
|
787 | 787 |
if (quoted != NOT_QUOTED) { |
788 | 788 |
html_output_c(file_buff_o1, file_buff_o2, '"'); |
789 | 789 |
} |
... | ... |
@@ -832,7 +832,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
832 | 832 |
} |
833 | 833 |
ptr++; |
834 | 834 |
} else { |
835 |
- if (!escape && (quoted==DOUBLE_QUOTED)) { |
|
835 |
+ if (!escape && (quoted==DOUBLE_QUOTED)) { |
|
836 | 836 |
html_output_c(file_buff_o1, file_buff_o2, '"'); |
837 | 837 |
if (tag_val_length < HTML_STR_LENGTH) { |
838 | 838 |
tag_val[tag_val_length++] = '"'; |
... | ... |
@@ -880,7 +880,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
880 | 880 |
} |
881 | 881 |
ptr++; |
882 | 882 |
} |
883 |
- |
|
883 |
+ |
|
884 | 884 |
if (*ptr == '\\') { |
885 | 885 |
escape = TRUE; |
886 | 886 |
} else { |
... | ... |
@@ -899,7 +899,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
899 | 899 |
ptr++; |
900 | 900 |
break; |
901 | 901 |
case HTML_PROCESS_TAG: |
902 |
- |
|
902 |
+ |
|
903 | 903 |
/* Default to no action for this tag */ |
904 | 904 |
state = HTML_SKIP_WS; |
905 | 905 |
next_state = HTML_NORM; |
... | ... |
@@ -938,6 +938,9 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
938 | 938 |
in_script = TRUE; |
939 | 939 |
} |
940 | 940 |
html_output_tag(file_buff_script, tag, &tag_args); |
941 |
+ } else if (dconf_entconv && strcmp(tag, "body") == 0) { |
|
942 |
+ /* no more charset changes accepted after body encountered */ |
|
943 |
+ process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE); |
|
941 | 944 |
} else if (dconf_entconv && strcmp(tag, "meta") == 0) { |
942 | 945 |
const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv"); |
943 | 946 |
const unsigned char* http_content = html_tag_arg_value(&tag_args, "content"); |
... | ... |
@@ -953,7 +956,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
953 | 953 |
http_content2[i] = tolower(http_content[i]); |
954 | 954 |
http_content2[len] = '\0'; |
955 | 955 |
charset = (unsigned char*) strstr((char*)http_content2,"charset"); |
956 |
- if(charset) { |
|
956 |
+ if(charset) { |
|
957 | 957 |
while(*charset && *charset != '=') |
958 | 958 |
charset++; |
959 | 959 |
if(*charset) |
... | ... |
@@ -1011,8 +1014,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1011 | 1011 |
} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) { |
1012 | 1012 |
const unsigned char* arg_action_value = html_tag_arg_value(&tag_args,"action"); |
1013 | 1013 |
if (arg_action_value) { |
1014 |
- if(in_form_action) |
|
1015 |
- free(in_form_action); |
|
1014 |
+ if(in_form_action) |
|
1015 |
+ free(in_form_action); |
|
1016 | 1016 |
in_form_action = cli_strdup(arg_action_value); |
1017 | 1017 |
} |
1018 | 1018 |
} else if (strcmp(tag, "img") == 0) { |
... | ... |
@@ -1077,7 +1080,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1077 | 1077 |
in_form_action + strlen(in_form_action)); |
1078 | 1078 |
html_tag_contents_done(hrefs,hrefs->count); |
1079 | 1079 |
} |
1080 |
- } |
|
1080 |
+ } |
|
1081 | 1081 |
} |
1082 | 1082 |
/* TODO:imagemaps can have urls too */ |
1083 | 1083 |
} |
... | ... |
@@ -1123,7 +1126,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1123 | 1123 |
html_output_c(file_buff_o1, file_buff_o2, '&'); |
1124 | 1124 |
if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1125 | 1125 |
tag_val[tag_val_length++] = '&'; |
1126 |
- } |
|
1126 |
+ } |
|
1127 | 1127 |
for(i=0; i < entity_val_length; i++) { |
1128 | 1128 |
const char c = tolower(entity_val[i]); |
1129 | 1129 |
html_output_c(file_buff_o1, file_buff_o2, c); |
... | ... |
@@ -1266,7 +1269,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1266 | 1266 |
case 0x24: |
1267 | 1267 |
html_output_c(file_buff_o1, file_buff_o2, 0x40); |
1268 | 1268 |
html_output_c(file_buff_script, NULL, 0x40); |
1269 |
- break; |
|
1269 |
+ break; |
|
1270 | 1270 |
case 0x26: |
1271 | 1271 |
html_output_c(file_buff_o1, file_buff_o2, 0x0a); |
1272 | 1272 |
html_output_c(file_buff_script, NULL, 0x0a); |
... | ... |
@@ -1285,7 +1288,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1285 | 1285 |
ptr++; |
1286 | 1286 |
length--; |
1287 | 1287 |
break; |
1288 |
- |
|
1288 |
+ |
|
1289 | 1289 |
case HTML_RFC2397_TYPE: |
1290 | 1290 |
if (*ptr == '\'') { |
1291 | 1291 |
if (!escape && (quoted==SINGLE_QUOTED)) { |
... | ... |
@@ -1340,7 +1343,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1340 | 1340 |
escape = FALSE; |
1341 | 1341 |
next_state = HTML_BAD_STATE; |
1342 | 1342 |
ptr++; |
1343 |
- |
|
1343 |
+ |
|
1344 | 1344 |
} else { |
1345 | 1345 |
if (tag_val_length < HTML_STR_LENGTH) { |
1346 | 1346 |
tag_val[tag_val_length++] = tolower(*ptr); |
... | ... |
@@ -1370,7 +1373,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1370 | 1370 |
goto abort; |
1371 | 1371 |
} |
1372 | 1372 |
file_tmp_o1->length = 0; |
1373 |
- |
|
1373 |
+ |
|
1374 | 1374 |
html_output_str(file_tmp_o1, "From html-normalise\n", 20); |
1375 | 1375 |
html_output_str(file_tmp_o1, "Content-type: ", 14); |
1376 | 1376 |
if ((tag_val_length == 0) && (*tag_val == ';')) { |
... | ... |
@@ -1455,7 +1458,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1455 | 1455 |
html_output_c(file_tmp_o1, NULL, '%'); |
1456 | 1456 |
} |
1457 | 1457 |
state = HTML_RFC2397_DATA; |
1458 |
- break; |
|
1458 |
+ break; |
|
1459 | 1459 |
case HTML_ESCAPE_CHAR: |
1460 | 1460 |
value *= 16; |
1461 | 1461 |
length++; |
... | ... |
@@ -1472,22 +1475,23 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1472 | 1472 |
state = next_state; |
1473 | 1473 |
} |
1474 | 1474 |
ptr++; |
1475 |
- break; |
|
1475 |
+ break; |
|
1476 | 1476 |
} |
1477 | 1477 |
} |
1478 | 1478 |
if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) |
1479 | 1479 |
/* end of line, append contents now, resume on next line */ |
1480 | 1480 |
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr); |
1481 | 1481 |
ptrend = NULL; |
1482 |
- free(line); |
|
1483 |
- if(dconf_entconv) |
|
1484 |
- ptr = line = encoding_norm_readline(&conv, stream_in, m_area, 8192); |
|
1485 |
- else |
|
1486 |
- ptr = line = cli_readline(stream_in, m_area, 8192); |
|
1482 |
+ if(dconf_entconv) |
|
1483 |
+ ptr = line = encoding_norm_readline(&conv, stream_in, m_area); |
|
1484 |
+ else { |
|
1485 |
+ free(line); |
|
1486 |
+ ptr = line = cli_readline(stream_in, m_area, 8192); |
|
1487 |
+ } |
|
1487 | 1488 |
} |
1488 |
- |
|
1489 |
- if(dconf_entconv) { |
|
1490 |
- /* handle "unfinished" entitites */ |
|
1489 |
+ |
|
1490 |
+ if(dconf_entconv) { |
|
1491 |
+ /* handle "unfinished" entities */ |
|
1491 | 1492 |
size_t i; |
1492 | 1493 |
unsigned char* normalized; |
1493 | 1494 |
entity_val[entity_val_length] = '\0'; |