git-svn: trunk@3571
Török Edvin authored on 2008/02/02 04:38:52... | ... |
@@ -1,3 +1,12 @@ |
1 |
+Fri Feb 1 21:19:58 EET 2008 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav/filetypes.c: use entconv to detect UTF-16BE, and UCS-4 variants |
|
4 |
+ * libclamav/htmlnorm.c: use only cli_readline(); we don't need exact |
|
5 |
+ conversion |
|
6 |
+ * libclamav/entconv.c: |
|
7 |
+ * drop unused functions, |
|
8 |
+ * simplify encoding_norm_readline(), and rename to encoding_normalize_toascii() |
|
9 |
+ |
|
1 | 10 |
Fri Feb 1 00:58:05 CET 2008 (tk) |
2 | 11 |
--------------------------------- |
3 | 12 |
* libclamav: ndb sigs: add new target type (7) for ASCII files; handle |
... | ... |
@@ -127,67 +127,7 @@ const char* entity_norm(struct entity_conv* conv,const unsigned char* entity) |
127 | 127 |
return NULL; |
128 | 128 |
} |
129 | 129 |
|
130 |
-/* sane default, must be larger, than the longest possible return string, |
|
131 |
- * which is |
|
132 |
- * &#xxx;*/ |
|
133 |
-#define MIN_BUFFER_SIZE 32 |
|
134 |
- |
|
135 |
-#define LINEMODE_LIMIT 16384 |
|
136 |
- |
|
137 |
-int init_entity_converter(struct entity_conv* conv, size_t buffer_size) |
|
138 |
-{ |
|
139 |
- if(buffer_size < MIN_BUFFER_SIZE) { |
|
140 |
- cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE); |
|
141 |
- return CL_ENULLARG; |
|
142 |
- } |
|
143 |
- if(conv) { |
|
144 |
- conv->encoding = NULL; |
|
145 |
- conv->encoding_symbolic = E_UNKNOWN; |
|
146 |
- conv->bom_cnt = 0; |
|
147 |
- conv->buffer_size = buffer_size; |
|
148 |
- conv->priority = NOPRIO; |
|
149 |
- /* start in linemode */ |
|
150 |
- conv->linemode = 1; |
|
151 |
- conv->linemode_processed = 0; |
|
152 |
- |
|
153 |
- conv->tmp_area.offset = 0; |
|
154 |
- conv->tmp_area.length = 0; |
|
155 |
- conv->tmp_area.buffer = cli_malloc(buffer_size); |
|
156 |
- if(!conv->tmp_area.buffer) { |
|
157 |
- return CL_EMEM; |
|
158 |
- } |
|
159 |
- |
|
160 |
- conv->out_area.offset = 0; |
|
161 |
- conv->out_area.length = buffer_size; |
|
162 |
- conv->out_area.buffer = cli_malloc(buffer_size); |
|
163 |
- if(!conv->out_area.buffer) { |
|
164 |
- free(conv->tmp_area.buffer); |
|
165 |
- return CL_EMEM; |
|
166 |
- } |
|
167 |
- |
|
168 |
- conv->buffer_size = buffer_size; |
|
169 |
- conv->norm_area.offset = 0; |
|
170 |
- conv->norm_area.length = 0; |
|
171 |
- conv->norm_area.buffer = cli_malloc(buffer_size); |
|
172 |
- if(!conv->norm_area.buffer) { |
|
173 |
- free(conv->tmp_area.buffer); |
|
174 |
- free(conv->out_area.buffer); |
|
175 |
- return CL_EMEM; |
|
176 |
- } |
|
177 |
- |
|
178 |
- conv->iconv_struct = cli_calloc(1, sizeof(iconv_t)); |
|
179 |
- if(!conv->iconv_struct) { |
|
180 |
- free(conv->tmp_area.buffer); |
|
181 |
- free(conv->out_area.buffer); |
|
182 |
- free(conv->norm_area.buffer); |
|
183 |
- return CL_EMEM; |
|
184 |
- } |
|
185 |
- return 0; |
|
186 |
- } |
|
187 |
- else |
|
188 |
- return CL_ENULLARG; |
|
189 |
-} |
|
190 |
- |
|
130 |
+#ifndef HAVE_ICONV |
|
191 | 131 |
static size_t encoding_bytes(const char* fromcode, enum encodings* encoding) |
192 | 132 |
{ |
193 | 133 |
/* special case for these unusual byteorders */ |
... | ... |
@@ -217,7 +157,6 @@ static size_t encoding_bytes(const char* fromcode, enum encodings* encoding) |
217 | 217 |
} |
218 | 218 |
} |
219 | 219 |
|
220 |
-#ifndef HAVE_ICONV |
|
221 | 220 |
static iconv_t iconv_open(const char *tocode, const char* fromcode) |
222 | 221 |
{ |
223 | 222 |
iconv_t iconv = cli_malloc(sizeof(*iconv)); |
... | ... |
@@ -236,7 +175,6 @@ static int iconv_close(iconv_t cd) |
236 | 236 |
return 0; |
237 | 237 |
} |
238 | 238 |
|
239 |
- |
|
240 | 239 |
static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
241 | 240 |
char** outbuf, size_t *outbytesleft) |
242 | 241 |
{ |
... | ... |
@@ -426,14 +364,11 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
426 | 426 |
|
427 | 427 |
#endif |
428 | 428 |
|
429 |
-/* new iconv() version */ |
|
430 |
-static inline void process_bom(struct entity_conv* conv) |
|
429 |
+static inline const char* detect_encoding(const unsigned char* bom, uint8_t* bom_found, uint8_t* enc_width) |
|
431 | 430 |
{ |
432 |
- const unsigned char* bom = conv->bom; |
|
433 |
- const char* encoding = NULL; |
|
431 |
+ const char* encoding; |
|
434 | 432 |
int has_bom = 0; |
435 |
- uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/ |
|
436 |
- |
|
433 |
+ uint8_t enc_bytes = 1; /* default is UTF8, which has a minimum of 1 bytes */ |
|
437 | 434 |
/* undecided 32-bit encodings are treated as ucs4, and |
438 | 435 |
* 16 bit as utf16*/ |
439 | 436 |
switch(bom[0]) { |
... | ... |
@@ -442,23 +377,28 @@ static inline void process_bom(struct entity_conv* conv) |
442 | 442 |
if(bom[2] == 0xFE && bom[3] == 0xFF) { |
443 | 443 |
encoding = UCS4_1234;/* UCS-4 big-endian*/ |
444 | 444 |
has_bom = 1; |
445 |
+ enc_bytes = 4; |
|
445 | 446 |
} |
446 | 447 |
else if(bom[2] == 0xFF && bom[3] == 0xFE) { |
447 | 448 |
encoding = UCS4_2143;/* UCS-4 unusual order 2143 */ |
448 | 449 |
has_bom = 1; |
450 |
+ enc_bytes = 4; |
|
449 | 451 |
} |
450 | 452 |
else if(bom[2] == 0x00 && bom[3] == 0x3C) { |
451 | 453 |
/* undecided, treat as ucs4 */ |
452 | 454 |
encoding = UCS4_1234; |
455 |
+ enc_bytes = 4; |
|
453 | 456 |
} |
454 | 457 |
else if(bom[2] == 0x3C && bom[3] == 0x00) { |
455 | 458 |
encoding = UCS4_2143; |
459 |
+ enc_bytes = 4; |
|
456 | 460 |
} |
457 | 461 |
}/* 0x00 0x00 */ |
458 | 462 |
else if(bom[1] == 0x3C) { |
459 | 463 |
if(bom[2] == 0x00) { |
460 | 464 |
if(bom[3] == 0x00) { |
461 | 465 |
encoding = UCS4_3412; |
466 |
+ enc_bytes = 4; |
|
462 | 467 |
} |
463 | 468 |
else if(bom[3] == 0x3F) { |
464 | 469 |
encoding = UTF16_BE; |
... | ... |
@@ -471,6 +411,7 @@ static inline void process_bom(struct entity_conv* conv) |
471 | 471 |
if(bom[1] == 0xFE) { |
472 | 472 |
if(bom[2] == 0x00 && bom[3] == 0x00) { |
473 | 473 |
encoding = UCS4_4321; |
474 |
+ enc_bytes = 4; |
|
474 | 475 |
has_bom = 1; |
475 | 476 |
} |
476 | 477 |
else { |
... | ... |
@@ -484,6 +425,7 @@ static inline void process_bom(struct entity_conv* conv) |
484 | 484 |
if(bom[1] == 0xFF) { |
485 | 485 |
if(bom[2] == 0x00 && bom[3] == 0x00) { |
486 | 486 |
encoding = UCS4_3412; |
487 |
+ enc_bytes = 4; |
|
487 | 488 |
has_bom = 1; |
488 | 489 |
} |
489 | 490 |
else { |
... | ... |
@@ -504,6 +446,7 @@ static inline void process_bom(struct entity_conv* conv) |
504 | 504 |
if(bom[1] == 0x00) { |
505 | 505 |
if(bom[2] == 0x00 && bom[3] == 0x00) { |
506 | 506 |
encoding = UCS4_4321; |
507 |
+ enc_bytes = 4; |
|
507 | 508 |
} |
508 | 509 |
else if(bom[2] == 0x3F && bom[3] == 0x00) { |
509 | 510 |
encoding = UTF16_LE; |
... | ... |
@@ -523,12 +466,19 @@ static inline void process_bom(struct entity_conv* conv) |
523 | 523 |
}/*4C 6F A7 94*/ |
524 | 524 |
break; |
525 | 525 |
}/*switch*/ |
526 |
- if(encoding) { |
|
527 |
- cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding); |
|
528 |
- process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT); |
|
529 |
- } |
|
530 |
- conv->enc_bytes = enc_bytes; |
|
531 |
- conv->has_bom = has_bom; |
|
526 |
+ *enc_width = enc_bytes; |
|
527 |
+ *bom_found = has_bom; |
|
528 |
+ return encoding; |
|
529 |
+} |
|
530 |
+ |
|
531 |
+/* detects UTF-16(LE/BE), UCS-4(all 4 variants). |
|
532 |
+ * UTF-8 and simple ASCII are ignored, because we can process those as text */ |
|
533 |
+const char* encoding_detect_bom(const unsigned char* bom) |
|
534 |
+{ |
|
535 |
+ uint8_t has_bom; |
|
536 |
+ uint8_t enc_width; |
|
537 |
+ const char* encoding = detect_encoding(bom, &has_bom, &enc_width); |
|
538 |
+ return enc_width > 1 ? encoding : NULL; |
|
532 | 539 |
} |
533 | 540 |
|
534 | 541 |
/*()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/ |
... | ... |
@@ -575,53 +525,6 @@ static char* normalize_encoding(const unsigned char* enc) |
575 | 575 |
return norm; |
576 | 576 |
} |
577 | 577 |
|
578 |
-static int encoding_norm_done(struct entity_conv* conv) |
|
579 |
-{ |
|
580 |
- if(conv->encoding) { |
|
581 |
- free(conv->encoding); |
|
582 |
- conv->encoding = NULL; |
|
583 |
- } |
|
584 |
- conv->buffer_size = 0; |
|
585 |
- if(conv->tmp_area.buffer) { |
|
586 |
- free(conv->tmp_area.buffer); |
|
587 |
- conv->tmp_area.buffer = NULL; |
|
588 |
- } |
|
589 |
- if(conv->out_area.buffer) { |
|
590 |
- free(conv->out_area.buffer); |
|
591 |
- conv->out_area.buffer = NULL; |
|
592 |
- } |
|
593 |
- if(conv->norm_area.buffer) { |
|
594 |
- free(conv->norm_area.buffer); |
|
595 |
- conv->norm_area.buffer = NULL; |
|
596 |
- } |
|
597 |
- if(conv->iconv_struct) { |
|
598 |
- free(conv->iconv_struct); |
|
599 |
- } |
|
600 |
- return 0; |
|
601 |
-} |
|
602 |
- |
|
603 |
-int entity_norm_done(struct entity_conv* conv) |
|
604 |
-{ |
|
605 |
- return encoding_norm_done(conv); |
|
606 |
-} |
|
607 |
- |
|
608 |
-static unsigned short bom_length(struct entity_conv* conv) |
|
609 |
-{ |
|
610 |
- if(conv->has_bom) { |
|
611 |
- switch(conv->enc_bytes) { |
|
612 |
- case 1: |
|
613 |
- if(conv->encoding_symbolic == E_UTF8) { |
|
614 |
- return 3; |
|
615 |
- } |
|
616 |
- break; |
|
617 |
- case 2: |
|
618 |
- return 2; |
|
619 |
- case 4: |
|
620 |
- return 4; |
|
621 |
- } |
|
622 |
- } |
|
623 |
- return 0; |
|
624 |
-} |
|
625 | 578 |
/* sarge leaks on iconv_open/iconv_close, so let's not open/close so many times, |
626 | 579 |
* just keep on each thread its own pool of iconvs*/ |
627 | 580 |
|
... | ... |
@@ -774,99 +677,36 @@ static iconv_t iconv_open_cached(const char* fromcode) |
774 | 774 |
cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode); |
775 | 775 |
iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode); |
776 | 776 |
if(iconv_struct != (iconv_t)-1) { |
777 |
- idx = cache->last++; |
|
778 |
- if(idx >= cache->len) { |
|
779 |
- cache->len += 16; |
|
780 |
- cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0])); |
|
781 |
- if(!cache->tab) { |
|
782 |
- cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n"); |
|
783 |
- errno = ENOMEM; |
|
784 |
- return (iconv_t)-1; |
|
777 |
+ idx = cache->last++; |
|
778 |
+ if(idx >= cache->len) { |
|
779 |
+ cache->len += 16; |
|
780 |
+ cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0])); |
|
781 |
+ if(!cache->tab) { |
|
782 |
+ cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n"); |
|
783 |
+ errno = ENOMEM; |
|
784 |
+ return (iconv_t)-1; |
|
785 |
+ } |
|
785 | 786 |
} |
786 |
- } |
|
787 | 787 |
|
788 |
- hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx); |
|
788 |
+ hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx); |
|
789 | 789 |
cache->tab[idx] = iconv_struct; |
790 |
- cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); |
|
791 |
- return cache->tab[idx]; |
|
792 |
-} |
|
793 |
- return (iconv_t)-1; |
|
794 |
-} |
|
795 |
- |
|
796 |
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) |
|
797 |
-{ |
|
798 |
- char *tmp_encoding; |
|
799 |
- enum encodings tmp; |
|
800 |
- size_t new_size,old_size; |
|
801 |
- |
|
802 |
- if(!encoding && prio == SWITCH_TO_BLOCKMODE) { |
|
803 |
- if(conv->linemode) { |
|
804 |
- cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); |
|
805 |
- conv->linemode = 0; |
|
806 |
- } |
|
807 |
- return; |
|
808 |
- } |
|
809 |
- |
|
810 |
- cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio); |
|
811 |
- |
|
812 |
- if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) { |
|
813 |
- cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n"); |
|
814 |
- return; |
|
815 |
- /* Content-type in header is highest priority, no overrides possible. |
|
816 |
- * Also no overrides after an encoding has been set.*/ |
|
817 |
- } |
|
818 |
- |
|
819 |
- /* validate encoding name, and normalize to uppercase */ |
|
820 |
- if(!(tmp_encoding = normalize_encoding(encoding))) { |
|
821 |
- cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n"); |
|
822 |
- return; |
|
823 |
- } |
|
824 |
- |
|
825 |
- /* don't allow to change between unicode encodings that have different byte-size */ |
|
826 |
- if(prio == META) { |
|
827 |
- /* need to consider minimum size of an encoding here */ |
|
828 |
- old_size = conv->enc_bytes; |
|
829 |
- new_size = encoding_bytes(tmp_encoding,&tmp); |
|
830 |
- if(old_size != new_size) { |
|
831 |
- /* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */ |
|
832 |
- cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size); |
|
833 |
- free(tmp_encoding); |
|
834 |
- return; |
|
835 |
- } |
|
836 |
- } |
|
837 |
- |
|
838 |
- conv->encoding = tmp_encoding; |
|
839 |
- cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding); |
|
840 |
- *(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding ); |
|
841 |
- if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) { |
|
842 |
- cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding); |
|
843 |
- /* message shown only once/file */ |
|
844 |
- /* what can we do? short-circuit iconv */ |
|
845 |
- free(conv->encoding); |
|
846 |
- conv->encoding = NULL; |
|
847 |
- /* we will process using whatever we currently have for encoding_symbolic. |
|
848 |
- * If encoding was already set to iconv, we shouldn't be here.*/ |
|
849 |
- assert(conv->encoding_symbolic != E_ICONV); |
|
850 |
- } else { |
|
851 |
- cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); |
|
852 |
- conv->encoding_symbolic = E_ICONV; |
|
853 |
- conv->priority = prio; |
|
854 |
- conv->linemode = 0; |
|
790 |
+ cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); |
|
791 |
+ return cache->tab[idx]; |
|
855 | 792 |
} |
793 |
+ return (iconv_t)-1; |
|
856 | 794 |
} |
857 | 795 |
|
858 |
-static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area) |
|
796 |
+static int in_iconv_u16(const m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area) |
|
859 | 797 |
{ |
860 | 798 |
char tmp4[4]; |
861 | 799 |
size_t inleft = in_m_area->length - in_m_area->offset; |
862 | 800 |
size_t rc, alignfix; |
863 | 801 |
char* input = (char*)in_m_area->buffer + in_m_area->offset; |
864 |
- size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0;/*TODO: use real buffer size not last one*/ |
|
802 |
+ size_t outleft = out_m_area->length > 0 ? out_m_area->length : 0; |
|
865 | 803 |
char* out = (char*)out_m_area->buffer; |
866 | 804 |
|
805 |
+ out_m_area->offset = 0; |
|
867 | 806 |
if(!inleft) { |
868 |
- /* EOF */ |
|
869 |
- out_m_area->offset = out_m_area->length = 0; |
|
870 | 807 |
return 0; |
871 | 808 |
} |
872 | 809 |
/* convert encoding conv->tmp_area. conv->out_area */ |
... | ... |
@@ -886,7 +726,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou |
886 | 886 |
while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */ |
887 | 887 |
const size_t outleft_last = outleft; |
888 | 888 |
assert(*iconv_struct != (iconv_t)-1); |
889 |
- rc = iconv(*iconv_struct, (char**) &input, &inleft, (char**) &out, &outleft); |
|
889 |
+ rc = iconv(*iconv_struct, &input, &inleft, &out, &outleft); |
|
890 | 890 |
if(rc == (size_t)-1) { |
891 | 891 |
if(errno == E2BIG) { |
892 | 892 |
/* not enough space in output buffer */ |
... | ... |
@@ -909,9 +749,7 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou |
909 | 909 |
*out++ = *input++; |
910 | 910 |
inleft--; |
911 | 911 |
} |
912 |
- /* length - offset - alignfix is original value of inleft, new value is inleft, |
|
913 |
- * difference tells how much it moved. */ |
|
914 |
- in_m_area->offset = in_m_area->length - alignfix - inleft; |
|
912 |
+ cli_dbgmsg("in_iconv_u16: unprocessed bytes: %lu\n", (unsigned long)inleft); |
|
915 | 913 |
if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) { |
916 | 914 |
out_m_area->length -= (off_t)outleft; |
917 | 915 |
} else { |
... | ... |
@@ -922,156 +760,36 @@ static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* ou |
922 | 922 |
return 0; |
923 | 923 |
} |
924 | 924 |
|
925 |
- |
|
926 |
-#define NORMALIZE_CHAR(c, out, limit, linemode) \ |
|
927 |
-{\ |
|
928 |
- if (linemode && c == '\n') {\ |
|
929 |
- i++;\ |
|
930 |
- break;\ |
|
931 |
- } else {\ |
|
932 |
- unsigned char* out_new = u16_normalize(c, out, limit);\ |
|
933 |
- if(out_new) {\ |
|
934 |
- limit -= out_new - out;\ |
|
935 |
- }\ |
|
936 |
- out = out_new;\ |
|
937 |
- }\ |
|
938 |
-} |
|
939 |
- |
|
940 |
-/* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3 |
|
941 |
- * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */ |
|
942 |
-#define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit)) |
|
943 |
-#define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length)) |
|
944 |
- |
|
945 |
-/* EOF marker is m_area->length == 0 */ |
|
946 |
- |
|
947 |
-/* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read. |
|
948 |
- * When we can't read anything due to EOF ->length will be set to 0. |
|
949 |
- * bounds checks offset and length*/ |
|
950 |
-static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream) |
|
925 |
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area) |
|
951 | 926 |
{ |
952 |
- if(!m_area) { |
|
953 |
- size_t iread; |
|
927 |
+ iconv_t iconv_struct; |
|
928 |
+ off_t i, j; |
|
929 |
+ char *encoding; |
|
954 | 930 |
|
955 |
- m_area = &conv->tmp_area; |
|
956 |
- if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) { |
|
957 |
- return m_area; |
|
958 |
- } |
|
959 |
- /* offset out of bounds -> all the buffer was processed, fill it again */ |
|
960 |
- iread = fread(m_area->buffer, 1, conv->buffer_size, stream); |
|
961 |
- m_area->length = LIMIT_LENGTH(iread, conv->buffer_size); |
|
962 |
- m_area->offset = 0; |
|
963 |
- if(ferror(stream)) { |
|
964 |
- cli_errmsg("Error while reading HTML stream\n"); |
|
965 |
- } |
|
966 |
- } else { |
|
967 |
- if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) { |
|
968 |
- cli_dbgmsg(MODULE_NAME "EOF reached\n"); |
|
969 |
- m_area->offset = m_area->length; /* EOF marker */ |
|
970 |
- } |
|
931 |
+ if(!initial_encoding || !in_m_area || !out_m_area) { |
|
932 |
+ return CL_ENULLARG; |
|
971 | 933 |
} |
972 |
- return m_area; |
|
973 |
-} |
|
974 |
- |
|
975 |
-static inline uint16_t get_u16(const unsigned char* buf, const size_t i) |
|
976 |
-{ |
|
977 |
- return ((uint16_t)buf[i] << 8) | buf[i+1]; |
|
978 |
-} |
|
979 | 934 |
|
980 |
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area) |
|
981 |
-{ |
|
982 |
- unsigned char* out = conv->out_area.buffer; |
|
983 |
- if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) { |
|
984 |
- return NULL; |
|
985 |
- } |
|
986 |
- if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) { |
|
987 |
- /* error encountered */ |
|
988 |
- return NULL; |
|
935 |
+ encoding = normalize_encoding((const unsigned char*)initial_encoding); |
|
936 |
+ if(!encoding) { |
|
937 |
+ cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n"); |
|
938 |
+ return -1; |
|
989 | 939 |
} |
990 |
- else { |
|
991 |
- const off_t input_limit = in_m_area->length; |
|
992 |
- const unsigned char* input = in_m_area->buffer; |
|
993 |
- off_t input_offset = in_m_area->offset; |
|
994 |
- off_t limit = conv->out_area.length - 1; |
|
995 |
- off_t limit_prev = limit; |
|
996 |
- off_t i = 0; |
|
997 |
- |
|
998 |
- /* read_raw() ensures this condition */ |
|
999 |
- assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit)); |
|
1000 |
- |
|
1001 |
- if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */ |
|
1002 |
- size_t bom_len; |
|
1003 |
- memcpy(conv->bom, input, 4); |
|
1004 |
- process_bom(conv); |
|
1005 |
- bom_len = bom_length(conv); |
|
1006 |
- in_m_area->offset = input_offset = input_offset + bom_len; |
|
1007 |
- conv->bom_cnt = 1; |
|
1008 |
- } |
|
1009 |
- |
|
1010 |
- if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) { |
|
1011 |
- cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed); |
|
1012 |
- conv->linemode = 0; |
|
1013 |
- } |
|
1014 |
- |
|
1015 |
- switch(conv->encoding_symbolic) { |
|
1016 |
- case E_ICONV:/* only in block-mode */ |
|
1017 |
- /* normalize already converted characters from a previous pass |
|
1018 |
- * (output buffer was full, and we couldn't normalize more in previous pass) */ |
|
1019 |
- for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) { |
|
1020 |
- const uint16_t c = get_u16(conv->norm_area.buffer, i); |
|
1021 |
- NORMALIZE_CHAR(c, out, limit, 0); |
|
1022 |
- } |
|
1023 |
- conv->norm_area.offset = i; |
|
1024 |
- if(limit > 0) { |
|
1025 |
- conv->norm_area.length = conv->buffer_size; |
|
1026 |
- in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area); |
|
1027 |
- |
|
1028 |
- /*in_iconv_u16 always fills entire norm_area buffer starting from 0. */ |
|
1029 |
- for(i = 0;i < conv->norm_area.length && limit > 0 && out; i += 2) { |
|
1030 |
- const uint16_t c = get_u16(conv->norm_area.buffer, i); |
|
1031 |
- NORMALIZE_CHAR(c, out, limit, 0); |
|
1032 |
- } |
|
1033 |
- if(i) { |
|
1034 |
- conv->norm_area.offset = i; |
|
1035 |
- } |
|
1036 |
- } |
|
1037 |
- if(limit == limit_prev) { |
|
1038 |
- /* output pointer didn't move => EOF */ |
|
1039 |
- return NULL; |
|
1040 |
- } |
|
1041 |
- break; |
|
1042 |
- /* out_area must have enough space to allow all bytes in norm_area normalized, |
|
1043 |
- * if we norm with &x;, then we need 7* space. */ |
|
1044 |
- default: |
|
1045 |
- cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic); |
|
1046 |
- conv->encoding_symbolic = E_OTHER; |
|
1047 |
- case E_UNKNOWN: |
|
1048 |
- case E_OTHER: |
|
1049 |
- if(!input_limit || input_offset == input_limit) { |
|
1050 |
- /* nothing to do, EOF */ |
|
1051 |
- return NULL; |
|
1052 |
- } |
|
1053 |
- for(i = input_offset; i < input_limit && limit > 0; i++) { |
|
1054 |
- const unsigned char c = input[i]; |
|
1055 |
- if(conv->linemode && c == '\n') { |
|
1056 |
- i++; |
|
1057 |
- break; |
|
1058 |
- } |
|
1059 |
- if(c) { |
|
1060 |
- *out++ = c; |
|
1061 |
- limit--; |
|
1062 |
- } |
|
1063 |
- } |
|
1064 |
- in_m_area->offset = i; |
|
1065 |
- } |
|
1066 | 940 |
|
1067 |
- |
|
1068 |
- if(conv->linemode) { |
|
1069 |
- conv->linemode_processed += i - input_offset; |
|
941 |
+ cli_dbgmsg(MODULE_NAME "Encoding %s\n", encoding); |
|
942 |
+ iconv_struct = iconv_open_cached( encoding ); |
|
943 |
+ if(iconv_struct == (iconv_t)-1) { |
|
944 |
+ cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open(): %s\n", encoding); |
|
945 |
+ free(encoding); |
|
946 |
+ return -1; |
|
947 |
+ } |
|
948 |
+ in_iconv_u16(in_m_area, &iconv_struct, out_m_area); |
|
949 |
+ for(i = 0, j = 0; i < out_m_area->length ; i += 2) { |
|
950 |
+ const unsigned char c = (out_m_area->buffer[i] << 4) + out_m_area->buffer[i+1]; |
|
951 |
+ if(c) { |
|
952 |
+ out_m_area->buffer[j++] = c; |
|
1070 | 953 |
} |
1071 |
- |
|
1072 |
- if(limit < 0) limit = 0; |
|
1073 |
- conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0'; |
|
1074 |
- return conv->out_area.buffer; |
|
1075 | 954 |
} |
955 |
+ out_m_area->length = j; |
|
956 |
+ return 0; |
|
1076 | 957 |
} |
1077 |
- |
... | ... |
@@ -44,37 +44,20 @@ |
44 | 44 |
#define UNKNOWN "\0" |
45 | 45 |
#define OTHER "OTHER" |
46 | 46 |
|
47 |
+ |
|
47 | 48 |
enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META, SWITCH_TO_BLOCKMODE}; |
48 | 49 |
|
49 | 50 |
enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2143,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8, E_UNKNOWN,E_OTHER, E_ICONV}; |
50 | 51 |
#define MAX_ENTITY_SIZE 22 |
51 | 52 |
|
52 | 53 |
struct entity_conv { |
53 |
- char* encoding; |
|
54 |
- enum encoding_priority priority; |
|
55 |
- enum encodings encoding_symbolic; |
|
56 |
- size_t buffer_size; |
|
57 |
- void* iconv_struct; |
|
58 | 54 |
unsigned char entity_buff[MAX_ENTITY_SIZE+2]; |
59 |
- m_area_t tmp_area; |
|
60 |
- m_area_t out_area; |
|
61 |
- m_area_t norm_area; |
|
62 |
- int linemode;/* TODO:set */ |
|
63 |
- int linemode_processed; |
|
64 |
- unsigned char bom[4]; |
|
65 |
- uint8_t has_bom; |
|
66 |
- uint8_t enc_bytes; |
|
67 |
- uint8_t bom_cnt; |
|
68 | 55 |
}; |
69 | 56 |
|
70 |
-int init_entity_converter(struct entity_conv* conv, size_t buffer_size); |
|
71 |
-void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority); |
|
72 |
-int entity_norm_done(struct entity_conv* conv); |
|
73 |
- |
|
74 | 57 |
unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size); |
75 |
-unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area); |
|
76 | 58 |
const char* entity_norm(struct entity_conv* conv,const unsigned char* entity); |
77 |
-int entitynorm_init(void); |
|
59 |
+const char* encoding_detect_bom(const unsigned char* bom); |
|
60 |
+int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_encoding, m_area_t* out_m_area); |
|
78 | 61 |
|
79 | 62 |
#endif |
80 | 63 |
|
... | ... |
@@ -182,37 +182,42 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
182 | 182 |
cli_ac_freedata(&mdata); |
183 | 183 |
|
184 | 184 |
if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) { |
185 |
- struct entity_conv conv; |
|
186 |
- const size_t conv_size = 2*bread < 256 ? 256 : 2*bread; |
|
187 |
- |
|
188 |
- /* TODO: make detection via daily.ft, then we can get rid of line-mode entirely!*/ |
|
189 |
- if(init_entity_converter(&conv, conv_size) == 0) { |
|
190 |
- m_area_t area; |
|
191 |
- area.buffer = (unsigned char *) smallbuff; |
|
192 |
- area.length = bread; |
|
193 |
- area.offset = 0; |
|
194 |
- |
|
195 |
- /* switch to blockmode, so that we convert all the input buffer at once, |
|
196 |
- * rather than line-by-line */ |
|
197 |
- process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE); |
|
198 |
- |
|
199 |
- if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN)) |
|
200 |
- return ret; |
|
201 |
- |
|
202 |
- decoded = encoding_norm_readline(&conv, NULL, &area); |
|
203 |
- |
|
204 |
- if(decoded) { |
|
205 |
- sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL); |
|
206 |
- if(sret == CL_TYPE_HTML) { |
|
207 |
- ret = CL_TYPE_HTML; |
|
185 |
+ const char* encoding; |
|
186 |
+ |
|
187 |
+ /* check if we can autodetect this encoding. |
|
188 |
+ * If we can't, don't try to detect HTML sig, since |
|
189 |
+ * we just tried that above, and failed */ |
|
190 |
+ if((encoding = encoding_detect_bom(smallbuff))) { |
|
191 |
+ unsigned char decodedbuff[sizeof(smallbuff)*2]; |
|
192 |
+ m_area_t in_area, out_area; |
|
193 |
+ |
|
194 |
+ in_area.buffer = (unsigned char *) smallbuff; |
|
195 |
+ in_area.length = bread; |
|
196 |
+ in_area.offset = 0; |
|
197 |
+ out_area.buffer = decodedbuff; |
|
198 |
+ out_area.length = sizeof(decodedbuff); |
|
199 |
+ out_area.offset = 0; |
|
200 |
+ |
|
201 |
+ /* in htmlnorm we simply skip over \0 chars, which allows parsing HTML in any Unicode encoding |
|
202 |
+ * (multibyte characters will not be exactly handled, but that is not a problem). |
|
203 |
+ * However when detecting whether a file is HTML or not, we need exact conversion. |
|
204 |
+ * (just eliminating zeros and matching would introduce false positives) */ |
|
205 |
+ if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) { |
|
206 |
+ out_area.buffer[out_area.length] = '\0'; |
|
207 |
+ if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN)) |
|
208 |
+ return ret; |
|
209 |
+ |
|
210 |
+ if(out_area.length > 0) { |
|
211 |
+ sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL); |
|
212 |
+ if(sret == CL_TYPE_HTML) { |
|
213 |
+ cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n"); |
|
214 |
+ /* htmlnorm is able to handle any unicode now, since it skips null chars */ |
|
215 |
+ ret = CL_TYPE_HTML; |
|
216 |
+ } |
|
208 | 217 |
} |
209 |
- } |
|
210 | 218 |
|
211 |
- cli_ac_freedata(&mdata); |
|
212 |
- |
|
213 |
- entity_norm_done(&conv); |
|
214 |
- } else { |
|
215 |
- cli_warnmsg("cli_filetype2: Error initializing entity converter\n"); |
|
219 |
+ cli_ac_freedata(&mdata); |
|
220 |
+ } |
|
216 | 221 |
} |
217 | 222 |
} |
218 | 223 |
} |
... | ... |
@@ -542,13 +542,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
542 | 542 |
} |
543 | 543 |
} |
544 | 544 |
|
545 |
- if(dconf_entconv && (rc = init_entity_converter(&conv, 16384) )) { |
|
546 |
- if (!m_area) { |
|
547 |
- fclose(stream_in); |
|
548 |
- } |
|
549 |
- return rc; |
|
550 |
- } |
|
551 |
- |
|
552 | 545 |
tag_args.count = 0; |
553 | 546 |
tag_args.tag = NULL; |
554 | 547 |
tag_args.value = NULL; |
... | ... |
@@ -628,10 +621,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
628 | 628 |
|
629 | 629 |
binary = FALSE; |
630 | 630 |
|
631 |
- if(dconf_entconv) |
|
632 |
- ptr = line = encoding_norm_readline(&conv, stream_in, m_area); |
|
633 |
- else |
|
634 |
- ptr = line = cli_readchunk(stream_in, m_area, 8192); |
|
631 |
+ ptr = line = cli_readchunk(stream_in, m_area, 8192); |
|
635 | 632 |
|
636 | 633 |
while (line) { |
637 | 634 |
if(href_contents_begin) |
... | ... |
@@ -989,37 +979,6 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
989 | 989 |
in_script = TRUE; |
990 | 990 |
} |
991 | 991 |
html_output_tag(file_buff_script, tag, &tag_args); |
992 |
- } else if (dconf_entconv && strcmp(tag, "body") == 0) { |
|
993 |
- /* no more charset changes accepted after body encountered */ |
|
994 |
- process_encoding_set(&conv, NULL, SWITCH_TO_BLOCKMODE); |
|
995 |
- } else if (dconf_entconv && strcmp(tag, "meta") == 0) { |
|
996 |
- const unsigned char* http_equiv = html_tag_arg_value(&tag_args, "http-equiv"); |
|
997 |
- const unsigned char* http_content = html_tag_arg_value(&tag_args, "content"); |
|
998 |
- if(http_equiv && http_content && strcasecmp(http_equiv,"content-type") == 0) { |
|
999 |
- size_t len = strlen((const char*)http_content); |
|
1000 |
- unsigned char* http_content2 = cli_malloc( len + 1); |
|
1001 |
- unsigned char* charset; |
|
1002 |
- size_t i; |
|
1003 |
- |
|
1004 |
- if(!http_content2) |
|
1005 |
- return CL_EMEM; |
|
1006 |
- for(i = 0; i < len; i++) |
|
1007 |
- http_content2[i] = tolower(http_content[i]); |
|
1008 |
- http_content2[len] = '\0'; |
|
1009 |
- charset = (unsigned char*) strstr((char*)http_content2,"charset"); |
|
1010 |
- if(charset) { |
|
1011 |
- while(*charset && *charset != '=') |
|
1012 |
- charset++; |
|
1013 |
- if(*charset) |
|
1014 |
- charset++;/* skip = */ |
|
1015 |
- len = strcspn((const char*)charset," \"'"); |
|
1016 |
- charset[len] = '\0'; |
|
1017 |
- if(len) { |
|
1018 |
- process_encoding_set(&conv, charset, META); |
|
1019 |
- } |
|
1020 |
- } |
|
1021 |
- free(http_content2); |
|
1022 |
- } |
|
1023 | 992 |
} else if (hrefs) { |
1024 | 993 |
if(in_ahref && !href_contents_begin) |
1025 | 994 |
href_contents_begin=ptr; |
... | ... |
@@ -1533,12 +1492,8 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1533 | 1533 |
/* end of line, append contents now, resume on next line */ |
1534 | 1534 |
html_tag_contents_append(hrefs,in_ahref,href_contents_begin,ptr); |
1535 | 1535 |
ptrend = NULL; |
1536 |
- if(dconf_entconv) |
|
1537 |
- ptr = line = encoding_norm_readline(&conv, stream_in, m_area); |
|
1538 |
- else { |
|
1539 |
- free(line); |
|
1540 |
- ptr = line = cli_readchunk(stream_in, m_area, 8192); |
|
1541 |
- } |
|
1536 |
+ free(line); |
|
1537 |
+ ptr = line = cli_readchunk(stream_in, m_area, 8192); |
|
1542 | 1538 |
} |
1543 | 1539 |
|
1544 | 1540 |
if(dconf_entconv) { |
... | ... |
@@ -1566,8 +1521,6 @@ abort: |
1566 | 1566 |
if (in_ahref) /* tag not closed, force closing */ |
1567 | 1567 |
html_tag_contents_done(hrefs,in_ahref); |
1568 | 1568 |
|
1569 |
- if(dconf_entconv) |
|
1570 |
- entity_norm_done(&conv); |
|
1571 | 1569 |
html_tag_arg_free(&tag_args); |
1572 | 1570 |
if (!m_area) { |
1573 | 1571 |
fclose(stream_in); |
... | ... |
@@ -1593,11 +1546,11 @@ abort: |
1593 | 1593 |
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf) |
1594 | 1594 |
{ |
1595 | 1595 |
m_area_t m_area; |
1596 |
- |
|
1596 |
+ |
|
1597 | 1597 |
m_area.buffer = in_buff; |
1598 | 1598 |
m_area.length = in_size; |
1599 | 1599 |
m_area.offset = 0; |
1600 |
- |
|
1600 |
+ |
|
1601 | 1601 |
return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf); |
1602 | 1602 |
} |
1603 | 1603 |
|
... | ... |
@@ -1607,7 +1560,7 @@ int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const |
1607 | 1607 |
int retval=FALSE; |
1608 | 1608 |
m_area_t m_area; |
1609 | 1609 |
struct stat statbuf; |
1610 |
- |
|
1610 |
+ |
|
1611 | 1611 |
if (fstat(fd, &statbuf) == 0) { |
1612 | 1612 |
m_area.length = statbuf.st_size; |
1613 | 1613 |
m_area.buffer = (unsigned char *) mmap(NULL, m_area.length, PROT_READ, MAP_PRIVATE, fd, 0); |
... | ... |
@@ -36,7 +36,7 @@ typedef struct m_area_tag { |
36 | 36 |
} m_area_t; |
37 | 37 |
|
38 | 38 |
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf); |
39 |
-int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf); |
|
39 |
+int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf* dconf); |
|
40 | 40 |
void html_tag_arg_free(tag_arguments_t *tags); |
41 | 41 |
int html_screnc_decode(int fd, const char *dirname); |
42 | 42 |
|