git-svn: trunk@2675
Tomasz Kojm authored on 2007/02/08 01:23:40... | ... |
@@ -1,3 +1,11 @@ |
1 |
+Wed Feb 7 17:20:12 CET 2007 (tk) |
|
2 |
+--------------------------------- |
|
3 |
+ * libclamav: apply entconv patch from Edwin: |
|
4 |
+ - workaround sarge libc leak using a per-thread cache |
|
5 |
+ - normalize <0x20 chars too |
|
6 |
+ - fix utf-16 double-decoding |
|
7 |
+ - fix performance issue with some encodings |
|
8 |
+ |
|
1 | 9 |
Sun Feb 4 17:58:16 CET 2007 (tk) |
2 | 10 |
--------------------------------- |
3 | 11 |
* libclamav: remove some warnings from gcc |
... | ... |
@@ -32,6 +32,11 @@ |
32 | 32 |
#include <dirent.h> |
33 | 33 |
#include <errno.h> |
34 | 34 |
|
35 |
+ |
|
36 |
+#ifdef CL_THREAD_SAFE |
|
37 |
+#include <pthread.h> |
|
38 |
+#endif |
|
39 |
+ |
|
35 | 40 |
#include "clamav.h" |
36 | 41 |
#include "others.h" |
37 | 42 |
#include "htmlnorm.h" |
... | ... |
@@ -41,9 +46,9 @@ |
41 | 41 |
|
42 | 42 |
#ifdef HAVE_ICONV_H |
43 | 43 |
#include <iconv.h> |
44 |
-#else |
|
45 |
-#include "encoding_aliases.h" |
|
46 | 44 |
#endif |
45 |
+#include "encoding_aliases.h" |
|
46 |
+ |
|
47 | 47 |
|
48 | 48 |
#define MAX_LINE 1024 |
49 | 49 |
|
... | ... |
@@ -136,6 +141,7 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding |
136 | 136 |
} |
137 | 137 |
|
138 | 138 |
conv->ht = &entities_htable; |
139 |
+ conv->msg_zero_shown = 0; |
|
139 | 140 |
|
140 | 141 |
return 0; |
141 | 142 |
} |
... | ... |
@@ -143,55 +149,57 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding |
143 | 143 |
return CL_ENULLARG; |
144 | 144 |
} |
145 | 145 |
|
146 |
-#ifndef HAVE_ICONV_H |
|
147 |
- |
|
148 |
-typedef struct { |
|
149 |
- enum encodings encoding; |
|
150 |
- size_t size; |
|
151 |
-} * iconv_t; |
|
152 |
- |
|
153 |
-static iconv_t iconv_open(const char *tocode, const char *fromcode) |
|
146 |
+static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* encoding) |
|
154 | 147 |
{ |
155 | 148 |
const unsigned char* from = (const unsigned char*) fromcode; |
156 |
- iconv_t iconv = cli_malloc(sizeof(*iconv)); |
|
157 |
- if(!iconv) |
|
158 |
- return NULL; |
|
159 |
- iconv->encoding = E_OTHER; |
|
160 |
- iconv->size = 1; |
|
161 |
- /*TODO: check that tocode is UTF16BE */ |
|
162 | 149 |
/* special case for these unusual byteorders */ |
150 |
+ *encoding=E_OTHER; |
|
163 | 151 |
if(from == UCS4_2143) |
164 |
- iconv->encoding = E_UCS4_2134; |
|
152 |
+ *encoding = E_UCS4_2134; |
|
165 | 153 |
else if (from == UCS4_3412) |
166 |
- iconv->encoding = E_UCS4_3412; |
|
154 |
+ *encoding = E_UCS4_3412; |
|
167 | 155 |
else { |
168 |
- struct element * e = hashtab_find(&aliases_htable,from,strlen(fromcode)); |
|
156 |
+ struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode)); |
|
169 | 157 |
if(e && e->key) { |
170 |
- iconv->encoding = e->data; |
|
158 |
+ *encoding = e->data; |
|
171 | 159 |
} |
172 | 160 |
} |
173 | 161 |
|
174 |
- switch(iconv->encoding) { |
|
162 |
+ switch(*encoding) { |
|
175 | 163 |
case E_UCS4: |
176 | 164 |
case E_UCS4_1234: |
177 | 165 |
case E_UCS4_4321: |
178 | 166 |
case E_UCS4_2134: |
179 | 167 |
case E_UCS4_3412: |
180 |
- iconv->size = 4; |
|
181 |
- break; |
|
168 |
+ return 4; |
|
182 | 169 |
case E_UTF16: |
183 | 170 |
case E_UTF16_BE: |
184 | 171 |
case E_UTF16_LE: |
185 |
- iconv->size = 2; |
|
186 |
- break; |
|
172 |
+ return 2; |
|
187 | 173 |
case E_UTF8: |
188 | 174 |
case E_UNKNOWN: |
189 | 175 |
case E_OTHER: |
190 | 176 |
default: |
191 |
- iconv->size = 1; |
|
177 |
+ return 1; |
|
192 | 178 |
} |
179 |
+ } |
|
180 |
+ |
|
181 |
+#ifndef HAVE_ICONV_H |
|
182 |
+typedef struct { |
|
183 |
+ enum encodings encoding; |
|
184 |
+ size_t size; |
|
185 |
+} * iconv_t; |
|
186 |
+ |
|
187 |
+static iconv_t iconv_open(const char *tocode, const char* fromcode) |
|
188 |
+{ |
|
189 |
+ iconv_t iconv = cli_malloc(sizeof(*iconv)); |
|
190 |
+ if(!iconv) |
|
191 |
+ return NULL; |
|
192 |
+ /* TODO: check that tocode is UTF16BE */ |
|
193 |
+ iconv->size = encoding_bytes(fromcode,&iconv->encoding); |
|
193 | 194 |
return iconv; |
194 | 195 |
} |
196 |
+} |
|
195 | 197 |
|
196 | 198 |
static int iconv_close(iconv_t cd) |
197 | 199 |
{ |
... | ... |
@@ -379,6 +387,10 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft, |
379 | 379 |
return 0; |
380 | 380 |
} |
381 | 381 |
|
382 |
+#else |
|
383 |
+ |
|
384 |
+ |
|
385 |
+ |
|
382 | 386 |
#endif |
383 | 387 |
|
384 | 388 |
/* new iconv() version */ |
... | ... |
@@ -495,18 +507,20 @@ static unsigned char* normalize_encoding(const unsigned char* enc) |
495 | 495 |
return norm; |
496 | 496 |
} |
497 | 497 |
|
498 |
-static const char* encoding_name(unsigned char* encoding) |
|
498 |
+static const unsigned char* encoding_name(unsigned char* encoding) |
|
499 | 499 |
{ |
500 | 500 |
if(!encoding) |
501 |
- return "ISO-8859-1"; |
|
501 |
+ return (const unsigned char*)"ISO-8859-1"; |
|
502 | 502 |
else |
503 |
- return (char*)encoding; |
|
503 |
+ return encoding; |
|
504 | 504 |
} |
505 | 505 |
|
506 |
- |
|
507 |
- |
|
508 | 506 |
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) |
509 | 507 |
{ |
508 |
+ unsigned char *tmp_encoding; |
|
509 |
+ enum encodings tmp; |
|
510 |
+ size_t new_size,old_size; |
|
511 |
+ |
|
510 | 512 |
cli_dbgmsg("Setting encoding for %x to %s, priority: %d\n",conv, encoding, prio); |
511 | 513 |
if(encoding == OTHER) |
512 | 514 |
return; |
... | ... |
@@ -514,8 +528,17 @@ void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding |
514 | 514 |
return;/* Content-type in header is highest priority, no overrides possible*/ |
515 | 515 |
if(conv->priority == BOM && prio == NOBOM_AUTODETECT) |
516 | 516 |
return; |
517 |
+ |
|
518 |
+ tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/ |
|
519 |
+ old_size = encoding_bytes(conv->encoding,&tmp); |
|
520 |
+ new_size = encoding_bytes(tmp_encoding,&tmp); |
|
521 |
+ if(old_size != new_size) { |
|
522 |
+ cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%ld) != %s(%ld)\n",conv->encoding,old_size,tmp_encoding,new_size); |
|
523 |
+ free(tmp_encoding); |
|
524 |
+ return; |
|
525 |
+ } |
|
517 | 526 |
free(conv->encoding); |
518 |
- conv->encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/ |
|
527 |
+ conv->encoding = tmp_encoding; |
|
519 | 528 |
cli_dbgmsg("New encoding for %x:%s\n",conv,conv->encoding); |
520 | 529 |
/* reset stream */ |
521 | 530 |
} |
... | ... |
@@ -595,24 +618,192 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, unsigned int max_len, uns |
595 | 595 |
} |
596 | 596 |
} |
597 | 597 |
|
598 |
-static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in) |
|
598 |
+static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft) |
|
599 | 599 |
{ |
600 | 600 |
if(conv->has_bom) { |
601 | 601 |
switch(conv->enc_bytes) { |
602 | 602 |
case 1: |
603 |
- if(conv->autodetected == UTF8) |
|
603 |
+ if(conv->autodetected == UTF8) { |
|
604 | 604 |
*in += 3; |
605 |
+ *inleft -= 3; |
|
606 |
+ } |
|
605 | 607 |
break; |
606 | 608 |
case 2: |
607 | 609 |
*in += 2; |
610 |
+ *inleft -= 2; |
|
608 | 611 |
break; |
609 | 612 |
case 4: |
610 | 613 |
*in += 4; |
614 |
+ *inleft -= 4; |
|
611 | 615 |
break; |
612 | 616 |
} |
613 | 617 |
} |
614 | 618 |
} |
615 | 619 |
|
620 |
+/* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times, |
|
621 |
+ * just keep on each thread its own pool of iconvs*/ |
|
622 |
+ |
|
623 |
+struct iconv_cache { |
|
624 |
+ iconv_t* tab; |
|
625 |
+ size_t len; |
|
626 |
+ size_t last; |
|
627 |
+ struct hashtable hashtab; |
|
628 |
+}; |
|
629 |
+ |
|
630 |
+static void iconv_cache_init(struct iconv_cache* cache) |
|
631 |
+{ |
|
632 |
+/* cache->tab = NULL; |
|
633 |
+ cache->len = 0; |
|
634 |
+ cache->used = 0; - already done by memset*/ |
|
635 |
+ cli_dbgmsg("Initializing iconv pool:%p\n",cache); |
|
636 |
+ hashtab_init(&cache->hashtab, 32); |
|
637 |
+} |
|
638 |
+ |
|
639 |
+static void iconv_cache_destroy(struct iconv_cache* cache) |
|
640 |
+{ |
|
641 |
+ size_t i; |
|
642 |
+ cli_dbgmsg("Destroying iconv pool:%p\n",cache); |
|
643 |
+ for(i=0;i < cache->last;i++) { |
|
644 |
+ cli_dbgmsg("closing iconv:%p\n",cache->tab[i]); |
|
645 |
+ iconv_close(cache->tab[i]); |
|
646 |
+ } |
|
647 |
+ hashtab_clear(&cache->hashtab); |
|
648 |
+ free(cache->hashtab.htable); |
|
649 |
+ free(cache->tab); |
|
650 |
+ free(cache); |
|
651 |
+} |
|
652 |
+ |
|
653 |
+ |
|
654 |
+#ifdef CL_THREAD_SAFE |
|
655 |
+static pthread_key_t iconv_pool_tls_key; |
|
656 |
+static pthread_once_t iconv_pool_tls_key_once = PTHREAD_ONCE_INIT; |
|
657 |
+ |
|
658 |
+/* destructor called for all threads that exit via pthread_exit, or cancellation. Unfortunately that doesn't include |
|
659 |
+ * the main thread, so we have to call this manually for the main thread.*/ |
|
660 |
+ |
|
661 |
+static int cache_atexit_registered = 0; |
|
662 |
+ |
|
663 |
+static void iconv_pool_tls_instance_destroy(void* ptr) |
|
664 |
+{ |
|
665 |
+ if(ptr) { |
|
666 |
+ iconv_cache_destroy(ptr); |
|
667 |
+ } |
|
668 |
+} |
|
669 |
+ |
|
670 |
+static void iconv_cache_cleanup_main(void) |
|
671 |
+{ |
|
672 |
+ struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key); |
|
673 |
+ if(cache) { |
|
674 |
+ iconv_pool_tls_instance_destroy(cache); |
|
675 |
+ pthread_setspecific(iconv_pool_tls_key,NULL); |
|
676 |
+ } |
|
677 |
+ pthread_key_delete(iconv_pool_tls_key); |
|
678 |
+} |
|
679 |
+ |
|
680 |
+static void iconv_pool_tls_key_alloc(void) |
|
681 |
+{ |
|
682 |
+ pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy); |
|
683 |
+ if(!cache_atexit_registered) { |
|
684 |
+ cli_dbgmsg("iconv:registering atexit\n"); |
|
685 |
+ if(atexit(iconv_cache_cleanup_main)) { |
|
686 |
+ cli_dbgmsg("failed to register atexit\n"); |
|
687 |
+ } |
|
688 |
+ cache_atexit_registered = 1; |
|
689 |
+ } |
|
690 |
+} |
|
691 |
+ |
|
692 |
+static void init_iconv_pool_ifneeded(void) |
|
693 |
+{ |
|
694 |
+ pthread_once(&iconv_pool_tls_key_once, iconv_pool_tls_key_alloc); |
|
695 |
+} |
|
696 |
+ |
|
697 |
+static inline struct iconv_cache* cache_get_tls_instance(void) |
|
698 |
+{ |
|
699 |
+ struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key); |
|
700 |
+ if(!cache) { |
|
701 |
+ cache = cli_calloc(1,sizeof(*cache)); |
|
702 |
+ if(!cache) { |
|
703 |
+ cli_dbgmsg("!Out of memory allocating TLS iconv instance\n"); |
|
704 |
+ return NULL; |
|
705 |
+ } |
|
706 |
+ iconv_cache_init(cache); |
|
707 |
+ pthread_setspecific(iconv_pool_tls_key, cache); |
|
708 |
+ } |
|
709 |
+ return cache; |
|
710 |
+} |
|
711 |
+ |
|
712 |
+#else |
|
713 |
+ |
|
714 |
+static struct iconv_cache* global_iconv_cache = NULL; |
|
715 |
+static int iconv_global_inited = 0; |
|
716 |
+ |
|
717 |
+ |
|
718 |
+static void iconv_cache_cleanup_main(void) |
|
719 |
+{ |
|
720 |
+ iconv_cache_destroy(global_iconv_cache); |
|
721 |
+} |
|
722 |
+ |
|
723 |
+static inline void init_iconv_pool_ifneeded() |
|
724 |
+{ |
|
725 |
+ if(!iconv_global_inited) { |
|
726 |
+ global_iconv_cache = cli_calloc(1,sizeof(*global_iconv_cache)); |
|
727 |
+ if(global_iconv_cache) { |
|
728 |
+ iconv_cache_init(global_iconv_cache); |
|
729 |
+ atexit(iconv_cache_cleanup_main); |
|
730 |
+ iconv_global_inited = 1; |
|
731 |
+ } |
|
732 |
+ } |
|
733 |
+} |
|
734 |
+ |
|
735 |
+ |
|
736 |
+static inline struct iconv_cache* cache_get_tls_instance(void) |
|
737 |
+{ |
|
738 |
+ return global_iconv_cache; |
|
739 |
+} |
|
740 |
+ |
|
741 |
+#endif |
|
742 |
+ |
|
743 |
+static iconv_t iconv_open_cached(const unsigned char* fromcode) |
|
744 |
+{ |
|
745 |
+ struct iconv_cache * cache; |
|
746 |
+ size_t idx; |
|
747 |
+ const size_t fromcode_len = strlen((const char*)fromcode); |
|
748 |
+ struct element * e; |
|
749 |
+ |
|
750 |
+ init_iconv_pool_ifneeded(); |
|
751 |
+ cache = cache_get_tls_instance();/* gets TLS iconv pool */ |
|
752 |
+ if(!cache) { |
|
753 |
+ cli_dbgmsg("!Unable to get TLS iconv cache!\n"); |
|
754 |
+ errno = EINVAL; |
|
755 |
+ return (iconv_t)-1; |
|
756 |
+ } |
|
757 |
+ |
|
758 |
+ e = hashtab_find(&cache->hashtab, fromcode, fromcode_len); |
|
759 |
+ if(e && (e->data < 0 || (size_t)e->data > cache->len)) { |
|
760 |
+ e = NULL; |
|
761 |
+ } |
|
762 |
+ if(e) { |
|
763 |
+ return cache->tab[e->data]; |
|
764 |
+ } |
|
765 |
+ cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode); |
|
766 |
+ idx = cache->last++; |
|
767 |
+ if(idx >= cache->len) { |
|
768 |
+ cache->len += 16; |
|
769 |
+ cache->tab = cli_realloc(cache->tab, cache->len*sizeof(cache->tab[0])); |
|
770 |
+ if(!cache->tab) { |
|
771 |
+ cli_dbgmsg("!Out of mem in iconv-pool\n"); |
|
772 |
+ errno = ENOMEM; |
|
773 |
+ return (iconv_t)-1; |
|
774 |
+ } |
|
775 |
+ } |
|
776 |
+ |
|
777 |
+ hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx); |
|
778 |
+ cache->tab[idx] = iconv_open("UTF-16BE",(const char*)fromcode); |
|
779 |
+ cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); |
|
780 |
+ return cache->tab[idx]; |
|
781 |
+} |
|
782 |
+ |
|
783 |
+ |
|
616 | 784 |
/* tmp_m_area and conv->out_area are of size maxlen */ |
617 | 785 |
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen) |
618 | 786 |
{ |
... | ... |
@@ -638,7 +829,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
638 | 638 |
size_t rc, inleft; |
639 | 639 |
ssize_t i; |
640 | 640 |
|
641 |
- char alignfix; |
|
641 |
+ signed char alignfix; |
|
642 | 642 |
|
643 | 643 |
/* move whatever left in conv->tmp_area to beginning */ |
644 | 644 |
if(tmp_move) |
... | ... |
@@ -654,16 +845,16 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
654 | 654 |
conv->out_area.offset = 0; |
655 | 655 |
|
656 | 656 |
tmpbuff = conv->tmp_area.buffer; |
657 |
+ inleft = conv->tmp_area.length; |
|
657 | 658 |
if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */ |
658 | 659 |
memcpy( conv->bom, tmpbuff, 4); |
659 | 660 |
process_bom(conv); |
660 | 661 |
process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT); |
661 |
- output_first(conv,&out,&tmpbuff); |
|
662 |
+ output_first(conv,&out,&tmpbuff,&inleft); |
|
662 | 663 |
conv->bom_cnt++; |
663 | 664 |
} |
664 | 665 |
|
665 | 666 |
/* convert encoding conv->tmp_area. conv->out_area */ |
666 |
- inleft = conv->tmp_area.length; |
|
667 | 667 |
alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, |
668 | 668 |
and we are using ucs4, ditto for utf16, and 1 byte*/ |
669 | 669 |
inleft -= alignfix; |
... | ... |
@@ -676,24 +867,30 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
676 | 676 |
alignfix = -inleft; |
677 | 677 |
} |
678 | 678 |
|
679 |
- iconv_struct = iconv_open("UTF-16BE",encoding_name(conv->encoding)); |
|
679 |
+ iconv_struct = iconv_open_cached(encoding_name(conv->encoding)); |
|
680 | 680 |
|
681 | 681 |
if(iconv_struct == (iconv_t)-1) { |
682 | 682 |
cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding)); |
683 |
+ /* message shown only once/file */ |
|
683 | 684 |
/* what can we do? just fall back for it being an ISO-8859-1 */ |
684 |
- iconv_struct = iconv_open("UTF-16BE","ISO-8859-1"); |
|
685 |
+ free(conv->encoding); |
|
686 |
+ conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1"); |
|
687 |
+ iconv_struct = iconv_open_cached(conv->encoding); |
|
685 | 688 |
if(iconv_struct == (iconv_t)-1) { |
686 | 689 |
cli_dbgmsg("fallback failed... bail out\n"); |
687 | 690 |
return cli_readline(NULL,&conv->tmp_area,maxlen); |
688 | 691 |
} |
689 | 692 |
} |
690 | 693 |
|
691 |
- if(inleft) /* iconv doesn't like inleft to be 0 */ |
|
694 |
+ if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ { |
|
692 | 695 |
rc = iconv(iconv_struct, (char**) &tmpbuff, &inleft, (char**) &out, &outleft); |
696 |
+ } |
|
693 | 697 |
else |
694 | 698 |
rc = 0; |
695 | 699 |
|
696 |
- iconv_close(iconv_struct); |
|
700 |
+#if 0 |
|
701 |
+ iconv_close(iconv_struct);/* - don't close, we are using a cached instance */ |
|
702 |
+#endif |
|
697 | 703 |
|
698 | 704 |
if(rc==(size_t)-1 && errno != E2BIG) { |
699 | 705 |
cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft); |
... | ... |
@@ -705,20 +902,31 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
705 | 705 |
} |
706 | 706 |
|
707 | 707 |
conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0); |
708 |
- conv->out_area.length = out - conv->out_area.buffer; |
|
708 |
+ conv->out_area.length = out - conv->out_area.buffer - out_move; |
|
709 | 709 |
|
710 | 710 |
conv->tmp_area.offset = tmpbuff - conv->tmp_area.buffer; |
711 | 711 |
conv->tmp_area.length += conv->tmp_area.offset; |
712 | 712 |
|
713 | 713 |
|
714 | 714 |
/* move whatever left in conv->norm_area to beginning */ |
715 |
- if(norm_move) |
|
715 |
+ if(norm_move) { |
|
716 |
+ if(norm_move < conv->buffer_size/2) { |
|
716 | 717 |
memmove(conv->norm_area.buffer, conv->norm_area.buffer + conv->norm_area.offset, norm_move); |
717 | 718 |
conv->norm_area.offset = 0; |
719 |
+ norm = conv->norm_area.buffer + norm_move; |
|
720 |
+ } |
|
721 |
+ else { |
|
722 |
+ /* don't modify offset here */ |
|
723 |
+ norm = conv->norm_area.buffer + conv->norm_area.length; |
|
724 |
+ } |
|
725 |
+ } |
|
726 |
+ else { |
|
727 |
+ conv->norm_area.offset = 0; |
|
728 |
+ norm = conv->norm_area.buffer; |
|
729 |
+ } |
|
718 | 730 |
|
719 | 731 |
/* now do the real normalization */ |
720 | 732 |
out = conv->out_area.buffer;/* skip over utf16 bom, FIXME: check if iconv really outputted a BOM */ |
721 |
- norm = conv->norm_area.buffer + norm_move; |
|
722 | 733 |
norm_end = conv->norm_area.buffer + conv->buffer_size; |
723 | 734 |
if(conv->out_area.length>0 && out[0] == 0xFF && out[1] == 0xFE) |
724 | 735 |
i = 2; |
... | ... |
@@ -727,10 +935,12 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
727 | 727 |
for(; i < conv->out_area.length; i += 2) { |
728 | 728 |
uint16_t u16 = ( ((uint16_t)out[i]) << 8 ) | out[i+1]; |
729 | 729 |
if(!u16) { |
730 |
- if(alignfix >= 0) /* if alignfix is negative, this 0 byte is on-purpose, its padding */ |
|
730 |
+ if(alignfix >= 0 && !conv->msg_zero_shown) /* if alignfix is negative, this 0 byte is on-purpose, its padding */ { |
|
731 |
+ conv->msg_zero_shown = 1; |
|
731 | 732 |
cli_dbgmsg("Skipping null character in html stream\n"); |
732 | 733 |
} |
733 |
- else if(u16 < 0x80) { |
|
734 |
+ } |
|
735 |
+ else if((u16 < 0x80 && u16 >= 0x20) || u16 == 0x0d || u16 == 0x0a) { |
|
734 | 736 |
if(norm >= norm_end) |
735 | 737 |
break; |
736 | 738 |
if((unsigned char)u16 ==0) |
... | ... |
@@ -753,6 +963,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, |
753 | 753 |
} |
754 | 754 |
} |
755 | 755 |
conv->out_area.offset = i; /* so that we can resume next time from here */ |
756 |
+ |
|
756 | 757 |
conv->norm_area.length = norm - conv->norm_area.buffer; |
757 | 758 |
/* |
758 | 759 |
conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS |
... | ... |
@@ -1199,7 +1199,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
1199 | 1199 |
tag_val[tag_val_length++] = value; /* store encoded values too */ |
1200 | 1200 |
} |
1201 | 1201 |
|
1202 |
- if(value < 0x80) |
|
1202 |
+ if((value < 0x80 && value >= 0x20) || value == 0x0d || value == 0x0a) |
|
1203 | 1203 |
html_output_c(file_buff_o1, file_buff_o2, tolower(value)); |
1204 | 1204 |
else { |
1205 | 1205 |
unsigned char buff[10]; |