Browse code

apply entconv patch from Edwin

git-svn: trunk@2675

Tomasz Kojm authored on 2007/02/08 01:23:40
Showing 5 changed files
... ...
@@ -1,3 +1,11 @@
1
+Wed Feb  7 17:20:12 CET 2007 (tk)
2
+---------------------------------
3
+  * libclamav: apply entconv patch from Edwin:
4
+		- workaround sarge libc leak using a per-thread cache
5
+		- normalize <0x20 chars too
6
+		- fix utf-16 double-decoding
7
+		- fix performance issue with some encodings
8
+
1 9
 Sun Feb  4 17:58:16 CET 2007 (tk)
2 10
 ---------------------------------
3 11
   * libclamav: remove some warnings from gcc
... ...
@@ -23,7 +23,6 @@
23 23
 #define _ENCODING_ALIASES_H
24 24
 #include "clamav-config.h"
25 25
 
26
-#ifndef HAVE_ICONV_H
27 26
 
28 27
 #include <stdio.h>
29 28
 #include "hashtab.h"
... ...
@@ -90,4 +89,3 @@ const struct hashtable aliases_htable = {
90 90
 };
91 91
 
92 92
 #endif
93
-#endif
... ...
@@ -32,6 +32,11 @@
32 32
 #include <dirent.h>
33 33
 #include <errno.h>
34 34
 
35
+
36
+#ifdef CL_THREAD_SAFE
37
+#include <pthread.h>
38
+#endif
39
+
35 40
 #include "clamav.h"
36 41
 #include "others.h"
37 42
 #include "htmlnorm.h"
... ...
@@ -41,9 +46,9 @@
41 41
 
42 42
 #ifdef HAVE_ICONV_H
43 43
 #include <iconv.h>
44
-#else
45
-#include "encoding_aliases.h"
46 44
 #endif
45
+#include "encoding_aliases.h"
46
+
47 47
 
48 48
 #define MAX_LINE 1024
49 49
 
... ...
@@ -136,6 +141,7 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
136 136
 		}
137 137
 
138 138
 		conv->ht = &entities_htable;
139
+		conv->msg_zero_shown = 0;
139 140
 
140 141
 		return 0;
141 142
 	}
... ...
@@ -143,55 +149,57 @@ int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding
143 143
 		return CL_ENULLARG;
144 144
 }
145 145
 
146
-#ifndef HAVE_ICONV_H
147
-
148
-typedef struct {
149
-	enum encodings encoding;
150
-	size_t size;
151
-} * iconv_t;
152
-
153
-static iconv_t iconv_open(const char *tocode, const char *fromcode)
146
+static size_t encoding_bytes(const unsigned char* fromcode, enum encodings* encoding)
154 147
 {
155 148
 	const unsigned char* from = (const unsigned char*) fromcode;
156
-	iconv_t iconv = cli_malloc(sizeof(*iconv));
157
-	if(!iconv)
158
-		return NULL;
159
-	iconv->encoding = E_OTHER;
160
-	iconv->size = 1;
161
-	/*TODO: check that tocode is UTF16BE */
162 149
 	/* special case for these unusual byteorders */
150
+	*encoding=E_OTHER;
163 151
 	if(from == UCS4_2143)
164
-		iconv->encoding = E_UCS4_2134;
152
+		*encoding = E_UCS4_2134;
165 153
 	else if (from == UCS4_3412)
166
-		iconv->encoding = E_UCS4_3412;
154
+		*encoding = E_UCS4_3412;
167 155
 	else {
168
-		struct element * e = hashtab_find(&aliases_htable,from,strlen(fromcode));
156
+		struct element * e = hashtab_find(&aliases_htable,from,strlen((const char*)fromcode));
169 157
 		if(e && e->key) {
170
-			iconv->encoding = e->data;
158
+			*encoding = e->data;
171 159
 		}
172 160
 	}
173 161
 
174
-	switch(iconv->encoding) {
162
+	switch(*encoding) {
175 163
 		case E_UCS4:
176 164
 		case E_UCS4_1234:
177 165
 		case E_UCS4_4321:
178 166
 		case E_UCS4_2134:
179 167
 		case E_UCS4_3412:
180
-			iconv->size = 4;
181
-			break;
168
+			return 4;
182 169
 		case E_UTF16:
183 170
 		case E_UTF16_BE:
184 171
 		case E_UTF16_LE:
185
-			iconv->size = 2;
186
-			break;
172
+			return 2;
187 173
 		case E_UTF8:
188 174
 		case E_UNKNOWN:
189 175
 		case E_OTHER:
190 176
 		default:
191
-			iconv->size = 1;
177
+			return 1;
192 178
 	}
179
+	}
180
+
181
+#ifndef HAVE_ICONV_H
182
+typedef struct {
183
+	enum encodings encoding;
184
+	size_t size;
185
+} * iconv_t;
186
+
187
+static iconv_t iconv_open(const char *tocode, const char* fromcode)
188
+{
189
+	iconv_t iconv = cli_malloc(sizeof(*iconv));
190
+	if(!iconv)
191
+		return NULL;
192
+	/* TODO: check that tocode is UTF16BE */
193
+	iconv->size = encoding_bytes(fromcode,&iconv->encoding);
193 194
 	return iconv;
194 195
 }
196
+}
195 197
 
196 198
 static int iconv_close(iconv_t cd)
197 199
 {
... ...
@@ -379,6 +387,10 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
379 379
 	return  0;
380 380
 }
381 381
 
382
+#else
383
+
384
+
385
+
382 386
 #endif
383 387
 
384 388
 /* new iconv() version */
... ...
@@ -495,18 +507,20 @@ static unsigned char* normalize_encoding(const unsigned char* enc)
495 495
 	return norm;
496 496
 }
497 497
 
498
-static const char* encoding_name(unsigned char* encoding)
498
+static const unsigned char* encoding_name(unsigned char* encoding)
499 499
 {
500 500
 	if(!encoding)
501
-		return "ISO-8859-1";
501
+		return (const unsigned char*)"ISO-8859-1";
502 502
 	else
503
-		return (char*)encoding;
503
+		return encoding;
504 504
 }
505 505
 
506
-
507
-
508 506
 void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
509 507
 {
508
+	unsigned char *tmp_encoding;
509
+	enum encodings tmp;
510
+	size_t new_size,old_size;
511
+
510 512
 	cli_dbgmsg("Setting encoding for %x  to %s, priority: %d\n",conv, encoding, prio);
511 513
 	if(encoding == OTHER)
512 514
 		return;
... ...
@@ -514,8 +528,17 @@ void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding
514 514
 		return;/* Content-type in header is highest priority, no overrides possible*/
515 515
 	if(conv->priority ==  BOM && prio == NOBOM_AUTODETECT)
516 516
 		return;
517
+
518
+	tmp_encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
519
+	old_size = encoding_bytes(conv->encoding,&tmp);
520
+	new_size = encoding_bytes(tmp_encoding,&tmp);
521
+	if(old_size != new_size)  {
522
+		cli_dbgmsg("process_encoding_set: refusing to override encoding - new encoding size differs: %s(%ld) != %s(%ld)\n",conv->encoding,old_size,tmp_encoding,new_size);
523
+		free(tmp_encoding);
524
+		return;
525
+	}
517 526
 	free(conv->encoding);
518
-	conv->encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
527
+	conv->encoding = tmp_encoding;
519 528
 	cli_dbgmsg("New encoding for %x:%s\n",conv,conv->encoding);
520 529
 	/* reset stream */
521 530
 }
... ...
@@ -595,24 +618,192 @@ static size_t read_raw(FILE *stream, m_area_t *m_area, unsigned int max_len, uns
595 595
 	}
596 596
 }
597 597
 
598
-static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in)
598
+static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in,size_t* inleft)
599 599
 {
600 600
 	if(conv->has_bom) {
601 601
 		switch(conv->enc_bytes) {
602 602
 			case 1:
603
-				if(conv->autodetected == UTF8) 
603
+				if(conv->autodetected == UTF8) {
604 604
 					*in += 3;
605
+					*inleft -= 3;
606
+				}
605 607
 				break;
606 608
 			case 2:
607 609
 				*in += 2;
610
+				*inleft -= 2;
608 611
 				break;
609 612
 			case 4:
610 613
 				*in += 4;
614
+				*inleft -= 4;
611 615
 				break;
612 616
 		}
613 617
 	}
614 618
 }
615 619
 
620
+/* sarge's libc leaks on iconv_open/iconv_close, so let's not open/close so many times;
621
+ * instead keep a separate pool of iconv descriptors on each thread */
622
+
623
+struct iconv_cache {
624
+	iconv_t* tab;
625
+	size_t     len;
626
+	size_t   last;
627
+	struct   hashtable hashtab;
628
+};
629
+
630
+static void iconv_cache_init(struct iconv_cache* cache)
631
+{
632
+/*	cache->tab = NULL;
633
+	cache->len = 0;
634
+	cache->used = 0; - already done by memset*/
635
+	cli_dbgmsg("Initializing iconv pool:%p\n",cache);
636
+	hashtab_init(&cache->hashtab, 32);
637
+}
638
+
639
+static void iconv_cache_destroy(struct iconv_cache* cache)
640
+{
641
+	size_t i;
642
+	cli_dbgmsg("Destroying iconv pool:%p\n",cache);
643
+	for(i=0;i < cache->last;i++) {
644
+		cli_dbgmsg("closing iconv:%p\n",cache->tab[i]);
645
+		iconv_close(cache->tab[i]);
646
+	}
647
+	hashtab_clear(&cache->hashtab);
648
+	free(cache->hashtab.htable);
649
+	free(cache->tab);
650
+	free(cache);
651
+}
652
+
653
+
654
+#ifdef CL_THREAD_SAFE
655
+static pthread_key_t iconv_pool_tls_key;
656
+static pthread_once_t iconv_pool_tls_key_once = PTHREAD_ONCE_INIT;
657
+
658
+/* Destructor called for every thread that exits via pthread_exit or cancellation. Unfortunately that does not
659
+ * include the main thread, so the cleanup has to be invoked manually for the main thread. */
660
+
661
+static int cache_atexit_registered = 0;
662
+
663
+static void iconv_pool_tls_instance_destroy(void* ptr)
664
+{
665
+	if(ptr) {
666
+		iconv_cache_destroy(ptr);
667
+	}
668
+}
669
+
670
+static void iconv_cache_cleanup_main(void)
671
+{
672
+	struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key);
673
+	if(cache) {
674
+		iconv_pool_tls_instance_destroy(cache);
675
+		pthread_setspecific(iconv_pool_tls_key,NULL);
676
+	}
677
+	pthread_key_delete(iconv_pool_tls_key);
678
+}
679
+
680
+static void iconv_pool_tls_key_alloc(void)
681
+{
682
+	pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy);
683
+	if(!cache_atexit_registered) {
684
+		cli_dbgmsg("iconv:registering atexit\n");
685
+		if(atexit(iconv_cache_cleanup_main)) {
686
+			cli_dbgmsg("failed to register atexit\n");
687
+		}
688
+		cache_atexit_registered = 1;
689
+	}
690
+}
691
+
692
+static void init_iconv_pool_ifneeded(void)
693
+{
694
+	pthread_once(&iconv_pool_tls_key_once, iconv_pool_tls_key_alloc);
695
+}
696
+
697
+static inline struct iconv_cache* cache_get_tls_instance(void)
698
+{
699
+	struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key);
700
+	if(!cache) {
701
+		cache = cli_calloc(1,sizeof(*cache));
702
+		if(!cache) {
703
+			cli_dbgmsg("!Out of memory allocating TLS iconv instance\n");
704
+			return NULL;
705
+		}
706
+		iconv_cache_init(cache);
707
+		pthread_setspecific(iconv_pool_tls_key, cache);
708
+	}
709
+	return cache;
710
+}
711
+
712
+#else
713
+
714
+static struct iconv_cache* global_iconv_cache = NULL;
715
+static int    iconv_global_inited = 0;
716
+
717
+
718
+static void iconv_cache_cleanup_main(void)
719
+{
720
+	iconv_cache_destroy(global_iconv_cache);
721
+}
722
+
723
+static inline void init_iconv_pool_ifneeded() 
724
+{
725
+	if(!iconv_global_inited) {
726
+		global_iconv_cache = cli_calloc(1,sizeof(*global_iconv_cache));
727
+		if(global_iconv_cache) {
728
+			iconv_cache_init(global_iconv_cache);
729
+			atexit(iconv_cache_cleanup_main);
730
+			iconv_global_inited = 1;
731
+		}
732
+	}
733
+}
734
+
735
+
736
+static inline struct iconv_cache* cache_get_tls_instance(void)
737
+{
738
+	return global_iconv_cache;
739
+}
740
+
741
+#endif
742
+
743
+static iconv_t iconv_open_cached(const unsigned char* fromcode)
744
+{
745
+	struct iconv_cache * cache;
746
+	size_t idx;
747
+	const size_t fromcode_len = strlen((const char*)fromcode);
748
+	struct element * e;
749
+
750
+	init_iconv_pool_ifneeded();
751
+	cache = cache_get_tls_instance();/* gets TLS iconv pool */
752
+	if(!cache) {
753
+		cli_dbgmsg("!Unable to get TLS iconv cache!\n");
754
+		errno = EINVAL;
755
+		return (iconv_t)-1;
756
+	}
757
+
758
+	e = hashtab_find(&cache->hashtab, fromcode, fromcode_len);
759
+	if(e && (e->data < 0 || (size_t)e->data > cache->len)) {
760
+		e = NULL;
761
+	}
762
+	if(e) {
763
+		return cache->tab[e->data];
764
+	}
765
+	cli_dbgmsg("iconv not found in cache, for encoding:%s\n",fromcode);
766
+	idx = cache->last++;
767
+	if(idx >= cache->len) {
768
+		cache->len += 16;
769
+		cache->tab = cli_realloc(cache->tab, cache->len*sizeof(cache->tab[0]));
770
+		if(!cache->tab) {
771
+			cli_dbgmsg("!Out of mem in iconv-pool\n");
772
+			errno = ENOMEM;
773
+			return (iconv_t)-1;
774
+		}
775
+	}
776
+
777
+	hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx);
778
+	cache->tab[idx] = iconv_open("UTF-16BE",(const char*)fromcode);
779
+	cli_dbgmsg("iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]);
780
+	return cache->tab[idx];
781
+}
782
+
783
+
616 784
 /* tmp_m_area and conv->out_area are of size maxlen */
617 785
 unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
618 786
 {
... ...
@@ -638,7 +829,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
638 638
 		size_t rc, inleft;
639 639
 		ssize_t i;
640 640
 
641
-		char alignfix;
641
+		signed char alignfix;
642 642
 
643 643
 		/* move whatever left in conv->tmp_area to beginning */
644 644
 		if(tmp_move)
... ...
@@ -654,16 +845,16 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
654 654
 		conv->out_area.offset = 0;
655 655
 
656 656
 		tmpbuff = conv->tmp_area.buffer;
657
+		inleft = conv->tmp_area.length;
657 658
 		if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */
658 659
 			memcpy( conv->bom, tmpbuff, 4);
659 660
 			process_bom(conv);
660 661
 			process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT);
661
-			output_first(conv,&out,&tmpbuff);
662
+			output_first(conv,&out,&tmpbuff,&inleft);
662 663
 			conv->bom_cnt++;
663 664
 		}
664 665
 
665 666
 		/* convert encoding: from conv->tmp_area into conv->out_area */
666
-		inleft = conv->tmp_area.length;
667 667
 		alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, 
668 668
 				       and we are using ucs4, ditto for utf16, and 1 byte*/
669 669
 		inleft -= alignfix;
... ...
@@ -676,24 +867,30 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
676 676
 			alignfix = -inleft;
677 677
 		}
678 678
 
679
-		iconv_struct = iconv_open("UTF-16BE",encoding_name(conv->encoding));
679
+		iconv_struct = iconv_open_cached(encoding_name(conv->encoding));
680 680
 
681 681
 		if(iconv_struct == (iconv_t)-1) {
682 682
 			cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding));
683
+			/* message shown only once/file */
683 684
 			/* what can we do? just fall back for it being an ISO-8859-1 */
684
-			iconv_struct = iconv_open("UTF-16BE","ISO-8859-1");
685
+		        free(conv->encoding);
686
+			conv->encoding = (unsigned char*) cli_strdup("ISO-8859-1");
687
+			iconv_struct = iconv_open_cached(conv->encoding);
685 688
 			if(iconv_struct == (iconv_t)-1) {
686 689
 				cli_dbgmsg("fallback failed... bail out\n");
687 690
 				return cli_readline(NULL,&conv->tmp_area,maxlen);
688 691
 			}
689 692
 		}
690 693
 
691
-		if(inleft) /* iconv doesn't like inleft to be 0 */
694
+		if(inleft && outleft > conv->buffer_size/2 ) /* iconv doesn't like inleft to be 0 */ {
692 695
 			rc = iconv(iconv_struct, (char**) &tmpbuff,  &inleft, (char**) &out, &outleft);	
696
+		}
693 697
 		else
694 698
 			rc = 0;
695 699
 
696
-		iconv_close(iconv_struct);
700
+#if 0
701
+		 iconv_close(iconv_struct);/* - don't close, we are using a cached instance */
702
+#endif
697 703
 
698 704
 		if(rc==(size_t)-1 && errno != E2BIG) {
699 705
 				cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft);
... ...
@@ -705,20 +902,31 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
705 705
 		}
706 706
 
707 707
 		conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0);
708
-		conv->out_area.length = out - conv->out_area.buffer;
708
+		conv->out_area.length = out - conv->out_area.buffer - out_move;
709 709
 
710 710
 		conv->tmp_area.offset = tmpbuff - conv->tmp_area.buffer;
711 711
 		conv->tmp_area.length += conv->tmp_area.offset;
712 712
 
713 713
 
714 714
 		/* move whatever is left in conv->norm_area to the beginning */
715
-		if(norm_move)
715
+		if(norm_move) {
716
+			if(norm_move < conv->buffer_size/2) {
716 717
 			memmove(conv->norm_area.buffer, conv->norm_area.buffer + conv->norm_area.offset, norm_move);
717 718
 		conv->norm_area.offset = 0;
719
+				norm = conv->norm_area.buffer + norm_move;
720
+			}
721
+			else {
722
+				/* don't modify offset here */
723
+				norm = conv->norm_area.buffer + conv->norm_area.length;
724
+			}
725
+		}
726
+		else {
727
+			conv->norm_area.offset = 0;
728
+			norm = conv->norm_area.buffer;	
729
+		}
718 730
 
719 731
 		/* now do the real normalization */
720 732
 		out = conv->out_area.buffer;/* skip over utf16 bom, FIXME: check if iconv really outputted a BOM */
721
-		norm = conv->norm_area.buffer + norm_move;
722 733
 		norm_end = conv->norm_area.buffer + conv->buffer_size;
723 734
 		if(conv->out_area.length>0 && out[0] == 0xFF && out[1] == 0xFE)
724 735
 			i = 2;
... ...
@@ -727,10 +935,12 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
727 727
 		for(; i < conv->out_area.length; i += 2) {
728 728
 			uint16_t u16 = ( ((uint16_t)out[i]) << 8 ) | out[i+1];
729 729
 			if(!u16) {
730
-				if(alignfix >= 0) /* if alignfix is negative, this 0 byte is on-purpose, its padding */
730
+				if(alignfix >= 0 && !conv->msg_zero_shown) /* if alignfix is negative, this 0 byte is on-purpose, its padding */ {
731
+					conv->msg_zero_shown = 1;
731 732
 					cli_dbgmsg("Skipping null character in html stream\n");
732 733
 			}
733
-			else if(u16 < 0x80) {
734
+			}
735
+			else if((u16 < 0x80 && u16 >= 0x20) || u16 == 0x0d || u16 == 0x0a) {
734 736
 				if(norm >= norm_end)
735 737
 					break;
736 738
 				if((unsigned char)u16 ==0)
... ...
@@ -753,6 +963,7 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
753 753
 			}	
754 754
 		}
755 755
 		conv->out_area.offset = i; /* so that we can resume next time from here */
756
+
756 757
 		conv->norm_area.length = norm - conv->norm_area.buffer;
757 758
 /*
758 759
 		conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS
... ...
@@ -72,6 +72,7 @@ struct entity_conv {
72 72
 	m_area_t tmp_area;
73 73
 	m_area_t out_area;
74 74
 	m_area_t norm_area;
75
+	int      msg_zero_shown;
75 76
 };
76 77
 
77 78
 
... ...
@@ -1199,7 +1199,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1199 1199
 					tag_val[tag_val_length++] = value; /* store encoded values too */
1200 1200
 					}
1201 1201
 
1202
-					if(value < 0x80)
1202
+					if((value < 0x80 && value >= 0x20) || value == 0x0d || value == 0x0a)
1203 1203
 						html_output_c(file_buff_o1, file_buff_o2, tolower(value));
1204 1204
 					else {
1205 1205
 						unsigned char buff[10];