Browse code

iconv: fix incorrect resuming on invalid UTF8 character

git-svn: trunk@2586

Tomasz Kojm authored on 2007/01/01 01:32:01
Showing 3 changed files
... ...
@@ -1,3 +1,8 @@
1
+Sun Dec 31 17:29:11 CET 2006 (tk)
2
+---------------------------------
3
+  * libclamav: iconv: fix incorrect resuming on invalid UTF8 character,
4
+	       reported by nitrox <mail*nerdbase.de> (bb#215, patch by Edwin)
5
+
1 6
 Sat Dec 30 17:10:42 GMT 2006 (njh)
2 7
 ----------------------------------
3 8
   * libclamav/mbox.c:	Fix compilation error on Solaris10 (reported by Andy
... ...
@@ -297,50 +297,70 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
297 297
 			{
298 298
 				const size_t maxread  = *inbytesleft;
299 299
 				const size_t maxwrite = *outbytesleft;
300
-				uint16_t* out = (uint16_t*) output;
301 300
 				size_t j;
302 301
 				for(i=0,j=0 ; i < maxread && j < maxwrite;) {
303 302
 					if(input[i] < 0x7F)  {
304
-						out[j++] = input[i++];
303
+						output[j++] = 0;
304
+						output[j++] = input[i++];
305 305
 							}
306 306
 					else if( (input[i]&0xE0) == 0xC0 ) {
307 307
 						if ((input[i+1]&0xC0) == 0x80) {
308 308
 							/* 2 bytes long 110yyyyy zzzzzzzz -> 00000yyy yyzzzzzz*/
309
+							output[j++] = ((input[i] & 0x1F) >> 2) & 0x07;
309 310
 							output[j++] = ((input[i] & 0x1F) << 6) | (input[i+1] & 0x3F);
310 311
 						}
311
-						else
312
+						else {
312 313
 							cli_dbgmsg("invalid UTF8 character encountered\n");
314
+							break;
315
+						}
313 316
 						i+=2;
314 317
 					}
315 318
 					else if( (input[i]&0xE0) == 0xE0) {
316 319
 						if( (input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80) {
317 320
 							/* 3 bytes long 1110xxxx 10yyyyyy 10zzzzzzzz -> xxxxyyyy yyzzzzzz*/
318
-							output[j++] = ((input[i] & 0x0F) << 12) | ((input[i+1] & 0x3F)<<6) | (input[i+2] & 0x3F);
321
+							output[j++] = (input[i] << 4) | ((input[i+1] >> 2) & 0x0F);
322
+							output[j++] = (input[i+1] << 6) | (input[i+2] & 0x3F);
319 323
 						}
320
-						else
324
+						else {
321 325
 							cli_dbgmsg("invalid UTF8 character encountered\n");
326
+							break;
327
+						}
322 328
 						i+=3;
323 329
 					}
324 330
 					else if( (input[i]&0xF8) == 0xF0) {
325 331
 						if((input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80 && (input[i+3]&0xC0) == 0x80) {
326 332
 							/* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/
327
-							output[j++] = ((input[i] & 0x07) << 18) | ((input[i+1] & 0x3F)<<12) | ((input[i+2] & 0x3F) <<6) | (input[i+3] & 0x3F);
333
+							cli_dbgmsg("UTF8 character out of UTF16 range encountered");
334
+							output[j++] = 0xff;
335
+							output[j++] = 0xff;
336
+
337
+							/*out[j++] = ((input[i] & 0x07) << 2) | ((input[i+1] >> 4) & 0x3);
338
+							out[j++] = (input[i+1] << 4) | ((input[i+2] >> 2) & 0x0F);
339
+							out[j++] = (input[i+2] << 6) | (input[i+2] & 0x3F);*/
328 340
 						}
329
-						else
341
+						else {
330 342
 							cli_dbgmsg("invalid UTF8 character encountered\n");
343
+							break;
344
+						}
331 345
 						i+=4;
332 346
 					}
333 347
 					else {
334
-						i++;
335 348
 						cli_dbgmsg("invalid UTF8 character encountered\n");
349
+						break;
336 350
 					}							
337 351
 				}
338 352
 				*inbytesleft -= i;
339 353
 				*outbytesleft -= j;
340 354
 				*inbuf += i;
341 355
 				*outbuf += j;
342
-				if(*inbytesleft)
343
-					return E2BIG;
356
+				if(*inbytesleft && *outbytesleft) {
357
+					errno = EILSEQ;/* we had an early exit */
358
+					return -1;
359
+				}
360
+				if(*inbytesleft) {
361
+					errno = E2BIG;
362
+					return -1;
363
+				}
344 364
 				return 0;
345 365
 			}
346 366
 	}
... ...
@@ -349,8 +369,10 @@ static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
349 349
 	*inbytesleft  -= maxcopy;
350 350
 	*inbuf += maxcopy;
351 351
 	*outbuf += maxcopy;
352
-	if(*inbytesleft)
353
-		return E2BIG;
352
+	if(*inbytesleft) {
353
+		errno = E2BIG;
354
+		return -1;
355
+	}
354 356
 	return  0;
355 357
 }
356 358
 
... ...
@@ -671,8 +693,12 @@ unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in,
671 671
 		iconv_close(iconv_struct);
672 672
 
673 673
 		if(rc==(size_t)-1 && errno != E2BIG) {
674
-				cli_dbgmsg("iconv error:%s, silently resuming\n",strerror(errno));
675
-				return cli_readline(NULL, &conv->tmp_area, maxlen);
674
+				cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft);
675
+				/* output raw byte, and resume at next byte */
676
+				*out++ = 0;
677
+				*out++ = *tmpbuff++;
678
+				inleft--;
679
+/*				return cli_readline(NULL, &conv->norm_area, maxlen);*/
676 680
 		}
677 681
 
678 682
 		conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0);
... ...
@@ -35,6 +35,8 @@ typedef struct m_area_tag {
35 35
 	off_t offset;
36 36
 } m_area_t;
37 37
 
38
+
39
+unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len);
38 40
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs);
39 41
 int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs);
40 42
 void html_tag_arg_free(tag_arguments_t *tags);