GitList

libclamav/entconv.c

4bdf6efd
4ac11cb0	output[j++] = 0; output[j++] = input[i++];
4bdf6efd	} else if( (input[i]&0xE0) == 0xC0 ) { if ((input[i+1]&0xC0) == 0x80) { /* 2 bytes long 110yyyyy zzzzzzzz -> 00000yyy yyzzzzzz*/
4ac11cb0	output[j++] = ((input[i] & 0x1F) >> 2) & 0x07;
4bdf6efd	output[j++] = ((input[i] & 0x1F) << 6) \| (input[i+1] & 0x3F); }
4ac11cb0	else {
4bdf6efd	cli_dbgmsg("invalid UTF8 character encountered\n");
4ac11cb0	break; }
4bdf6efd	i+=2; } else if( (input[i]&0xE0) == 0xE0) { if( (input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80) { /* 3 bytes long 1110xxxx 10yyyyyy 10zzzzzzzz -> xxxxyyyy yyzzzzzz*/
4ac11cb0	output[j++] = (input[i] << 4) \| ((input[i+1] >> 2) & 0x0F); output[j++] = (input[i+1] << 6) \| (input[i+2] & 0x3F);
4bdf6efd	}
4ac11cb0	else {
4bdf6efd	cli_dbgmsg("invalid UTF8 character encountered\n");
4ac11cb0	break; }
4bdf6efd	i+=3; } else if( (input[i]&0xF8) == 0xF0) { if((input[i+1]&0xC0) == 0x80 && (input[i+2]&0xC0) == 0x80 && (input[i+3]&0xC0) == 0x80) { /* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/
4ac11cb0	cli_dbgmsg("UTF8 character out of UTF16 range encountered"); output[j++] = 0xff; output[j++] = 0xff; /out[j++] = ((input[i] & 0x07) << 2) \| ((input[i+1] >> 4) & 0x3); out[j++] = (input[i+1] << 4) \| ((input[i+2] >> 2) & 0x0F); out[j++] = (input[i+2] << 6) \| (input[i+2] & 0x3F);/
4bdf6efd	}
4ac11cb0	else {
4bdf6efd	cli_dbgmsg("invalid UTF8 character encountered\n");
4ac11cb0	break; }
4bdf6efd	i+=4; } else { cli_dbgmsg("invalid UTF8 character encountered\n");
4ac11cb0	break;
4bdf6efd	} } inbytesleft -= i; outbytesleft -= j; inbuf += i; outbuf += j;
4ac11cb0	if(inbytesleft && outbytesleft) { errno = EILSEQ;/* we had an early exit / return -1; } if(inbytesleft) { errno = E2BIG; return -1; }
4bdf6efd	return 0; } } outbytesleft -= maxcopy; inbytesleft -= maxcopy; inbuf += maxcopy; outbuf += maxcopy;
4ac11cb0	if(*inbytesleft) { errno = E2BIG; return -1; }
4bdf6efd	return 0; } #endif /* new iconv() version / static inline void process_bom(struct entity_conv conv) { const unsigned char* bom = conv->bom; const unsigned char* encoding = OTHER; int has_bom = 0; uint8_t enc_bytes = 4;/* default is UTF8, which has a maximum of 4 bytes/ switch(bom[0]) { case 0x00: if(bom[1] == 0x00) { if(bom[2] == 0xFE && bom[3] == 0xFF) { encoding = UCS4_1234;/ UCS-4 big-endian/ has_bom = 1; } else if(bom[2] == 0xFF && bom[3] == 0xFE) { encoding = UCS4_2143;/ UCS-4 unusual order 2143 / has_bom = 1; } else if(bom[2] == 0x00 && bom[3] == 0x3C) { encoding = UNDECIDED_32_1234; } else if(bom[2] == 0x3C && bom[3] == 0x00) { encoding = UNDECIDED_32_2143; } }/ 0x00 0x00 / else if(bom[1] == 0x3C) { if(bom[2] == 0x00) { if(bom[3] == 0x00) { encoding = UNDECIDED_32_3412; } else if(bom[3] == 0x3F) { encoding = UNDECIDED_16_BE; enc_bytes = 2; } }/0x00 0x3C 0x00/ }/0x00 0x3C/ break; case 0xFF: if(bom[1] == 0xFE) { if(bom[2] == 0x00 && bom[3] == 0x00) { encoding = UCS4_4321; has_bom = 1; } else { encoding = UTF16_LE; has_bom = 1; enc_bytes = 2; } }/0xFF 0xFE/ break; case 0xFE: if(bom[1] == 0xFF) { if(bom[2] == 0x00 && bom[3] == 0x00) { encoding = UCS4_3412; has_bom = 1; } else { encoding = UTF16_BE; has_bom = 1; enc_bytes = 2; } }/0xFE 0xFF/ break; case 0xEF: if(bom[1] == 0xBB && bom[2] == 0xBF) { encoding = UTF8; has_bom = 1; /enc_bytes = 4;- default, maximum 4 bytes/ }/0xEF 0xBB 0xBF/ break; case 0x3C: if(bom[1] == 0x00) { if(bom[2] == 0x00 && bom[3] == 0x00) { encoding = UNDECIDED_32_4321; } else if(bom[2] == 0x3F && bom[3] == 0x00) { encoding = UNDECIDED_16_LE; enc_bytes = 2; } }/0x3C 0x00/ else if(bom[1] == 0x3F && bom[2] == 0x78 && bom[3]==0x6D) { encoding = UNDECIDED_8; enc_bytes = 1; }/0x3C 3F 78 6D/ break; case 0x4C: if(bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) { encoding = EBCDIC; enc_bytes = 1; }/4C 6F A7 94/ break; }/switch/ conv->autodetected = encoding; conv->enc_bytes = enc_bytes; conv->has_bom = has_bom; } static unsigned char normalize_encoding(const unsigned char* enc) { unsigned char* norm; size_t i; const size_t len = strlen((const char)enc); norm = cli_malloc( len+1); if(!norm) return NULL; if(enc == OTHER) enc = (const unsigned char)"ISO-8859-1"; for(i=0;i < strlen((const char)enc); i++) norm[i] = toupper(enc[i]); norm[len]='\0'; return norm; } static const char encoding_name(unsigned char* encoding) { if(!encoding) return "ISO-8859-1"; else return (char)encoding; } void process_encoding_set(struct entity_conv conv,const unsigned char* encoding,enum encoding_priority prio) { cli_dbgmsg("Setting encoding for %x to %s, priority: %d\n",conv, encoding, prio); if(encoding == OTHER) return; if(conv->priority == CONTENT_TYPE) return;/* Content-type in header is highest priority, no overrides possible/ if(conv->priority == BOM && prio == NOBOM_AUTODETECT) return; free(conv->encoding); conv->encoding = normalize_encoding(encoding);/ FIXME: better obey priorities/ cli_dbgmsg("New encoding for %x:%s\n",conv,conv->encoding); / reset stream / } static int encoding_norm_done(struct entity_conv conv) { if(conv->encoding) { free(conv->encoding); conv->encoding = NULL; } conv->buffer_size = 0; if(conv->tmp_area.buffer) { free(conv->tmp_area.buffer); conv->tmp_area.buffer = NULL; } if(conv->out_area.buffer) { free(conv->out_area.buffer); conv->out_area.buffer = NULL; } if(conv->norm_area.buffer) { free(conv->norm_area.buffer); conv->norm_area.buffer = NULL; } return 0; } int entity_norm_done(struct entity_conv* conv) { return encoding_norm_done(conv); } static size_t read_raw(FILE stream, m_area_t m_area, unsigned int max_len, unsigned char* outbuff) { /* Try and use the memory buffer first / if (m_area) { size_t area_maxcopy; const unsigned char src; size_t copied; if(m_area->offset >= m_area->length) return 0; area_maxcopy = (m_area->length > m_area->offset + max_len) ? max_len : m_area->length - m_area->offset; src = m_area->buffer + m_area->offset; m_area->offset += area_maxcopy; copied = area_maxcopy; while(area_maxcopy && src != '\n') { outbuff++ = src++; area_maxcopy--; } if(area_maxcopy > 3) { /copy 3 more bytes, just in case its ucs4 / outbuff++ = src++; outbuff++ = src++; outbuff++ = src++; area_maxcopy -= 3; } m_area->offset -= area_maxcopy; copied -= area_maxcopy; return copied; } else { if (!stream) { cli_dbgmsg("No HTML stream\n"); return 0; } else { const size_t iread = fread(outbuff, 1, max_len, stream); size_t i; if(ferror(stream)) { cli_errmsg("Error while reading HTML stream\n"); } for(i=0; i < iread; i++) if(outbuff[i] == '\n') { return i+3 > iread ? iread : i+3; } return iread; } } } static void output_first(struct entity_conv conv,unsigned char out, unsigned char in) { if(conv->has_bom) { switch(conv->enc_bytes) { case 1: if(conv->autodetected == UTF8) in += 3; break; case 2: in += 2; break; case 4: in += 4; break; } } } / tmp_m_area and conv->out_area are of size maxlen / unsigned char encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen) { if(!conv \|\| !conv->out_area.buffer \|\| !conv->tmp_area.buffer) return NULL; else { /* stream_in\|in_m_area ->(read_raw) conv->tmp_area -> (iconv) conv->out_area -> (normalize) conv->norm_area -> (cli_readline) return value/ const size_t tmp_move = conv->tmp_area.length - conv->tmp_area.offset; const size_t tmp_available = conv->buffer_size - tmp_move; const size_t max_read = maxlen < tmp_available ? maxlen : tmp_available; unsigned char tmpbuff = &conv->tmp_area.buffer[tmp_move]; const size_t out_move = conv->out_area.length < conv->out_area.offset ? 0 : conv->out_area.length - conv->out_area.offset; size_t outleft = conv->buffer_size - out_move; unsigned char* out = &conv->out_area.buffer[out_move]; const size_t norm_move = conv->norm_area.length - conv->norm_area.offset; unsigned char* norm; const unsigned char* norm_end; iconv_t iconv_struct; size_t rc, inleft; ssize_t i; char alignfix; /* move whatever left in conv->tmp_area to beginning / if(tmp_move) memmove(conv->tmp_area.buffer, conv->tmp_area.buffer + conv->tmp_area.offset, tmp_move); conv->tmp_area.offset = 0; / read raw data from stream, or in_m_area into conv->tmp_area/ conv->tmp_area.length = tmp_move + read_raw(stream_in, in_m_area, max_read, tmpbuff); / move whatever left in conv->out_area to beginning / if(out_move) memmove(conv->out_area.buffer, conv->out_area.buffer + conv->out_area.offset, out_move); conv->out_area.offset = 0; tmpbuff = conv->tmp_area.buffer; if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/ detect Byte Order Mark / memcpy( conv->bom, tmpbuff, 4); process_bom(conv); process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT); output_first(conv,&out,&tmpbuff); conv->bom_cnt++; } / convert encoding conv->tmp_area. conv->out_area / inleft = conv->tmp_area.length; alignfix = inleft%4;/ iconv gives an error if we give him 3 bytes to convert, and we are using ucs4, ditto for utf16, and 1 byte/ inleft -= alignfix; if(!inleft && alignfix) { size_t k; for(k=0;k+alignfix < 4;k++) tmpbuff[alignfix+k] = '\0'; inleft = 4; alignfix = -inleft; } iconv_struct = iconv_open("UTF-16BE",encoding_name(conv->encoding)); if(iconv_struct == (iconv_t)-1) { cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding)); / what can we do? just fall back for it being an ISO-8859-1 / iconv_struct = iconv_open("UTF-16BE","ISO-8859-1"); if(iconv_struct == (iconv_t)-1) { cli_dbgmsg("fallback failed... bail out\n"); return cli_readline(NULL,&conv->tmp_area,maxlen); } } if(inleft) / iconv doesn't like inleft to be 0 / rc = iconv(iconv_struct, (char) &tmpbuff, &inleft, (char*) &out, &outleft); else rc = 0; iconv_close(iconv_struct); if(rc==(size_t)-1 && errno != E2BIG) {
4ac11cb0	cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft); /* output raw byte, and resume at next byte / out++ = 0; out++ = tmpbuff++; inleft--; /* return cli_readline(NULL, &conv->norm_area, maxlen);*/
4bdf6efd	} conv->tmp_area.length = inleft + (alignfix > 0 ? alignfix : 0); conv->out_area.length = out - conv->out_area.buffer; conv->tmp_area.offset = tmpbuff - conv->tmp_area.buffer; conv->tmp_area.length += conv->tmp_area.offset; /* move whatever left in conv->norm_area to beginning / if(norm_move) memmove(conv->norm_area.buffer, conv->norm_area.buffer + conv->norm_area.offset, norm_move); conv->norm_area.offset = 0; / now do the real normalization / out = conv->out_area.buffer;/ skip over utf16 bom, FIXME: check if iconv really outputted a BOM / norm = conv->norm_area.buffer + norm_move; norm_end = conv->norm_area.buffer + conv->buffer_size; if(conv->out_area.length>0 && out[0] == 0xFF && out[1] == 0xFE) i = 2; else i = 0; for(; i < conv->out_area.length; i += 2) { uint16_t u16 = ( ((uint16_t)out[i]) << 8 ) \| out[i+1]; if(!u16) { if(alignfix >= 0) / if alignfix is negative, this 0 byte is on-purpose, its padding / cli_dbgmsg("Skipping null character in html stream\n"); } else if(u16 < 0x80) { if(norm >= norm_end) break; if((unsigned char)u16 ==0) cli_dbgmsg("Impossible\n"); norm++ = (unsigned char)u16; } else if (u16 == 160) {/nbsp/
b52afa4a	if(norm >= norm_end) break;
4bdf6efd	norm++ = 0x20; } else { char buff[10]; snprintf(buff,9,"&#%d;",u16); buff[9] = '\0'; if(norm + strlen(buff) >= norm_end) break; strncpy((char)norm, buff, strlen(buff)); norm += strlen(buff); } } conv->out_area.offset = i; /* so that we can resume next time from here / conv->norm_area.length = norm - conv->norm_area.buffer; / conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS if( (o =strstr(conv->norm_area.buffer,"Content")) && strstr(conv->norm_area.buffer,"text/x-")) printf("%s\n",o);/ / final cli_readline from conv->norm_area */ return cli_readline(NULL, &conv->norm_area, maxlen); } } #endif