/*
 *  HTML Entity & Encoding normalization.
 *
 *  Copyright (C) 2006 Török Edvin <edwin@clamav.net>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 *
 */
#include "clamav-config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>

#ifdef CL_THREAD_SAFE
#include <pthread.h>
#endif

#ifndef CL_DEBUG
#define NDEBUG
#endif

#include <assert.h>

#include "clamav.h"
#include "others.h"
#include "htmlnorm.h"
#include "hashtab.h"
#include "entconv.h"
#include "entitylist.h"
#include "cltypes.h"

#ifdef HAVE_ICONV
#include <iconv.h>
#endif

#include "encoding_aliases.h"

#define MODULE_NAME "entconv: "

#define MAX_LINE 1024

#ifndef EILSEQ
#define EILSEQ 84
#endif

#ifndef HAVE_ICONV
/* Handle type of the built-in iconv replacement used when the system has no iconv. */
typedef struct {
    enum encodings encoding; /* source encoding, as resolved by encoding_bytes() */
    size_t size;             /* minimum size in bytes of one unit of that encoding */
} * iconv_t;
#endif

/* lowercase hex digits used when emitting &#xNNNN; references */
static const unsigned char tohex[] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};

/* TODO: gcc refuses to inline because it consider call unlikely and code size grows */
/*
 * Writes the normalized form of one UTF-16 code unit to @out.
 *
 *  - 0 is ignored: nothing is written and @out is returned unchanged.
 *  - values below 0xff are written as a single raw byte.
 *  - all other values (0xff included) are written as the 8 byte
 *    reference "&#xNNNN;" with 4 lowercase hex digits; this needs @limit > 8.
 *
 * @limit must be positive (asserted in debug builds).
 * Returns the position just past the written bytes, or NULL when the
 * reference does not fit in @limit.
 */
static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, const ssize_t limit)
{
    assert(limit > 0 && "u16_normalize must be called with positive limit");
    /* \0 is just ignored */
    if(!u16) {
        return out;
    }

    if(u16 < 0xff) {
        assert((uint8_t)u16 != 0);
        *out++ = (uint8_t)u16;
    } else {
        size_t i;
        /* normalize only >255 to speed up */
        if(limit <= 8) {
            /* not enough space available */
            return NULL;
        }
        /* inline version of
         * out += snprintf(out, max_num_length, "&#x%x;", u16) */
        out[0] = '&';
        out[1] = '#';
        out[2] = 'x';
        out[7] = ';';
        /* fill the 4 hex digits from least to most significant nibble */
        for(i = 6; i >= 3; --i) {
            out[i] = tohex[u16 & 0xf];
            u16 >>= 4;
        }
        out += 8;
    }
    return out;
}

/*
 * Normalizes @u16 into @dst and NUL-terminates the result.
 * buffer must be at least 2 bytes in size; smaller (or NULL) buffers are
 * rejected instead of letting dst_size-1 wrap around.
 * Returns the position just past the terminating NUL, or NULL on failure.
 */
unsigned char* u16_normalize_tobuffer(uint16_t u16, unsigned char* dst, size_t dst_size)
{
    unsigned char* out;

    if(!dst || dst_size < 2) {
        return NULL;
    }
    out = u16_normalize(u16, dst, dst_size - 1);
    if(out) {
        *out++ = '\0';
        return out;
    }
    return NULL;
}

/*
 * Looks up the named @entity in entities_htable and stores its normalized,
 * NUL-terminated form in conv->entity_buff.
 * Returns conv->entity_buff on success; NULL when an argument is NULL, the
 * entity is not in the table, or the normalized form does not fit.
 */
const char* entity_norm(struct entity_conv* conv,const unsigned char* entity)
{
    struct element* e;

    if(!conv || !entity) {
        return NULL;
    }
    e = hashtab_find(&entities_htable, (const char*)entity, strlen((const char*)entity));
    if(e && e->key) {
        unsigned char* out = u16_normalize(e->data, conv->entity_buff, sizeof(conv->entity_buff)-1);
        if(out) {
            *out++ = '\0';
            return (const char*)conv->entity_buff;
        }
    }
    return NULL;
}

/* sane default, must be larger, than the longest possible return string,
 * which is
 * &#xxx;*/
#define MIN_BUFFER_SIZE 32

#define LINEMODE_LIMIT 16384

/*
 * Prepares @conv for use: resets the encoding state, starts in line-mode and
 * allocates the tmp/out/norm work buffers (@buffer_size bytes each) plus the
 * storage for the iconv handle.
 *
 * Returns 0 on success, CL_ENULLARG when @conv is NULL or @buffer_size is
 * below MIN_BUFFER_SIZE, CL_EMEM on allocation failure. On CL_EMEM every
 * pointer owned by @conv is released and set to NULL, so a later
 * entity_norm_done() on the same object is harmless.
 */
int init_entity_converter(struct entity_conv* conv, size_t buffer_size)
{
    if(buffer_size < MIN_BUFFER_SIZE) {
        cli_warnmsg("Entity converter: Supplied buffer size:%lu, smaller than minimum required: %d\n",(unsigned long)buffer_size,MIN_BUFFER_SIZE);
        return CL_ENULLARG;
    }
    if(!conv) {
        return CL_ENULLARG;
    }

    conv->encoding = NULL;
    conv->encoding_symbolic = E_UNKNOWN;
    conv->bom_cnt = 0;
    conv->buffer_size = buffer_size;
    conv->priority = NOPRIO;
    /* same defaults process_bom() starts from; keeps these fields defined
     * even if no BOM detection ever runs (e.g. very short input) */
    conv->enc_bytes = 1;
    conv->has_bom = 0;
    /* start in linemode */
    conv->linemode = 1;
    conv->linemode_processed = 0;

    conv->tmp_area.offset = 0;
    conv->tmp_area.length = 0;
    conv->tmp_area.buffer = NULL;

    conv->out_area.offset = 0;
    conv->out_area.length = buffer_size;
    conv->out_area.buffer = NULL;

    conv->norm_area.offset = 0;
    conv->norm_area.length = 0;
    conv->norm_area.buffer = NULL;

    conv->iconv_struct = NULL;

    conv->tmp_area.buffer = cli_malloc(buffer_size);
    if(!conv->tmp_area.buffer) {
        goto fail;
    }
    conv->out_area.buffer = cli_malloc(buffer_size);
    if(!conv->out_area.buffer) {
        goto fail;
    }
    conv->norm_area.buffer = cli_malloc(buffer_size);
    if(!conv->norm_area.buffer) {
        goto fail;
    }
    conv->iconv_struct = cli_calloc(1, sizeof(iconv_t));
    if(!conv->iconv_struct) {
        goto fail;
    }
    return 0;

fail:
    free(conv->tmp_area.buffer);
    conv->tmp_area.buffer = NULL;
    free(conv->out_area.buffer);
    conv->out_area.buffer = NULL;
    free(conv->norm_area.buffer);
    conv->norm_area.buffer = NULL;
    return CL_EMEM;
}

/*
 * Resolves the encoding name @fromcode through aliases_htable, stores the
 * result in *@encoding (E_OTHER when the name is not in the table) and
 * returns the minimum number of bytes one unit of that encoding occupies:
 * 4 for the UCS-4 variants, 2 for the UTF-16 variants, 1 otherwise.
 */
static size_t encoding_bytes(const char* fromcode, enum encodings* encoding)
{
    /* special case for these unusual byteorders */
    struct element * e = hashtab_find(&aliases_htable,fromcode,strlen(fromcode));
    if(e && e->key) {
        *encoding = e->data;
    } else {
        *encoding = E_OTHER;
    }

    switch(*encoding) {
        case E_UCS4:
        case E_UCS4_1234:
        case E_UCS4_4321:
        case E_UCS4_2143:
        case E_UCS4_3412:
            return 4;
        case E_UTF16:
        case E_UTF16_BE:
        case E_UTF16_LE:
            return 2;
        case E_UTF8:
        case E_UNKNOWN:
        case E_OTHER:
        default:
            return 1;
    }
}

#ifndef HAVE_ICONV
/*
 * Built-in iconv_open(): allocates a handle describing @fromcode.
 * The target is always treated as UTF-16BE; @tocode is not inspected.
 * Returns (iconv_t)-1 with errno = ENOMEM on allocation failure, which is
 * the failure value the callers in this file test for.
 */
static iconv_t iconv_open(const char *tocode, const char* fromcode)
{
    iconv_t cd = cli_malloc(sizeof(*cd));
    (void)tocode; /* TODO: check that tocode is UTF16BE */
    if(!cd) {
        errno = ENOMEM;
        return (iconv_t)-1;
    }
    cli_dbgmsg(MODULE_NAME "Internal iconv\n");
    cd->size = encoding_bytes(fromcode,&cd->encoding);
    return cd;
}

/* Built-in iconv_close(): releases a handle obtained from iconv_open(). */
static int iconv_close(iconv_t cd)
{
    if(cd && cd != (iconv_t)-1)
        free(cd);
    return 0;
}

/*
 * Converts @units 4-byte UCS-4 units from @in into 2-byte units at @out.
 * For each variant two of the four input bytes must be zero; the other two
 * are emitted as the high and low output byte. Units that fail the zero test
 * are replaced by 0xff 0xff.
 * NOTE(review): the byte positions below are exactly the ones the previous
 * per-variant loops used; whether the 1234/4321/2143/3412 names agree with
 * the byte orders selected in process_bom() could not be confirmed from this
 * file - verify against encoding_aliases.h.
 */
static void ucs4_to_utf16be(enum encodings enc, const uint8_t* in, uint8_t* out, size_t units)
{
    size_t u;

    for(u = 0; u < units; u++, in += 4, out += 2) {
        uint8_t zero0, zero1, hi, lo;

        switch(enc) {
            case E_UCS4_4321:
                zero0 = in[0]; zero1 = in[1]; hi = in[2]; lo = in[3];
                break;
            case E_UCS4_2143:
                zero0 = in[2]; zero1 = in[3]; hi = in[0]; lo = in[1];
                break;
            case E_UCS4_3412:
                zero0 = in[0]; zero1 = in[1]; hi = in[3]; lo = in[2];
                break;
            default: /* E_UCS4, E_UCS4_1234 */
                zero0 = in[2]; zero1 = in[3]; hi = in[1]; lo = in[0];
                break;
        }
        if(!zero0 && !zero1) {
            out[0] = hi;
            out[1] = lo;
        } else {
            cli_dbgmsg(MODULE_NAME "Warning: unicode character out of utf16 range!\n");
            out[0] = 0xff;
            out[1] = 0xff;
        }
    }
}

/*
 * Decodes UTF-8 from @in (@inlen bytes) into 2-byte big-endian units at @out
 * (@outlen bytes). 4-byte sequences cannot be represented in one unit and
 * are replaced by 0xff 0xff.
 * *@consumed / *@produced receive the number of bytes read / written.
 * Returns 0 when it stopped because input or output ran out, EILSEQ on an
 * invalid lead or continuation byte, EINVAL when the input ends in the
 * middle of a sequence. No byte outside [in, in+inlen) is ever read.
 */
static int utf8_to_utf16be(const uint8_t* in, size_t inlen, uint8_t* out, size_t outlen,
                           size_t* consumed, size_t* produced)
{
    size_t i = 0, j = 0;
    int err = 0;

    /* every input sequence yields exactly 2 output bytes */
    while(i < inlen && outlen - j >= 2) {
        const uint8_t c = in[i];
        size_t need, k;

        if(c < 0x80) {
            need = 1;
        } else if((c & 0xE0) == 0xC0) {
            need = 2;
        } else if((c & 0xF0) == 0xE0) {
            need = 3;
        } else if((c & 0xF8) == 0xF0) {
            need = 4;
        } else {
            cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
            err = EILSEQ;
            break;
        }
        if(inlen - i < need) {
            /* sequence is cut off by the end of the input */
            err = EINVAL;
            break;
        }
        for(k = 1; k < need; k++) {
            if((in[i+k] & 0xC0) != 0x80)
                break;
        }
        if(k < need) {
            cli_dbgmsg(MODULE_NAME "invalid UTF8 character encountered\n");
            err = EILSEQ;
            break;
        }

        switch(need) {
            case 1:
                out[j++] = 0;
                out[j++] = c;
                break;
            case 2:
                /* 2 bytes long 110yyyyy 10zzzzzz -> 00000yyy yyzzzzzz*/
                out[j++] = (uint8_t)((c & 0x1F) >> 2);
                out[j++] = (uint8_t)(((c & 0x03) << 6) | (in[i+1] & 0x3F));
                break;
            case 3:
                /* 3 bytes long 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz*/
                out[j++] = (uint8_t)(((c & 0x0F) << 4) | ((in[i+1] >> 2) & 0x0F));
                out[j++] = (uint8_t)(((in[i+1] & 0x03) << 6) | (in[i+2] & 0x3F));
                break;
            default:
                /* 4 bytes long 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz*/
                cli_dbgmsg(MODULE_NAME "UTF8 character out of UTF16 range encountered");
                out[j++] = 0xff;
                out[j++] = 0xff;
                break;
        }
        i += need;
    }

    *consumed = i;
    *produced = j;
    return err;
}

/*
 * Built-in iconv(): converts from the handle's source encoding to UTF-16BE.
 *
 * Follows the iconv() calling convention: the buffer pointers and the
 * remaining-byte counters are advanced by what was actually read/written.
 * A NULL @inbuf or @outbuf is a state-reset request; this converter keeps
 * no state, so that is a no-op returning 0.
 * Returns 0 when all input was converted, otherwise -1 with errno set to
 * E2BIG (input left over), EILSEQ or EINVAL (UTF-8 only, see
 * utf8_to_utf16be()).
 */
static int iconv(iconv_t iconv_struct,char **inbuf, size_t *inbytesleft,
		char** outbuf, size_t *outbytesleft)
{
    const uint8_t* input;
    uint8_t* output;
    size_t consumed = 0, produced = 0, units, i;
    int err = 0;

    if(!inbuf || !outbuf) {
        return 0;
    }
    input = (const uint8_t*)*inbuf;
    output = (uint8_t*)*outbuf;

    /* output is always utf16be !*/
    switch(iconv_struct->encoding) {
        case E_UCS4:
        case E_UCS4_1234:
        case E_UCS4_4321:
        case E_UCS4_2143:
        case E_UCS4_3412:
            /* 4 input bytes become 2 output bytes */
            units = *inbytesleft / 4;
            if(units > *outbytesleft / 2)
                units = *outbytesleft / 2;
            ucs4_to_utf16be(iconv_struct->encoding, input, output, units);
            consumed = units * 4;
            produced = units * 2;
            break;
        case E_UTF16:
        case E_UTF16_LE:
            /* swap the two bytes of every unit */
            units = *inbytesleft / 2;
            if(units > *outbytesleft / 2)
                units = *outbytesleft / 2;
            for(i = 0; i < units * 2; i += 2) {
                output[i] = input[i+1];
                output[i+1] = input[i];
            }
            consumed = produced = units * 2;
            break;
        case E_UTF16_BE:
            units = *inbytesleft / 2;
            if(units > *outbytesleft / 2)
                units = *outbytesleft / 2;
            memcpy(output, input, units * 2);
            consumed = produced = units * 2;
            break;
        case E_UTF8:
            err = utf8_to_utf16be(input, *inbytesleft, output, *outbytesleft, &consumed, &produced);
            break;
        case E_UNKNOWN:
        case E_OTHER:
        default:
            /* single-byte data: widen each byte to 00 xx */
            units = *inbytesleft;
            if(units > *outbytesleft / 2)
                units = *outbytesleft / 2;
            for(i = 0; i < units; i++) {
                output[i*2]   = 0;
                output[i*2+1] = input[i];
            }
            consumed = units;
            produced = units * 2;
            break;
    }

    *inbytesleft  -= consumed;
    *outbytesleft -= produced;
    *inbuf  += consumed;
    *outbuf += produced;

    if(!err && *inbytesleft) {
        err = E2BIG;
    }
    if(err) {
        errno = err;
        return -1;
    }
    return 0;
}

#else

#endif

/* new iconv() version */
/*
 * Guesses the encoding from the first 4 input bytes stored in conv->bom:
 * either a real byte order mark, or the byte pattern of a leading '<'
 * (0x3C) / "<?" in one of the multi-byte encodings.
 * When an encoding is recognized it is handed to process_encoding_set() with
 * priority BOM (real mark) or NOBOM_AUTODETECT (pattern only).
 * Always stores the results in conv->enc_bytes and conv->has_bom.
 * NOTE(review): enc_bytes stays 1 for the UCS-4 results, so bom_length()
 * returns 0 for them - confirm whether 4 was intended.
 */
static inline void process_bom(struct entity_conv* conv)
{
    const unsigned char* bom = conv->bom;
    const char* encoding = NULL;
    int has_bom = 0;
    uint8_t enc_bytes = 1;/* default is UTF8, which has a minimum of 1 bytes*/

    /* undecided 32-bit encodings are treated as ucs4, and
     * 16 bit as utf16*/
    switch(bom[0]) {
        case 0x00:
            if(bom[1] == 0x00) {
                if(bom[2] == 0xFE && bom[3] == 0xFF) {
                    encoding = UCS4_1234;/* UCS-4 big-endian*/
                    has_bom = 1;
                } else if(bom[2] == 0xFF && bom[3] == 0xFE) {
                    encoding = UCS4_2143;/* UCS-4 unusual order 2143 */
                    has_bom = 1;
                } else if(bom[2] == 0x00 && bom[3] == 0x3C) {
                    /* undecided, treat as ucs4 */
                    encoding = UCS4_1234;
                } else if(bom[2] == 0x3C && bom[3] == 0x00) {
                    encoding = UCS4_2143;
                }
            }/* 0x00 0x00 */
            else if(bom[1] == 0x3C) {
                if(bom[2] == 0x00) {
                    if(bom[3] == 0x00) {
                        encoding = UCS4_3412;
                    } else if(bom[3] == 0x3F) {
                        encoding = UTF16_BE;
                        enc_bytes = 2;
                    }
                }/*0x00 0x3C 0x00*/
            }/*0x00 0x3C*/
            break;
        case 0xFF:
            if(bom[1] == 0xFE) {
                if(bom[2] == 0x00 && bom[3] == 0x00) {
                    encoding = UCS4_4321;
                    has_bom = 1;
                } else {
                    encoding = UTF16_LE;
                    has_bom = 1;
                    enc_bytes = 2;
                }
            }/*0xFF 0xFE*/
            break;
        case 0xFE:
            if(bom[1] == 0xFF) {
                if(bom[2] == 0x00 && bom[3] == 0x00) {
                    encoding = UCS4_3412;
                    has_bom = 1;
                } else {
                    encoding = UTF16_BE;
                    has_bom = 1;
                    enc_bytes = 2;
                }
            }/*0xFE 0xFF*/
            break;
        case 0xEF:
            if(bom[1] == 0xBB && bom[2] == 0xBF) {
                encoding = UTF8;
                has_bom = 1;
                /*enc_bytes = 4;- default, maximum 4 bytes*/
            }/*0xEF 0xBB 0xBF*/
            break;
        case 0x3C:
            if(bom[1] == 0x00) {
                if(bom[2] == 0x00 && bom[3] == 0x00) {
                    encoding = UCS4_4321;
                } else if(bom[2] == 0x3F && bom[3] == 0x00) {
                    encoding = UTF16_LE;
                    enc_bytes = 2;
                }
            }/*0x3C 0x00*/
            else if(bom[1] == 0x3F && bom[2] == 0x78 && bom[3]==0x6D) {
                /* "<?xm" in a single-byte encoding: nothing to decide here */
                encoding = NULL;
                enc_bytes = 1;
            }/*0x3C 3F 78 6D*/
            break;
        case 0x4C:
            if(bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) {
                cli_dbgmsg(MODULE_NAME "EBCDIC encoding is not supported in line mode\n");
                encoding = NULL;
                enc_bytes = 1;
            }/*4C 6F A7 94*/
            break;
        default:
            break;
    }/*switch*/

    if(encoding) {
        cli_dbgmsg(MODULE_NAME "encoding detected as :%s\n", encoding);
        process_encoding_set(conv, (const unsigned char*)encoding, has_bom ? BOM : NOBOM_AUTODETECT);
    }

    conv->enc_bytes = enc_bytes;
    conv->has_bom = has_bom;
}

/* Lookup table of the bytes allowed in an encoding name:
 * ()-./0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
static const uint8_t encname_chars[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    /* 0x60 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* checks that encoding is sane, and normalizes to uppercase */
/*
 * Returns a newly allocated uppercase copy of @enc (the caller frees it), or
 * NULL when @enc is NULL, longer than 32 bytes, contains a byte not allowed
 * by encname_chars, or memory runs out.
 */
static char* normalize_encoding(const unsigned char* enc)
{
    char* norm;
    size_t i, len;

    if(!enc)
        return NULL;
    len = strlen((const char*)enc);
    if(len > 32)
        return NULL;
    for(i = 0; i < len; i++) {
        if(!encname_chars[enc[i]])
            return NULL;
    }
    norm = cli_malloc( len+1 );
    if(!norm)
        return NULL;
    for(i = 0; i < len; i++)
        norm[i] = (char)toupper(enc[i]);
    norm[len]='\0';
    return norm;
}

/*
 * Releases everything @conv owns (encoding name, work buffers, iconv handle
 * storage) and clears the pointers, so calling it again is harmless.
 * The iconv handle itself is not closed here; handles returned by
 * iconv_open_cached() stay in the cache. Always returns 0.
 */
static int encoding_norm_done(struct entity_conv* conv)
{
    if(!conv) {
        return 0;
    }
    if(conv->encoding) {
        free(conv->encoding);
        conv->encoding = NULL;
    }
    conv->buffer_size = 0;
    if(conv->tmp_area.buffer) {
        free(conv->tmp_area.buffer);
        conv->tmp_area.buffer = NULL;
    }
    if(conv->out_area.buffer) {
        free(conv->out_area.buffer);
        conv->out_area.buffer = NULL;
    }
    if(conv->norm_area.buffer) {
        free(conv->norm_area.buffer);
        conv->norm_area.buffer = NULL;
    }
    if(conv->iconv_struct) {
        free(conv->iconv_struct);
        /* cleared so a repeated call cannot free it twice */
        conv->iconv_struct = NULL;
    }
    return 0;
}

/* Public counterpart of init_entity_converter(): releases @conv's resources. */
int entity_norm_done(struct entity_conv* conv)
{
    return encoding_norm_done(conv);
}

/*
 * Number of bytes to skip for the byte order mark recorded by process_bom():
 * 0 without a BOM; otherwise 2 or 4 for 2- and 4-byte encodings, and 3 for
 * 1-byte encodings only when encoding_symbolic is E_UTF8.
 */
static unsigned short bom_length(struct entity_conv* conv)
{
    if(conv->has_bom) {
        switch(conv->enc_bytes) {
            case 1:
                if(conv->encoding_symbolic == E_UTF8) {
                    return 3;
                }
                break;
            case 2:
                return 2;
            case 4:
                return 4;
            default:
                break;
        }
    }
    return 0;
}

/* sarge leaks on iconv_open/iconv_close, so lets not open/close so many times,
 * just keep on each thread its own pool of iconvs*/

struct iconv_cache {
    iconv_t* tab;              /* opened handles; indexes [0, last) are valid */
    size_t     len;            /* number of slots allocated in tab */
    size_t   last;             /* number of slots in use */
    struct hashtable hashtab;  /* encoding name -> index into tab */
};

/* Initializes the name lookup table of a zero-filled @cache. */
static void iconv_cache_init(struct iconv_cache* cache)
{
/*	cache->tab = NULL;
	cache->len = 0;
	cache->used = 0; - already done by memset*/
    cli_dbgmsg(MODULE_NAME "Initializing iconv pool:%p\n",(void*)cache);
    hashtab_init(&cache->hashtab, 32);
}

/* Closes every cached handle, then frees the lookup table, the handle array
 * and @cache itself. */
static void iconv_cache_destroy(struct iconv_cache* cache)
{
    size_t i;
    cli_dbgmsg(MODULE_NAME "Destroying iconv pool:%p\n",(void*)cache);
    for(i = 0; i < cache->last; i++) {
        cli_dbgmsg(MODULE_NAME "closing iconv:%p\n",(void*)cache->tab[i]);
        iconv_close(cache->tab[i]);
    }
    hashtab_clear(&cache->hashtab);
    free(cache->hashtab.htable);
    free(cache->tab);
    free(cache);
}


#ifdef CL_THREAD_SAFE
/* key of the per-thread iconv cache, created once by iconv_pool_tls_key_alloc() */
static pthread_key_t iconv_pool_tls_key;
static pthread_once_t iconv_pool_tls_key_once = PTHREAD_ONCE_INIT;

/* destructor called for all threads that exit via pthread_exit, or cancellation.
Unfortunately that doesn't include * the main thread, so we have to call this manually for the main thread.*/ static int cache_atexit_registered = 0; static void iconv_pool_tls_instance_destroy(void* ptr) { if(ptr) { iconv_cache_destroy(ptr); } } static void iconv_cache_cleanup_main(void) { struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key); if(cache) { iconv_pool_tls_instance_destroy(cache); pthread_setspecific(iconv_pool_tls_key,NULL); } pthread_key_delete(iconv_pool_tls_key); } static void iconv_pool_tls_key_alloc(void) { pthread_key_create(&iconv_pool_tls_key, iconv_pool_tls_instance_destroy); if(!cache_atexit_registered) { cli_dbgmsg(MODULE_NAME "iconv:registering atexit\n"); if(atexit(iconv_cache_cleanup_main)) { cli_dbgmsg(MODULE_NAME "failed to register atexit\n"); } cache_atexit_registered = 1; } } static void init_iconv_pool_ifneeded(void) { pthread_once(&iconv_pool_tls_key_once, iconv_pool_tls_key_alloc); } static inline struct iconv_cache* cache_get_tls_instance(void) { struct iconv_cache* cache = pthread_getspecific(iconv_pool_tls_key); if(!cache) { cache = cli_calloc(1,sizeof(*cache)); if(!cache) { cli_dbgmsg(MODULE_NAME "!Out of memory allocating TLS iconv instance\n"); return NULL; } iconv_cache_init(cache); pthread_setspecific(iconv_pool_tls_key, cache); } return cache; } #else static struct iconv_cache* global_iconv_cache = NULL; static int iconv_global_inited = 0; static void iconv_cache_cleanup_main(void) { iconv_cache_destroy(global_iconv_cache); } static inline void init_iconv_pool_ifneeded() { if(!iconv_global_inited) { global_iconv_cache = cli_calloc(1,sizeof(*global_iconv_cache)); if(global_iconv_cache) { iconv_cache_init(global_iconv_cache); atexit(iconv_cache_cleanup_main); iconv_global_inited = 1; } } } static inline struct iconv_cache* cache_get_tls_instance(void) { return global_iconv_cache; } #endif static iconv_t iconv_open_cached(const char* fromcode) { struct iconv_cache * cache; size_t idx; const size_t 
fromcode_len = strlen((const char*)fromcode); struct element * e; iconv_t iconv_struct; init_iconv_pool_ifneeded(); cache = cache_get_tls_instance();/* gets TLS iconv pool */ if(!cache) { cli_dbgmsg(MODULE_NAME "!Unable to get TLS iconv cache!\n"); errno = EINVAL; return (iconv_t)-1; } e = hashtab_find(&cache->hashtab, fromcode, fromcode_len); if(e && (e->data < 0 || (size_t)e->data > cache->len)) { e = NULL; } if(e) { size_t dummy_in, dummy_out; /* reset state */ iconv(cache->tab[e->data], NULL, &dummy_in, NULL, &dummy_out); return cache->tab[e->data]; } cli_dbgmsg(MODULE_NAME "iconv not found in cache, for encoding:%s\n",fromcode); iconv_struct = iconv_open("UTF-16BE",(const char*)fromcode); if(iconv_struct != (iconv_t)-1) { idx = cache->last++; if(idx >= cache->len) { cache->len += 16; cache->tab = cli_realloc2(cache->tab, cache->len*sizeof(cache->tab[0])); if(!cache->tab) { cli_dbgmsg(MODULE_NAME "!Out of mem in iconv-pool\n"); errno = ENOMEM; return (iconv_t)-1; } } hashtab_insert(&cache->hashtab, fromcode, fromcode_len, idx); cache->tab[idx] = iconv_struct; cli_dbgmsg(MODULE_NAME "iconv_open(),for:%s -> %p\n",fromcode,(void*)cache->tab[idx]); return cache->tab[idx]; } return (iconv_t)-1; } void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio) { char *tmp_encoding; enum encodings tmp; size_t new_size,old_size; if(!encoding && prio == SWITCH_TO_BLOCKMODE) { if(conv->linemode) { cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); conv->linemode = 0; } return; } cli_dbgmsg(MODULE_NAME "Request to set encoding for %p to %s, priority: %d\n", (void*)conv, encoding, prio); if(conv->priority == CONTENT_TYPE || conv->encoding || conv->encoding_symbolic == E_ICONV) { cli_dbgmsg(MODULE_NAME "won't override encoding due to priorities\n"); return; /* Content-type in header is highest priority, no overrides possible. 
* Also no overrides after an encoding has been set.*/ } /* validate encoding name, and normalize to uppercase */ if(!(tmp_encoding = normalize_encoding(encoding))) { cli_dbgmsg(MODULE_NAME "encoding name is not valid, ignoring\n"); return; } /* don't allow to change between unicode encodings that have different byte-size */ if(prio == META) { /* need to consider minimum size of an encoding here */ old_size = conv->enc_bytes; new_size = encoding_bytes(tmp_encoding,&tmp); if(old_size != new_size) { /* on x86 gcc wants %u for size_t, on x86_64 it wants %lu for size_t. So just cast to unsigned long to make warnings go away. */ cli_dbgmsg(MODULE_NAME "refusing to override encoding - new encoding size differs: %s(%lu) != %s(%lu)\n", conv->encoding, (unsigned long)old_size, tmp_encoding, (unsigned long)new_size); free(tmp_encoding); return; } } conv->encoding = tmp_encoding; cli_dbgmsg(MODULE_NAME "New encoding for %p:%s\n", (void*)conv, conv->encoding); *(iconv_t*)conv->iconv_struct = iconv_open_cached( conv->encoding ); if(*(iconv_t*)conv->iconv_struct == (iconv_t)-1) { cli_dbgmsg(MODULE_NAME "Encoding not accepted by iconv_open()%s, falling back to default!\n", conv->encoding); /* message shown only once/file */ /* what can we do? short-circuit iconv */ free(conv->encoding); conv->encoding = NULL; /* we will process using whatever we currently have for encoding_symbolic. * If encoding was already set to iconv, we shouldn't be here.*/ assert(conv->encoding_symbolic != E_ICONV); } else { cli_dbgmsg(MODULE_NAME "Switching to block-mode, bytes processed in line-mode: %u\n", conv->linemode_processed); conv->encoding_symbolic = E_ICONV; conv->priority = prio; conv->linemode = 0; } } static int in_iconv_u16(m_area_t* in_m_area, iconv_t* iconv_struct, m_area_t* out_m_area) { char tmp4[4]; size_t inleft = in_m_area->length - in_m_area->offset; size_t rc, alignfix; char* input = (char*)in_m_area->buffer + in_m_area->offset; size_t outleft = out_m_area->length > 0 ? 
out_m_area->length : 0;/*TODO: use real buffer size not last one*/ char* out = (char*)out_m_area->buffer; if(!inleft) { /* EOF */ out_m_area->offset = out_m_area->length = 0; return 0; } /* convert encoding conv->tmp_area. conv->out_area */ alignfix = inleft%4;/* iconv gives an error if we give him 3 bytes to convert, and we are using ucs4, ditto for utf16, and 1 byte*/ inleft -= alignfix; if(!inleft && alignfix) { /* EOF, and we have less than 4 bytes to convert */ memset(tmp4, 0, 4); memcpy(tmp4, input, alignfix); input = tmp4; inleft = 4; alignfix = 0; } while (inleft && (outleft >= 2)) { /* iconv doesn't like inleft to be 0 */ const size_t outleft_last = outleft; assert(*iconv_struct != (iconv_t)-1); rc = iconv(*iconv_struct, (char**) &input, &inleft, (char**) &out, &outleft); if(rc == (size_t)-1) { if(errno == E2BIG) { /* not enough space in output buffer */ break; } cli_dbgmsg(MODULE_NAME "iconv error:%s\n", strerror(errno)); } else if(outleft == outleft_last) { cli_dbgmsg(MODULE_NAME "iconv stall (no output)\n"); } else { /* everything ok */ continue; } cli_dbgmsg(MODULE_NAME "resuming (inleft:%lu, outleft:%lu, inpos:%ld, %ld)\n", inleft, outleft, input - (char*)in_m_area->buffer, out - (char*)out_m_area->buffer); /* output raw byte, and resume at next byte */ if(outleft < 2) break; outleft -= 2; *out++ = 0; *out++ = *input++; inleft--; } /* length - offset - alignfix is original value of inleft, new value is inleft, * difference tells how much it moved. 
*/ in_m_area->offset = in_m_area->length - alignfix - inleft; if(out_m_area->length >= 0 && out_m_area->length >= (off_t)outleft) { out_m_area->length -= (off_t)outleft; } else { cli_dbgmsg(MODULE_NAME "outleft overflown, ignoring\n"); out_m_area->length = 0; } out_m_area->offset = 0; return 0; } #define NORMALIZE_CHAR(c, out, limit, linemode) \ {\ if (linemode && c == '\n') {\ i++;\ break;\ } else {\ unsigned char* out_new = u16_normalize(c, out, limit);\ if(out_new) {\ limit -= out_new - out;\ }\ out = out_new;\ }\ } /* don't use CLI_ISCONTAINED2 here, because values are signed, and gcc4.3 * assumes signed overflow doesn't occur when optimizing (see -Wstrict-overflow) */ #define LIMIT_LENGTH(siz, siz_limit) ((siz) <= (siz_limit) ? (siz) : (siz_limit)) #define OFFSET_INBOUNDS(offset, length) ((offset) >= 0 && (length) >= 0 && (offset) < (length)) /* EOF marker is m_area->length == 0 */ /* reads input from either @m_area or @stream, and returns an m_area_t pointing to the data read. * When we can't read anything due to EOF ->length will be set to 0. 
* bounds checks offset and length*/ static inline m_area_t* read_raw(struct entity_conv* conv, m_area_t* m_area, FILE* stream) { if(!m_area) { size_t iread; m_area = &conv->tmp_area; if(OFFSET_INBOUNDS(m_area->offset, m_area->length)) { return m_area; } /* offset out of bounds -> all the buffer was processed, fill it again */ iread = fread(m_area->buffer, 1, conv->buffer_size, stream); m_area->length = LIMIT_LENGTH(iread, conv->buffer_size); m_area->offset = 0; if(ferror(stream)) { cli_errmsg("Error while reading HTML stream\n"); } } else { if(!OFFSET_INBOUNDS(m_area->offset, m_area->length)) { cli_dbgmsg(MODULE_NAME "EOF reached\n"); m_area->offset = m_area->length; /* EOF marker */ } } return m_area; } static inline uint16_t get_u16(const unsigned char* buf, const size_t i) { return ((uint16_t)buf[i] << 8) | buf[i+1]; } unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area) { unsigned char* out = conv->out_area.buffer; if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer || !out) { return NULL; } if(!(in_m_area = read_raw(conv, in_m_area, stream_in))) { /* error encountered */ return NULL; } else { const off_t input_limit = in_m_area->length; const unsigned char* input = in_m_area->buffer; off_t input_offset = in_m_area->offset; off_t limit = conv->out_area.length - 1; off_t limit_prev = limit; off_t i = 0; /* read_raw() ensures this condition */ assert((!input_limit && !input_offset) || (input_offset >=0 && input_limit > 0 && input_offset <= input_limit)); if(!conv->bom_cnt && input_offset + 4 < input_limit) {/* detect Byte Order Mark */ size_t bom_len; memcpy(conv->bom, input, 4); process_bom(conv); bom_len = bom_length(conv); in_m_area->offset = input_offset = input_offset + bom_len; conv->bom_cnt = 1; } if(conv->linemode && conv->linemode_processed > LINEMODE_LIMIT) { cli_dbgmsg(MODULE_NAME "Line-mode limit exceeded (%u), switching to block-mode\n", conv->linemode_processed); conv->linemode = 0; } 
switch(conv->encoding_symbolic) { case E_ICONV:/* only in block-mode */ /* normalize already converted characters from a previous pass * (output buffer was full, and we couldn't normalize more in previous pass) */ for(i = conv->norm_area.offset;i < conv->norm_area.length && limit > 0 && out; i += 2) { const uint16_t c = get_u16(conv->norm_area.buffer, i); NORMALIZE_CHAR(c, out, limit, 0); } conv->norm_area.offset = i; if(limit > 0) { conv->norm_area.length = conv->buffer_size; in_iconv_u16(in_m_area, conv->iconv_struct, &conv->norm_area); /*in_iconv_u16 always fills entire norm_area buffer starting from 0. */ for(i = 0;i < conv->norm_area.length && limit > 0 && out; i += 2) { const uint16_t c = get_u16(conv->norm_area.buffer, i); NORMALIZE_CHAR(c, out, limit, 0); } if(i) { conv->norm_area.offset = i; } } if(limit == limit_prev) { /* output pointer didn't move => EOF */ return NULL; } break; /* out_area must have enough space to allow all bytes in norm_area normalized, * if we norm with &x;, then we need 7* space. */ default: cli_dbgmsg(MODULE_NAME "Unhandled encoding:%d\n",conv->encoding_symbolic); conv->encoding_symbolic = E_OTHER; case E_UNKNOWN: case E_OTHER: if(!input_limit || input_offset == input_limit) { /* nothing to do, EOF */ return NULL; } for(i = input_offset; i < input_limit && limit > 0; i++) { const unsigned char c = input[i]; if(conv->linemode && c == '\n') { i++; break; } if(c) { *out++ = c; limit--; } } in_m_area->offset = i; } if(conv->linemode) { conv->linemode_processed += i - input_offset; } if(limit < 0) limit = 0; conv->out_area.buffer[conv->out_area.length - limit - 1] = '\0'; return conv->out_area.buffer; } }