4bdf6efd |
return 0;
}
#endif
/* new iconv() version */
/*
 * Classify the four bytes stored in conv->bom and record the result in conv:
 *   conv->autodetected - encoding constant matching the byte pattern
 *                        (OTHER when nothing matches),
 *   conv->enc_bytes    - 1, 2 or 4 depending on the pattern (4 is the default),
 *   conv->has_bom      - 1 only when a real Byte Order Mark was recognised,
 *                        0 when the guess comes from a "<" / "<?" / "<?xm"
 *                        pattern or when nothing matched.
 */
static inline void process_bom(struct entity_conv* conv)
{
    const unsigned char* const b = conv->bom;
    const unsigned char* detected = OTHER;
    int bom_found = 0;
    uint8_t width = 4; /* default is UTF8, which has a maximum of 4 bytes */

    if(b[0] == 0x00) {
        if(b[1] == 0x00) {
            if(b[2] == 0xFE && b[3] == 0xFF) {
                /* 00 00 FE FF: UCS-4 big-endian */
                detected = UCS4_1234;
                bom_found = 1;
            } else if(b[2] == 0xFF && b[3] == 0xFE) {
                /* 00 00 FF FE: UCS-4, unusual order 2143 */
                detected = UCS4_2143;
                bom_found = 1;
            } else if(b[2] == 0x00 && b[3] == 0x3C) {
                /* 00 00 00 3C: '<' as a 32-bit unit, order 1234, no BOM */
                detected = UNDECIDED_32_1234;
            } else if(b[2] == 0x3C && b[3] == 0x00) {
                /* 00 00 3C 00: '<' as a 32-bit unit, order 2143, no BOM */
                detected = UNDECIDED_32_2143;
            }
        } else if(b[1] == 0x3C && b[2] == 0x00) {
            if(b[3] == 0x00) {
                /* 00 3C 00 00: '<' as a 32-bit unit, order 3412, no BOM */
                detected = UNDECIDED_32_3412;
            } else if(b[3] == 0x3F) {
                /* 00 3C 00 3F: "<?" in big-endian 16-bit units, no BOM */
                detected = UNDECIDED_16_BE;
                width = 2;
            }
        }
    } else if(b[0] == 0xFF) {
        if(b[1] == 0xFE) {
            bom_found = 1;
            if(b[2] == 0x00 && b[3] == 0x00) {
                /* FF FE 00 00 */
                detected = UCS4_4321;
            } else {
                /* FF FE followed by anything else */
                detected = UTF16_LE;
                width = 2;
            }
        }
    } else if(b[0] == 0xFE) {
        if(b[1] == 0xFF) {
            bom_found = 1;
            if(b[2] == 0x00 && b[3] == 0x00) {
                /* FE FF 00 00 */
                detected = UCS4_3412;
            } else {
                /* FE FF followed by anything else */
                detected = UTF16_BE;
                width = 2;
            }
        }
    } else if(b[0] == 0xEF) {
        if(b[1] == 0xBB && b[2] == 0xBF) {
            /* EF BB BF: width stays at the default of 4 */
            detected = UTF8;
            bom_found = 1;
        }
    } else if(b[0] == 0x3C) {
        if(b[1] == 0x00) {
            if(b[2] == 0x00 && b[3] == 0x00) {
                /* 3C 00 00 00: '<' as a 32-bit unit, order 4321, no BOM */
                detected = UNDECIDED_32_4321;
            } else if(b[2] == 0x3F && b[3] == 0x00) {
                /* 3C 00 3F 00: "<?" in little-endian 16-bit units, no BOM */
                detected = UNDECIDED_16_LE;
                width = 2;
            }
        } else if(b[1] == 0x3F && b[2] == 0x78 && b[3] == 0x6D) {
            /* 3C 3F 78 6D: "<?xm" in single bytes */
            detected = UNDECIDED_8;
            width = 1;
        }
    } else if(b[0] == 0x4C) {
        if(b[1] == 0x6F && b[2] == 0xA7 && b[3] == 0x94) {
            /* 4C 6F A7 94 */
            detected = EBCDIC;
            width = 1;
        }
    }

    conv->autodetected = detected;
    conv->enc_bytes = width;
    conv->has_bom = bom_found;
}
/*
 * Build an upper-cased copy of the encoding name `enc`.
 *
 * The OTHER placeholder is replaced by "ISO-8859-1" before anything is
 * measured, so the allocation size, the copy loop and the terminator all
 * refer to the string that is actually copied.
 *
 * Returns a buffer obtained from cli_malloc() (released with free() by the
 * code in this file), or NULL when `enc` is NULL or the allocation fails.
 */
static unsigned char* normalize_encoding(const unsigned char* enc)
{
    unsigned char* norm;
    size_t i;
    size_t len;

    if(!enc)
        return NULL;
    if(enc == OTHER)
        enc = (const unsigned char*)"ISO-8859-1";

    /* measure once, after the substitution */
    len = strlen((const char*)enc);
    norm = cli_malloc(len + 1);
    if(!norm)
        return NULL;

    for(i = 0; i < len; i++)
        norm[i] = (unsigned char)toupper(enc[i]);
    norm[len] = '\0';
    return norm;
}
/*
 * Map a possibly-NULL encoding name to a usable C string:
 * NULL yields "ISO-8859-1", anything else is returned unchanged.
 */
static const char* encoding_name(unsigned char* encoding)
{
    return encoding ? (char*)encoding : "ISO-8859-1";
}
/*
 * Request that `conv` use `encoding`, subject to the priority rules below.
 *
 * The request is ignored when:
 *   - `encoding` is NULL or the OTHER placeholder (nothing useful to set),
 *   - conv->priority is already CONTENT_TYPE (highest priority, no overrides),
 *   - conv->priority is BOM and the request is only a NOBOM_AUTODETECT guess.
 * Otherwise the previous conv->encoding is freed and replaced by a normalized
 * copy of `encoding`. If normalization fails conv->encoding becomes NULL,
 * which encoding_name() reports as "ISO-8859-1".
 *
 * NOTE(review): conv->priority is only read here, never updated to `prio`;
 * confirm whether the caller is expected to maintain it (see FIXME below).
 */
void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority prio)
{
    /* %p with a void* argument: passing a pointer to %x is undefined behaviour */
    cli_dbgmsg("Setting encoding for %p to %s, priority: %d\n",
               (void*)conv,
               encoding ? (const char*)encoding : "(null)",
               (int)prio);
    if(!encoding || encoding == OTHER)
        return;
    if(conv->priority == CONTENT_TYPE)
        return;/* Content-type in header is highest priority, no overrides possible*/
    if(conv->priority == BOM && prio == NOBOM_AUTODETECT)
        return;
    free(conv->encoding);
    conv->encoding = normalize_encoding(encoding);/* FIXME: better obey priorities*/
    /* encoding_name() keeps %s away from a NULL pointer if normalization failed */
    cli_dbgmsg("New encoding for %p:%s\n", (void*)conv, encoding_name(conv->encoding));
    /* reset stream */
}
/*
 * Release everything owned by `conv`: the encoding name and the tmp, out and
 * norm work buffers. Each pointer is reset to NULL and buffer_size to 0.
 * Always returns 0.
 */
static int encoding_norm_done(struct entity_conv* conv)
{
    /* free(NULL) is a no-op, so no guards are needed */
    free(conv->encoding);
    conv->encoding = NULL;

    conv->buffer_size = 0;

    free(conv->tmp_area.buffer);
    conv->tmp_area.buffer = NULL;

    free(conv->out_area.buffer);
    conv->out_area.buffer = NULL;

    free(conv->norm_area.buffer);
    conv->norm_area.buffer = NULL;

    return 0;
}
/*
 * Public teardown entry point: forwards to encoding_norm_done() and returns
 * its result.
 */
int entity_norm_done(struct entity_conv* conv)
{
    const int status = encoding_norm_done(conv);

    return status;
}
/*
 * Copy raw input into `outbuff`, at most `max_len` bytes, stopping shortly
 * after the first newline.
 *
 * Source selection: `m_area` (memory buffer) is used when non-NULL, otherwise
 * `stream`. When a '\n' is met, it and up to 2 following bytes are still
 * delivered (3 bytes in total, in case the newline is part of a wider UCS-4
 * unit), limited by what is available and by `max_len`.
 *
 * Only the bytes actually returned are consumed from the source:
 *   - memory path: m_area->offset advances by exactly the returned count, and
 *     a newline with fewer than 4 bytes left in the window is still delivered
 *     instead of being left behind forever;
 *   - stream path: bytes are read one at a time so that nothing beyond the
 *     returned data is taken out of the stream.
 *
 * Returns the number of bytes stored in `outbuff`; 0 when the memory area is
 * exhausted, when there is no stream, or at end of file.
 */
static size_t read_raw(FILE *stream, m_area_t *m_area, unsigned int max_len, unsigned char* outbuff)
{
    /* bytes delivered starting at a newline: the newline itself plus 2 more */
    enum { NEWLINE_TAIL = 3 };

    /* Try and use the memory buffer first */
    if (m_area) {
        const unsigned char* src;
        size_t avail;
        size_t limit;
        size_t copied = 0;

        if(m_area->offset >= m_area->length)
            return 0;

        avail = (size_t)(m_area->length - m_area->offset);
        limit = avail > max_len ? max_len : avail;
        src = m_area->buffer + m_area->offset;

        /* copy everything up to, but not including, the first newline */
        while(copied < limit && src[copied] != '\n') {
            outbuff[copied] = src[copied];
            copied++;
        }
        if(copied < limit) {
            /* stopped on a newline: deliver it and what follows, capped */
            size_t tail = limit - copied;
            if(tail > NEWLINE_TAIL)
                tail = NEWLINE_TAIL;
            memcpy(outbuff + copied, src + copied, tail);
            copied += tail;
        }
        m_area->offset += copied;
        return copied;
    }

    if (!stream) {
        cli_dbgmsg("No HTML stream\n");
        return 0;
    }

    {
        size_t got = 0;
        size_t stop = max_len;
        int seen_newline = 0;

        while(got < stop) {
            const int c = getc(stream);
            if(c == EOF)
                break;
            outbuff[got] = (unsigned char)c;
            if(c == '\n' && !seen_newline) {
                /* only the first newline shortens the read */
                seen_newline = 1;
                if(got + NEWLINE_TAIL < stop)
                    stop = got + NEWLINE_TAIL;
            }
            got++;
        }
        if(ferror(stream)) {
            cli_errmsg("Error while reading HTML stream\n");
        }
        return got;
    }
}
/*
 * Advance the input cursor `*in` past the Byte Order Mark, if one was detected
 * (conv->has_bom). Nothing is written through `out`; the parameter is kept for
 * interface compatibility.
 *
 * A UTF-8 BOM is 3 bytes long, but process_bom() leaves enc_bytes at its
 * default of 4 for UTF-8, so UTF-8 has to be checked before dispatching on
 * enc_bytes; otherwise the first content byte after the BOM would be skipped
 * as well. For the other encodings the BOM is one code unit wide (2 or 4
 * bytes).
 */
static void output_first(struct entity_conv* conv,unsigned char** out, unsigned char** in)
{
    (void)out;

    if(!conv->has_bom)
        return;

    if(conv->autodetected == UTF8) {
        *in += 3;
        return;
    }

    switch(conv->enc_bytes) {
        case 2:
            *in += 2;
            break;
        case 4:
            *in += 4;
            break;
        default:
            /* no BOM width known for this unit size: leave the cursor alone */
            break;
    }
}
/*
 * Read more raw input, convert it to UTF-16BE with iconv, and return the next
 * line obtained via cli_readline() from conv->norm_area.
 *
 * conv       - conversion state; its tmp_area and out_area buffers must be
 *              allocated. Free space in them is computed from
 *              conv->buffer_size.
 * stream_in  - input stream, used by read_raw() when in_m_area is NULL.
 * in_m_area  - in-memory input, preferred by read_raw() when non-NULL.
 * maxlen     - upper bound on the bytes read per call and on the line length
 *              passed to cli_readline().
 *
 * Returns NULL when conv or its tmp/out buffers are missing; otherwise
 * whatever cli_readline() returns.
 */
unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen)
{
if(!conv || !conv->out_area.buffer || !conv->tmp_area.buffer)
return NULL;
else {
/* stream_in|in_m_area ->(read_raw) conv->tmp_area -> (iconv) conv->out_area -> (normalize) conv->norm_area -> (cli_readline) return value*/
/* tmp_move/out_move: bytes left unconsumed in tmp_area/out_area by the previous call */
const size_t tmp_move = conv->tmp_area.length - conv->tmp_area.offset;
const size_t tmp_available = conv->buffer_size - tmp_move;
const size_t max_read = maxlen < tmp_available ? maxlen : tmp_available;
unsigned char* tmpbuff = &conv->tmp_area.buffer[tmp_move];
const size_t out_move = conv->out_area.length < conv->out_area.offset ? 0 : conv->out_area.length - conv->out_area.offset;
size_t outleft = conv->buffer_size - out_move;
unsigned char* out = &conv->out_area.buffer[out_move];
const size_t norm_move = conv->norm_area.length - conv->norm_area.offset;
unsigned char* norm;
const unsigned char* norm_end;
iconv_t iconv_struct;
size_t rc, inleft;
ssize_t i;
char alignfix;
/* move whatever left in conv->tmp_area to beginning */
if(tmp_move)
memmove(conv->tmp_area.buffer, conv->tmp_area.buffer + conv->tmp_area.offset, tmp_move);
conv->tmp_area.offset = 0;
/* read raw data from stream, or in_m_area into conv->tmp_area*/
conv->tmp_area.length = tmp_move + read_raw(stream_in, in_m_area, max_read, tmpbuff);
/* move whatever left in conv->out_area to beginning */
if(out_move)
memmove(conv->out_area.buffer, conv->out_area.buffer + conv->out_area.offset, out_move);
conv->out_area.offset = 0;
tmpbuff = conv->tmp_area.buffer;
/* BOM detection runs once per conv (bom_cnt == 0) and needs at least 4 buffered bytes */
if(!conv->bom_cnt && conv->tmp_area.length >= 4) {/* detect Byte Order Mark */
memcpy( conv->bom, tmpbuff, 4);
process_bom(conv);
process_encoding_set(conv,conv->autodetected,conv->has_bom ? BOM : NOBOM_AUTODETECT);
/* may advance tmpbuff past the BOM */
output_first(conv,&out,&tmpbuff);
conv->bom_cnt++;
}
/* convert encoding conv->tmp_area. conv->out_area */
/* NOTE(review): inleft is taken from tmp_area.length even when output_first()
   moved tmpbuff forward, so iconv may be handed a range that extends past the
   buffered data by the size of the BOM - confirm. */
inleft = conv->tmp_area.length;
alignfix = inleft%4;/* iconv gives an error if we give it 3 bytes to convert
and we are using ucs4; ditto for utf16 and 1 byte. So only a multiple of 4 bytes is converted. */
inleft -= alignfix;
/* fewer than 4 bytes in total: zero-pad up to 4 so that something is converted */
if(!inleft && alignfix) {
size_t k;
for(k=0;k+alignfix < 4;k++)
tmpbuff[alignfix+k] = '\0';
inleft = 4;
alignfix = -inleft;
}
/* the converter is opened and closed on every call */
iconv_struct = iconv_open("UTF-16BE",encoding_name(conv->encoding));
if(iconv_struct == (iconv_t)-1) {
cli_dbgmsg("Iconv init problem for encoding:%s, falling back to iso encoding!\n",encoding_name(conv->encoding));
/* what can we do? just fall back for it being an ISO-8859-1 */
iconv_struct = iconv_open("UTF-16BE","ISO-8859-1");
if(iconv_struct == (iconv_t)-1) {
cli_dbgmsg("fallback failed... bail out\n");
/* no converter at all: hand back a line of the unconverted input */
return cli_readline(NULL,&conv->tmp_area,maxlen);
}
}
if(inleft) /* iconv doesn't like inleft to be 0 */
rc = iconv(iconv_struct, (char**) &tmpbuff, &inleft, (char**) &out, &outleft);
else
rc = 0;
iconv_close(iconv_struct);
/* E2BIG is not treated as an error here; any other failure is logged and skipped over */
if(rc==(size_t)-1 && errno != E2BIG) { |
4ac11cb0 |
/* NOTE(review): the %ld conversions receive pointer differences and size_t values - confirm the argument types match on all targets */
cli_dbgmsg("iconv error:%s, silently resuming (%ld,%ld,%ld,%ld)\n",strerror(errno),out-conv->out_area.buffer,tmpbuff-conv->tmp_area.buffer,inleft,outleft);
/* output raw byte, and resume at next byte */
/* the raw byte is emitted as a two-byte unit: 0x00 followed by the byte */
*out++ = 0;
*out++ = *tmpbuff++;
inleft--;
/* return cli_readline(NULL, &conv->norm_area, maxlen);*/ |
4bdf6efd |
*norm++ = 0x20;
}
else {
/* emit u16 as a decimal numeric character reference "&#N;" */
char buff[10];
snprintf(buff,9,"&#%d;",u16);
buff[9] = '\0';
/* stop when the reference would not fit before norm_end */
if(norm + strlen(buff) >= norm_end)
break;
/* copies the characters only, no terminating NUL */
strncpy((char*)norm, buff, strlen(buff));
norm += strlen(buff);
}
}
conv->out_area.offset = i; /* so that we can resume next time from here */
conv->norm_area.length = norm - conv->norm_area.buffer;
/*
conv->norm_area.buffer[conv->buffer_size-1]=0;DONT DO THIS
if( (o =strstr(conv->norm_area.buffer,"Content")) && strstr(conv->norm_area.buffer,"text/x-"))
printf("%s\n",o);*/
/* final cli_readline from conv->norm_area */
return cli_readline(NULL, &conv->norm_area, maxlen);
}
}
#endif |