/* * HTML Entity & Encoding normalization. * * Copyright (C) 2006 Török Edvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. * */ #ifndef _ENTITIES_H #define _ENTITIES_H #include "cltypes.h" #include "hashtab.h" #define UCS4_1234 (const unsigned char*)"UCS-4LE" #define UCS4_4321 (const unsigned char*)"UCS-4BE" #define UCS4_2143 (const unsigned char*)"UCS4" #define UCS4_3412 (const unsigned char*)"UCS-4" #define UTF16_BE (const unsigned char*)"UTF-16BE" #define UTF16_LE (const unsigned char*)"UTF-16LE" #define UTF8 (const unsigned char*)"UTF-8" #define UNDECIDED_32_1234 UCS4_1234 #define UNDECIDED_32_4321 UCS4_4321 #define UNDECIDED_32_2143 UCS4_2143 #define UNDECIDED_32_3412 UCS4_3412 #define UNDECIDED_16_BE UTF16_BE #define UNDECIDED_16_LE UTF16_LE #define UNDECIDED_8 (const unsigned char*)"ISO-8859-1" #define EBCDIC (const unsigned char*)"EBCDIC-US" #define UNKNOWN (const unsigned char*)"\0" #define OTHER (const unsigned char*)"OTHER" enum encoding_priority {NOPRIO,CONTENT_TYPE,BOM,NOBOM_AUTODETECT,XML_CHARSET,META}; enum encodings {E_UCS4,E_UTF16,E_UCS4_1234,E_UCS4_4321,E_UCS4_2134,E_UCS4_3412,E_UTF16_BE,E_UTF16_LE,E_UTF8,E_UNKNOWN,E_OTHER}; #define MAX_ENTITY_SIZE 22 struct entity_conv { unsigned char* encoding; const unsigned char* autodetected; enum encoding_priority priority; unsigned short int encoding_specific;/* sub-encoding, used for ISO*/ const struct hashtable* ht; uint8_t has_bom; uint8_t enc_bytes; uint8_t bytes_read; uint8_t bom_cnt; uint32_t partial; unsigned char bom[4]; #if 0 char* buffer; char* buffer2; #endif size_t buffer_size; size_t buffer_cnt; uint8_t entity_buffcnt; char entity_buff[MAX_ENTITY_SIZE+2]; m_area_t tmp_area; m_area_t out_area; m_area_t norm_area; int msg_zero_shown; }; int init_entity_converter(struct entity_conv* conv,const unsigned char* encoding,size_t buffer_size); void process_encoding_set(struct entity_conv* conv,const unsigned char* encoding,enum encoding_priority priority); int entity_norm_done(struct entity_conv* conv); unsigned char* encoding_norm_readline(struct entity_conv* conv, FILE* stream_in, m_area_t* in_m_area, const size_t maxlen); unsigned char* entity_norm(const struct entity_conv* conv,const unsigned char* entity); int entitynorm_init(void); #endif