/*
 *  HTML Entity & Encoding normalization.
 *
 *  Copyright (C) 2006 Török Edvin <edwin@clamav.net>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 *
 */

#ifndef _ENTITIES_H
#define _ENTITIES_H
#include "cltypes.h"

#include "hashtab.h"

/*
 * Encoding names, exposed as `const unsigned char *` byte strings.
 *
 * Every replacement list is fully parenthesized.  Without the outer
 * parentheses a postfix operator applied to a macro binds to the string
 * literal before the cast is applied: e.g. UTF8[0] would expand to
 * (const unsigned char*)("UTF-8"[0]), i.e. an integer converted to a pointer,
 * instead of the first byte of the name.
 *
 * The numeric suffixes presumably denote the byte order of a 32-bit unit
 * (1234 maps to the "LE" name and 4321 to the "BE" name below) -- TODO confirm
 * the intent behind the 2143 / 3412 names.
 */
#define UCS4_1234 ((const unsigned char *)"UCS-4LE")
#define UCS4_4321 ((const unsigned char *)"UCS-4BE")
#define UCS4_2143 ((const unsigned char *)"UCS4")
#define UCS4_3412 ((const unsigned char *)"UCS-4")
#define UTF16_BE  ((const unsigned char *)"UTF-16BE")
#define UTF16_LE  ((const unsigned char *)"UTF-16LE")
#define UTF8      ((const unsigned char *)"UTF-8")

/* UNDECIDED_* names: plain aliases of the encoding names above. */
#define UNDECIDED_32_1234 UCS4_1234
#define UNDECIDED_32_4321 UCS4_4321
#define UNDECIDED_32_2143 UCS4_2143
#define UNDECIDED_32_3412 UCS4_3412
#define UNDECIDED_16_BE UTF16_BE
#define UNDECIDED_16_LE UTF16_LE
#define UNDECIDED_8 ((const unsigned char *)"ISO-8859-1")

#define EBCDIC ((const unsigned char *)"EBCDIC-US")
/* The literal holds an explicit NUL, so it reads as an empty C string. */
#define UNKNOWN ((const unsigned char *)"\0")
#define OTHER   ((const unsigned char *)"OTHER")

/*
 * Priority tag that accompanies an encoding name (see the `priority`
 * parameter of process_encoding_set() and the `priority` field of
 * struct entity_conv).
 *
 * NOTE(review): the enumerator names suggest the place an encoding
 * declaration was found; whether a larger value overrides a smaller one is
 * decided by the implementation, not by this header -- confirm there.
 */
enum encoding_priority {
	NOPRIO           = 0,
	CONTENT_TYPE     = 1,
	BOM              = 2,
	NOBOM_AUTODETECT = 3,
	XML_CHARSET      = 4,
	META             = 5
};

/*
 * Numeric identifiers for encodings.
 *
 * NOTE(review): E_UCS4_2134 does not match the digit order of the UCS4_2143
 * name macro defined earlier in this header; the identifier is kept as-is
 * because other files may reference it -- confirm whether it is a typo.
 */
enum encodings {
	E_UCS4      = 0,
	E_UTF16     = 1,
	E_UCS4_1234 = 2,
	E_UCS4_4321 = 3,
	E_UCS4_2134 = 4,
	E_UCS4_3412 = 5,
	E_UTF16_BE  = 6,
	E_UTF16_LE  = 7,
	E_UTF8      = 8,
	E_UNKNOWN   = 9,
	E_OTHER     = 10
};
/*
 * Size constant for entity text: struct entity_conv declares its entity_buff
 * array with MAX_ENTITY_SIZE+2 bytes.
 * NOTE(review): presumably the longest entity name handled -- confirm.
 */
#define MAX_ENTITY_SIZE 22

/*
 * State object handed to the converter functions declared in this header
 * (init_entity_converter(), process_encoding_set(), encoding_norm_readline(),
 * entity_norm(), entity_norm_done()).
 *
 * NOTE(review): the per-field remarks marked "presumably" are inferred from
 * the field names only; the code that reads and writes them is not part of
 * this header -- confirm against the implementation.
 */
struct entity_conv {
	/* Encoding name and its bookkeeping. */
	unsigned char *encoding;            /* mutable encoding name */
	const unsigned char *autodetected;  /* presumably one of the encoding-name macros */
	enum encoding_priority priority;    /* priority tag stored with the encoding */
	unsigned short encoding_specific;   /* sub-encoding, used for ISO */
	const struct hashtable *ht;         /* hashtable (see hashtab.h); contents not visible here */

	/* Byte-level state; presumably BOM detection and partial characters. */
	uint8_t has_bom;
	uint8_t enc_bytes;
	uint8_t bytes_read;
	uint8_t bom_cnt;
	uint32_t partial;
	unsigned char bom[4];
#if 0
	/* Disabled: these members are not compiled into the struct. */
	char *buffer;
	char *buffer2;
#endif
	size_t buffer_size;
	size_t buffer_cnt;

	/* Entity text buffer; capacity is MAX_ENTITY_SIZE plus two extra bytes. */
	uint8_t entity_buffcnt;
	char entity_buff[MAX_ENTITY_SIZE + 2];

	/* Work areas (m_area_t is declared outside this header). */
	m_area_t tmp_area;
	m_area_t out_area;
	m_area_t norm_area;

	int msg_zero_shown;                 /* presumably a "message already shown" flag */
};


/*
 * Converter API.  Only the signatures are declared here.
 * NOTE(review): return-value conventions and buffer ownership are defined by
 * the implementation and are not visible in this header -- confirm there
 * before relying on them.  FILE and size_t must be declared before this point
 * (presumably via cltypes.h / hashtab.h -- confirm).
 */

/* Set up `conv` for the given encoding name and buffer size; returns an int status. */
int init_entity_converter(struct entity_conv *conv,
                          const unsigned char *encoding,
                          size_t buffer_size);

/* Pass an encoding name together with its priority tag to `conv`. */
void process_encoding_set(struct entity_conv *conv,
                          const unsigned char *encoding,
                          enum encoding_priority priority);

/* Counterpart of init_entity_converter() (presumably releases `conv` state); returns an int status. */
int entity_norm_done(struct entity_conv *conv);

/*
 * Line reader taking a stream and/or an m_area_t input plus a maximum length;
 * returns a pointer to unsigned char data.
 */
unsigned char *encoding_norm_readline(struct entity_conv *conv,
                                      FILE *stream_in,
                                      m_area_t *in_m_area,
                                      size_t maxlen);

/* Normalize one entity string using `conv` (read-only); returns a pointer to unsigned char data. */
unsigned char *entity_norm(const struct entity_conv *conv,
                           const unsigned char *entity);

/* Module-level initialization taking no arguments; returns an int status. */
int entitynorm_init(void);

#endif