libclamav/htmlnorm.c
888f5794
 /*
6289eda8
  *  Copyright (C) 2015, 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
ac1d2fba
  *  Copyright (C) 2007-2013 Sourcefire, Inc.
e57fa318
  *
2023340a
  *  Authors: Trog
6289eda8
  * 
  *  Summary: Normalise HTML text. Decode MS Script Encoder protection. 
  *           The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
888f5794
  *
  *  This program is free software; you can redistribute it and/or modify
2023340a
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
888f5794
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
48b7b4a7
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
888f5794
  */
 
b58fdfc2
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
888f5794
 #include <stdio.h>
b58fdfc2
 #ifdef	HAVE_UNISTD_H
888f5794
 #include <unistd.h>
b58fdfc2
 #endif
888f5794
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
c7029064
 #ifdef HAVE_STRINGS_H
 #include <strings.h>
 #endif
e57fa318
 #include <string.h>
 #include <errno.h>
 #include <stdio.h>
a58f7bc5
 #include <ctype.h>
e57fa318
 
60d8d2c3
 #include "clamav.h"
ba65fdc8
 #include "fmap.h"
888f5794
 #include "others.h"
e57fa318
 #include "htmlnorm.h"
888f5794
 
3506ac49
 #include "entconv.h"
8be1d5a4
 #include "jsparse/js-norm.h"
333d724d
 
e57fa318
 /* Capacity (excluding the terminator) of the tag, attribute and entity
  * scratch buffers used by the normaliser. */
 #define HTML_STR_LENGTH 1024
 /* Capacity (excluding the terminator) of struct tag_contents. */
 #define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
888f5794
 
e57fa318
 /* States of the HTML normaliser's state machine.
  * The enumerator order (and therefore every numeric value) is unchanged. */
 typedef enum {
     HTML_BAD_STATE,         /* engine error: the parser aborts when it sees this */
     HTML_NORM,              /* plain document text outside of any tag */
     HTML_8BIT,              /* accumulating bytes >= 0x80 of a multi-byte sequence */
     HTML_COMMENT,
     HTML_CHAR_REF,
     HTML_ENTITY_REF_DECODE,
     HTML_SKIP_WS,           /* skip whitespace, then switch to the pending next state */
     HTML_TRIM_WS,           /* like HTML_SKIP_WS, but a single space is emitted */
     HTML_TAG,               /* reading a tag name */
     HTML_TAG_ARG,           /* reading an attribute name */
     HTML_TAG_ARG_VAL,
     HTML_TAG_ARG_EQUAL,     /* after an attribute name, looking for '=' */
     HTML_PROCESS_TAG,
     HTML_CHAR_REF_DECODE,
     HTML_LOOKFOR_SCRENC,
     HTML_JSDECODE,
     HTML_JSDECODE_LENGTH,
     HTML_JSDECODE_DECRYPT,
     HTML_SPECIAL_CHAR,
     HTML_RFC2397_TYPE,
     HTML_RFC2397_INIT,
     HTML_RFC2397_DATA,
     HTML_RFC2397_FINISH,
     HTML_RFC2397_ESC,
     HTML_ESCAPE_CHAR
 } html_state;
 
 /* Quoting style of the attribute value currently being read. */
 typedef enum {
     SINGLE_QUOTED,
     DOUBLE_QUOTED,
     NOT_QUOTED
 } quoted_state;
 
 
 /* Size in bytes of the staging buffer of every output file. */
 #define HTML_FILE_BUFF_LEN 8192

 /* Buffered writer for one output file: bytes are collected in `buffer`
  * and written to `fd` when the buffer is flushed. */
 typedef struct file_buff_tag {
     int           fd;                         /* descriptor the buffer is flushed to */
     unsigned char buffer[HTML_FILE_BUFF_LEN]; /* bytes not yet written */
     int           length;                     /* number of pending bytes in buffer */
 } file_buff_t;
 
7d4b5f16
 struct tag_contents {
 	size_t pos;
08402afa
 	unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1];
7d4b5f16
 };
 
e57fa318
 /* Base64 decoding table indexed by byte value: 'A'-'Z' map to 0-25,
  * 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and '/' to 63.
  * Every other byte maps to -1.  screnc_decode() uses it to decode the
  * checksum that follows an encoded script block. */
 static const int base64_chars[256] = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
     52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
     -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
     15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
     -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
     41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
 };
 
 /* 64-entry schedule telling screnc_decode() which of the three
  * decrypt_tables rows to use for each successive decoded byte; the
  * decoder's position wraps around modulo 64.  The entries are written
  * with a leading zero (octal literals) but only take the values 0, 1, 2. */
 int table_order[] = {
        00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
        00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
        00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
        00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02
 };
888f5794
 
e57fa318
 /* Three substitution tables used by screnc_decode(): the row is chosen
  * through table_order, the column is the encoded byte (only bytes below
  * 0x80 are looked up).  The value 0xFF (found at index 0x40, '@') is not
  * a decoded byte: the decoder treats it as the start of a two-byte
  * escape sequence. */
 int decrypt_tables[3][128] = {
       {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
        0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
        0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
        0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
        0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
        0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},
 
       {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
        0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
        0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
        0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
        0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
        0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},
 
       {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
        0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
        0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
        0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
        0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
        0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}
 };
 
a6de01aa
 /*
  * Find the position just after the last whitespace byte of chunk[0..len).
  *
  * Returns the shortened length (index one past the last whitespace byte)
  * when whitespace is found at index 1 or later.  Returns the original
  * @len otherwise, i.e. when there is nothing sensible to rewind to.
  *
  * An empty or NULL chunk yields @len unchanged; the bounds test is made
  * before the byte is read so that chunk[len - 1] is never evaluated for
  * len == 0.
  */
 static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
 {
     unsigned int pos = len;

     if (!chunk || len == 0) {
         return len;
     }
     while (pos > 1 && !isspace(chunk[pos - 1])) {
         pos--;
     }
     /* reaching the first byte means no usable whitespace was found */
     return (pos == 1) ? len : pos;
 }
888f5794
 
a6de01aa
 /*
  * Read at most @max_len - 1 bytes from @m_area (preferred when non-NULL)
  * or from @stream, dropping NUL bytes, and return them as a freshly
  * allocated, NUL-terminated buffer that the caller must free().
  * This used to be called cli_readline, but we don't stop at end-of-line
  * anymore.
  *
  * When the data had to be cut because the buffer filled up, the chunk is
  * shortened to end just after its last whitespace byte and the input
  * position is moved back accordingly, so a word is not split between two
  * chunks.
  *
  * Returns NULL on allocation failure, when no input source is usable,
  * when no more data is available, or when @max_len is smaller than 2.
  */
 static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
 {
     unsigned char *chunk, *start, *ptr, *end;
     unsigned int chunk_len, count;

     /* we need room for one data byte plus the terminator; this also
      * keeps max_len - 1 below from wrapping around */
     if (max_len < 2) {
         return NULL;
     }

     chunk = (unsigned char *)cli_malloc(max_len);
     if (!chunk) {
         cli_errmsg("readchunk: Unable to allocate memory for chunk\n");
         return NULL;
     }

     /* Try and use the memory buffer first */
     if (m_area) {
         /* maximum we can copy into the buffer,
          * we could have less than max_len bytes available */
         chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);
         if (!chunk_len) {
             free(chunk);
             return NULL;
         }
         if (m_area->map) {
             ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);
         } else {
             ptr = m_area->buffer + m_area->offset;
         }
         start = ptr;
         /* check for NULL before doing pointer arithmetic on it */
         if (!start) {
             free(chunk);
             return NULL;
         }
         end = start - m_area->offset + m_area->length;
         if (start >= end) {
             free(chunk);
             return NULL;
         }

         /* look for NULL chars */
         ptr = memchr(start, 0, chunk_len);
         if (!ptr) {
             /* no NULL chars found, copy all */
             memcpy(chunk, start, chunk_len);
             chunk[chunk_len] = '\0';
             m_area->offset += chunk_len;
             /* point ptr to end of chunk,
              * so we can check and rewind to a space below */
             ptr = start + chunk_len;
         } else {
             /* copy portion that doesn't contain NULL chars */
             chunk_len = ptr - start;
             if (chunk_len < max_len) {
                 memcpy(chunk, start, chunk_len);
             } else {
                 chunk_len = 0;
                 ptr       = start;
             }
             if (m_area->map) {
                 ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);
             }
             if (!ptr) {
                 cli_warnmsg("fmap inconsistency\n");
                 ptr = end;
             }
             /* we have unknown number of NULL chars,
              * copy char-by-char and skip them */
             while ((ptr < end) && (chunk_len < max_len - 1)) {
                 const unsigned char c = *ptr++;
                 /* we can't use chunk_len to determine how many bytes we read, since
                  * we skipped chars */
                 if (c) {
                     chunk[chunk_len++] = c;
                 }
             }
             m_area->offset += ptr - start;
             chunk[chunk_len] = '\0';
         }
         if (ptr && ptr < end && !isspace(*ptr)) {
             /* we hit max_len, rewind to a space */
             count = rewind_tospace(chunk, chunk_len);
             if (count < chunk_len) {
                 chunk[count] = '\0';
                 m_area->offset -= chunk_len - count;
             }
         }
     } else {
         if (!stream) {
             cli_dbgmsg("No HTML stream\n");
             free(chunk);
             return NULL;
         }
         chunk_len = fread(chunk, 1, max_len - 1, stream);
         if (!chunk_len || chunk_len > max_len - 1) {
             /* EOF, or prevent overflow */
             free(chunk);
             return NULL;
         }

         /* Look for NULL chars */
         ptr = memchr(chunk, 0, chunk_len);
         if (ptr) {
             /* NULL char found */
             /* save buffer limits */
             start = ptr;
             end   = chunk + chunk_len;

             /* start of NULL chars, we will copy non-NULL characters
              * to this position */
             chunk_len = ptr - chunk;

             /* find first non-NULL char */
             while ((ptr < end) && !(*ptr)) {
                 ptr++;
             }
             /* skip over NULL chars, and move back the rest */
             while ((ptr < end) && (chunk_len < max_len - 1)) {
                 const unsigned char c = *ptr++;
                 if (c) {
                     chunk[chunk_len++] = c;
                 }
             }
         }
         chunk[chunk_len] = '\0';
         if (chunk_len == max_len - 1) {
             /* rewind to a space (which includes newline) */
             count = rewind_tospace(chunk, chunk_len);
             if (count < chunk_len) {
                 /* seek-back to space; only cut the chunk when the stream
                  * really moved back, otherwise the tail would be lost */
                 if (fseek(stream, -(long)(chunk_len - count), SEEK_CUR) == 0) {
                     chunk[count] = '\0';
                 }
             }
         }
     }

     return chunk;
 }
 
e57fa318
 /*
  * Write the pending bytes of @fbuff to its descriptor and mark the
  * buffer empty.  Does nothing for a NULL buffer or when nothing is
  * pending.  The result of the write is not checked (best effort).
  */
 static void html_output_flush(file_buff_t *fbuff)
 {
     if (!fbuff || fbuff->length <= 0) {
         return;
     }
     cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
     fbuff->length = 0;
 }
 
d5a5fef9
 /*
  * Append the single byte @c to @fbuff1, flushing first when the buffer
  * is full.  A NULL buffer is silently ignored.
  */
 static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)
 {
     if (!fbuff1) {
         return;
     }
     if (fbuff1->length == HTML_FILE_BUFF_LEN) {
         html_output_flush(fbuff1);
     }
     fbuff1->buffer[fbuff1->length++] = c;
 }
 
5cd3f734
 /*
  * Append @len bytes starting at @str to @fbuff.  A NULL buffer is
  * silently ignored.
  *
  * The pending bytes are flushed first when they would not fit together
  * with @str.  A string that is at least as large as the whole buffer is
  * written directly to the descriptor instead of being staged.
  */
 static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len)
 {
     if (!fbuff) {
         return;
     }
     if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
         html_output_flush(fbuff);
     }
     if (len >= HTML_FILE_BUFF_LEN) {
         /* the buffer was flushed just above, so ordering is preserved */
         cli_writen(fbuff->fd, str, len);
     } else {
         memcpy(fbuff->buffer + fbuff->length, str, len);
         fbuff->length += len;
     }
 }
 
fc83da82
 /*
  * Look up the attribute named @tag in @tags.
  *
  * Returns the stored value of the first matching attribute (which may be
  * NULL when the attribute has no value), or NULL when no attribute of
  * that name exists.  The returned pointer is owned by @tags.
  */
 static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
 {
     int i;

     for (i = 0; i < tags->count; i++) {
         /* a name can be NULL when its duplication failed; skip it */
         if (tags->tag[i] && strcmp((const char *)tags->tag[i], tag) == 0) {
             return (char *)tags->value[i];
         }
     }
     return NULL;
 }
 
fc83da82
 /*
  * Replace the value of the first attribute named @tag in @tags with a
  * copy of @value; the previous value is freed.  Nothing happens when no
  * attribute of that name exists.
  */
 static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
 {
     int i;

     for (i = 0; i < tags->count; i++) {
         /* a name can be NULL when its duplication failed; skip it */
         if (!tags->tag[i] || strcmp((const char *)tags->tag[i], tag) != 0) {
             continue;
         }
         free(tags->value[i]);
         tags->value[i] = (unsigned char *)cli_strdup(value);
         return;
     }
 }
f2b71eb9
 /*
  * Append the attribute @tag with the optional @value to @tags.
  *
  * @tag is copied.  @value may be NULL (attribute without value); when it
  * starts with a double quote, the leading quote and the last character
  * of the value are dropped from the stored copy.  When tags->scanContents
  * is set, the contents array is grown as well and the new slot is
  * initialised to NULL.
  *
  * On failure to grow any of the arrays the whole list is released and
  * reset to empty (count 0).  When only the copy of @tag cannot be
  * allocated, the new attribute is dropped and the list is left as it was.
  */
 void html_tag_arg_add(tag_arguments_t *tags,
         const char *tag, char *value)
 {
     int len, i;
     unsigned char *name, *copy = NULL;

     tags->count++;
     tags->tag = (unsigned char **)cli_realloc2(tags->tag,
                 tags->count * sizeof(char *));
     if (!tags->tag) {
         goto abort;
     }
     tags->value = (unsigned char **)cli_realloc2(tags->value,
                 tags->count * sizeof(char *));
     if (!tags->value) {
         goto abort;
     }
     if (tags->scanContents) {
         tags->contents = (unsigned char **)cli_realloc2(tags->contents,
                 tags->count * sizeof(*tags->contents));
         if (!tags->contents) {
             goto abort;
         }
         tags->contents[tags->count - 1] = NULL;
     }

     name = (unsigned char *)cli_strdup(tag);
     if (!name) {
         /* never store a NULL name: consumers pass it to strcmp()/strlen().
          * The arrays stay one slot larger than needed, which is harmless. */
         tags->count--;
         return;
     }
     tags->tag[tags->count - 1] = name;

     if (value) {
         if (*value == '"') {
             copy = (unsigned char *)cli_strdup(value + 1);
             len  = strlen((const char *)value + 1);
             /* strip the closing quote, but only if the copy exists */
             if (copy && len > 0) {
                 copy[len - 1] = '\0';
             }
         } else {
             copy = (unsigned char *)cli_strdup(value);
         }
     }
     tags->value[tags->count - 1] = copy;
     return;

 abort:
     /* Bad error - can't do 100% recovery */
     tags->count--;
     for (i = 0; i < tags->count; i++) {
         if (tags->tag) {
             free(tags->tag[i]);
         }
         if (tags->value) {
             free(tags->value[i]);
         }
         if (tags->contents) {
             free(tags->contents[i]);
         }
     }
     free(tags->tag);
     free(tags->value);
     free(tags->contents);
     tags->contents = NULL;
     tags->tag = tags->value = NULL;
     tags->count = 0;
     return;
 }
 
 /*
  * Write the tag @tag with all attributes of @tags to @fbuff in the form
  * <tag name="value" ...>.  Attribute values are lower-cased and always
  * wrapped in double quotes; attributes without a value are written as a
  * bare name.
  */
 static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
 {
     int i, j, len;

     html_output_c(fbuff, '<');
     html_output_str(fbuff, (const unsigned char *)tag, strlen(tag));
     for (i = 0; i < tags->count; i++) {
         const unsigned char *name = tags->tag[i];
         const unsigned char *val  = tags->value[i];

         html_output_c(fbuff, ' ');
         /* a name can be NULL when its duplication failed */
         if (name) {
             html_output_str(fbuff, name, strlen((const char *)name));
         }
         if (!val) {
             continue;
         }
         html_output_str(fbuff, (const unsigned char *)"=\"", 2);
         len = strlen((const char *)val);
         for (j = 0; j < len; j++) {
             html_output_c(fbuff, tolower(val[j]));
         }
         html_output_c(fbuff, '"');
     }
     html_output_c(fbuff, '>');
 }
 
 /*
  * Release every attribute name, value and contents string stored in
  * @tags together with the arrays holding them, and reset @tags to an
  * empty list.  The tag_arguments_t object itself is not freed.
  */
 void html_tag_arg_free(tag_arguments_t *tags)
 {
     int i;

     for (i = 0; i < tags->count; i++) {
         /* guard each array: it may be missing even if count is non-zero */
         if (tags->tag) {
             free(tags->tag[i]);
         }
         if (tags->value) {
             free(tags->value[i]);
         }
         if (tags->contents) {
             free(tags->contents[i]);
         }
     }
     free(tags->tag);
     free(tags->value);
     free(tags->contents);
     tags->contents = NULL;
     tags->tag = tags->value = NULL;
     tags->count = 0;
 }
 
333d724d
 /**
  * the displayed text for an <a href> tag
  */
7d4b5f16
 static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end)
333d724d
 {
7d4b5f16
 	size_t i;
13bfb273
         uint32_t mbchar = 0;
7d4b5f16
 	if(!begin || !end)
 		return;
 	for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) {
13bfb273
             uint8_t c = *begin++;
             if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
                 if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
                     || mbchar == 0xEFB992 ||
3afedd07
                     (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) {
13bfb273
                     cont->contents[i++] = '.';
4522746e
                     if (mbchar == 0xA1) {
                         --i;
                         mbchar = 0;
                         continue;
                     }
13bfb273
                 } else {
                     uint8_t c0 = mbchar >> 16;
                     uint8_t c1 = (mbchar >> 8)&0xff;
                     uint8_t c2 = (mbchar & 0xff);
                     if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH)
                         cont->contents[i++] = c0;
                     if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH)
                         cont->contents[i++] = c1;
                     if (i+1 < MAX_TAG_CONTENTS_LENGTH)
                         cont->contents[i++] = c2;
                 }
                 mbchar = 0;
             }
             if (c >= 0x80) {
                 mbchar = (mbchar << 8) | c;
                 --i;
             }
             else
 		cont->contents[i] = c;
333d724d
 	}
7d4b5f16
 	cont->pos = i;
333d724d
 }
 
 
7d4b5f16
 /*
  * Finish the contents collected in @cont: NUL-terminate them, store a
  * heap copy in tags->contents[idx - 1] (@idx is 1-based) and reset
  * @cont for the next tag.
  *
  * When the copy cannot be allocated an error is logged, nothing is
  * stored and the collected text is discarded.
  */
 static inline void html_tag_contents_done(tag_arguments_t *tags,int idx, struct tag_contents *cont)
 {
     unsigned char *p;

     /* keep the terminator inside contents[], whatever pos holds */
     if (cont->pos > MAX_TAG_CONTENTS_LENGTH) {
         cont->pos = MAX_TAG_CONTENTS_LENGTH;
     }
     cont->contents[cont->pos++] = '\0';
     p = cli_malloc(cont->pos);
     if (!p) {
         cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n");
         /* do not carry this tag's text (and its NUL) over to the next one */
         cont->pos = 0;
         return;
     }
     memcpy(p, cont->contents, cont->pos);
     tags->contents[idx - 1] = p;
     cont->pos = 0;
 }
 
a5f19645
 /* Decoder state of screnc_decode(), kept by the caller between calls. */
 struct screnc_state {
     uint32_t length;    /* encoded bytes still to be decoded */
     uint32_t sum;       /* running sum of the decoded byte values */
     uint8_t  table_pos; /* current position in table_order */
 };
 
 /* inplace decoding, so that we can normalize it later */
745d4b38
 static void screnc_decode(unsigned char *ptr, struct screnc_state *s)
a5f19645
 {
 	uint8_t  value;
 	unsigned char *dst = ptr;
 
 	if(!ptr || !s)
 		return;
 	while(s->length > 0 && *ptr) {
 		if ((*ptr == '\n') || (*ptr == '\r')) {
 			ptr++;
 			continue;
 		}
 		if (*ptr < 0x80) {
 			value = decrypt_tables[table_order[s->table_pos]][*ptr];
 			if (value == 0xFF) { /* special character */
 				ptr++;
 				s->length--;
 				switch (*ptr) {
 					case '\0':
 						/* Fixup for end of line */
 						ptr--;
 						break;
 					case 0x21:
 						value = 0x3c;
 						break;
 					case 0x23:
 						value = 0x0d;
 						break;
 					case 0x24:
 						value = 0x40;
 						break;
 					case 0x26:
 						value = 0x0a;
 						break;
 					case 0x2a:
 						value = 0x3e;
 						break;
 				}
 			}
 			s->sum += value;
 			*dst++ = value;
 			s->table_pos = (s->table_pos + 1) % 64;
 		} else {
 			*dst++ = *ptr++;
 			*dst++ = *ptr;
12f43117
 			if (!*ptr) {
 				dst--;
d9282b97
 				break;
12f43117
 			}
a5f19645
 		}
 		ptr++;
 		s->length--;
 	}
 	if(!s->length) {
 		size_t remaining;
b9b47784
 		if(strlen((const char*)ptr) >= 12) {
a5f19645
 			uint32_t expected;
 			expected = base64_chars[ptr[0]] << 2;
 			expected += base64_chars[ptr[1]] >> 4;
 			expected += (base64_chars[ptr[1]] & 0x0f) << 12;
 			expected += (base64_chars[ptr[2]] >> 2) << 8;
 			expected += (base64_chars[ptr[2]] & 0x03) << 22;
 			expected += base64_chars[ptr[3]] << 16;
 			expected += (base64_chars[ptr[4]] << 2) << 24;
 			expected += (base64_chars[ptr[5]] >> 4) << 24;
 			ptr += 8;
 			if(s->sum != expected) {
5cd3f734
 				cli_dbgmsg("screnc_decode: checksum mismatch: %u != %u\n", s->sum, expected);
a5f19645
 			} else {
b9b47784
 				if(strncmp((const char*)ptr, "^#~@", 4) != 0) {
a5f19645
 					cli_dbgmsg("screnc_decode: terminator not found\n");
 				} else {
 					cli_dbgmsg("screnc_decode: OK\n");
 				}
 			}
 			ptr += 4;
 		}
 		/* copy remaining */
b9b47784
 		remaining = strlen((const char*)ptr) + 1;
a5f19645
 		memmove(dst, ptr, remaining);
 	} else {
12f43117
 		*dst = '\0';
a5f19645
 	}
 }
 
b9b47784
 /*
  * Hand one piece of script text to the JavaScript normaliser.
  *
  * The piece is [js_begin, js_end); a NULL @js_begin stands for @line and
  * a NULL @js_end for @ptr.  It is only passed on when it is non-empty
  * and both ends lie within the 8192 bytes starting at @line.
  *
  * When @in_script is zero (the end of the script was reached), parsing
  * is finished, the result is written below @dirname and @js_state is
  * destroyed.
  */
 static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end,
         const unsigned char *line, const unsigned char *ptr, int in_script, const char *dirname)
 {
     const unsigned char *from = js_begin ? js_begin : line;
     const unsigned char *to   = js_end ? js_end : ptr;

     if (to > from &&
             CLI_ISCONTAINED(line, 8192, from, 1) &&
             CLI_ISCONTAINED(line, 8192, to, 1)) {
         cli_js_process_buffer(js_state, (const char *)from, to - from);
     }
     if (in_script) {
         return;
     }
     /*  we found a /script, normalize script now */
     cli_js_parse_done(js_state);
     cli_js_output(js_state, dirname);
     cli_js_destroy(js_state);
 }
 
462e8e5e
 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
888f5794
 {
b9b47784
 	int fd_tmp, tag_length = 0, tag_arg_length = 0, binary;
 	int retval=FALSE, escape=FALSE, value = 0, hex=FALSE, tag_val_length=0;
7959343d
 	int look_for_screnc=FALSE, in_screnc=FALSE,in_script=FALSE, text_space_written=FALSE;
5e2a487c
 	FILE *stream_in = NULL;
a5f19645
 	html_state state=HTML_NORM, next_state=HTML_BAD_STATE, saved_next_state=HTML_BAD_STATE;
e57fa318
 	char filename[1024], tag[HTML_STR_LENGTH+1], tag_arg[HTML_STR_LENGTH+1];
b9b47784
 	char tag_val[HTML_STR_LENGTH+1], *tmp_file, *arg_value;
1eed8302
 	unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
e57fa318
 	tag_arguments_t tag_args;
b9b47784
 	quoted_state quoted = NOT_QUOTED;
 	unsigned long length = 0;
a5f19645
 	struct screnc_state screnc_state;
d5a5fef9
 	file_buff_t *file_buff_o2, *file_buff_text;
b9b47784
 	file_buff_t *file_tmp_o1 = NULL;
333d724d
 	int in_ahref=0;/* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
 	unsigned char* href_contents_begin=NULL;/*beginning of the next portion of <a> contents*/
 	unsigned char* ptrend=NULL;/*end of <a> contents*/
 	unsigned char* in_form_action = NULL;/* the action URL of the current <form> tag, if any*/
462e8e5e
 
3506ac49
 	struct entity_conv conv;
 	unsigned char entity_val[HTML_STR_LENGTH+1];
 	size_t entity_val_length = 0;
1f95d81c
 	const int dconf_entconv = dconf ? dconf->phishing&PHISHING_CONF_ENTCONV : 1;
 	const int dconf_js = dirname && (dconf ? dconf->doc&DOC_CONF_JSNORM : 1); /* TODO */
462e8e5e
 	/* dconf for phishing engine sets scanContents, so no need for a flag here */
8be1d5a4
 	struct parser_state *js_state = NULL;
5be3029f
 	const unsigned char *js_begin = NULL, *js_end = NULL;
7d4b5f16
 	struct tag_contents contents;
13bfb273
         uint32_t mbchar = 0;
4522746e
         uint32_t mbchar2 = 0;
a92110df
 
462e8e5e
 	tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
7d4b5f16
 	contents.pos = 0;
e57fa318
 	if (!m_area) {
 		if (fd < 0) {
 			cli_dbgmsg("Invalid HTML fd\n");
 			return FALSE;
 		}
 		lseek(fd, 0, SEEK_SET);	
 		fd_tmp = dup(fd);
 		if (fd_tmp < 0) {
 			return FALSE;
 		}
 		stream_in = fdopen(fd_tmp, "r");
 		if (!stream_in) {
 			close(fd_tmp);
 			return FALSE;
 		}
888f5794
 	}
ea04d2de
 
 	tag_args.count = 0;
 	tag_args.tag = NULL;
 	tag_args.value = NULL;
333d724d
 	tag_args.contents = NULL;
e57fa318
 	if (dirname) {
58481352
 		snprintf(filename, 1024, "%s"PATHSEP"rfc2397", dirname);
3506ac49
 		if (mkdir(filename, 0700) && errno != EEXIST) {
d5a5fef9
 			file_buff_o2 = file_buff_text = NULL;
ea04d2de
 			goto abort;
 		}
4e1127c5
 
5e733972
 		file_buff_o2 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
ea04d2de
 		if (!file_buff_o2) {
241e7eb1
             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n");
d5a5fef9
 			file_buff_o2 = file_buff_text = NULL;
ea04d2de
 			goto abort;
e57fa318
 		}
 
d5a5fef9
 		/* this will still contains scripts that are inside comments */
58481352
 		snprintf(filename, 1024, "%s"PATHSEP"nocomment.html", dirname);
e3a6f061
 		file_buff_o2->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
72ce4b70
 		if (file_buff_o2->fd == -1) {
e57fa318
 			cli_dbgmsg("open failed: %s\n", filename);
5e733972
 			free(file_buff_o2);
d5a5fef9
 			file_buff_o2 = file_buff_text = NULL;
ea04d2de
 			goto abort;
e57fa318
 		}
 
0664128a
 		file_buff_text = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
 		if(!file_buff_text) {
72ce4b70
 			close(file_buff_o2->fd);
0664128a
 			free(file_buff_o2);
d5a5fef9
 			file_buff_o2 = file_buff_text = NULL;
241e7eb1
             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n");
0664128a
 			goto abort;
 		}
d5a5fef9
 
58481352
 		snprintf(filename, 1024, "%s"PATHSEP"notags.html", dirname);
0664128a
 		file_buff_text->fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
72ce4b70
 		if(file_buff_text->fd == -1) {
0664128a
 			cli_dbgmsg("open failed: %s\n", filename);
 			close(file_buff_o2->fd);
 			free(file_buff_o2);
 			free(file_buff_text);
d5a5fef9
 			file_buff_o2 = file_buff_text = NULL;
72ce4b70
 			goto abort;
0664128a
 		}
5e733972
 		file_buff_o2->length = 0;
0664128a
 		file_buff_text->length = 0;
e57fa318
 	} else {
 		file_buff_o2 = NULL;
0664128a
 		file_buff_text = NULL;
888f5794
 	}
4e1127c5
 
a92110df
 	binary = FALSE;
3506ac49
 
b3fc7f97
 	ptr = line = cli_readchunk(stream_in, m_area, 8192);
3506ac49
 
888f5794
 	while (line) {
333d724d
 		if(href_contents_begin)
 			href_contents_begin=ptr;/*start of a new line, last line already appended to contents see below*/
e57fa318
 		while (*ptr && isspace(*ptr)) {
 			ptr++;
888f5794
 		}
e57fa318
 		while (*ptr) {
a92110df
 			if (!binary && *ptr == '\n') {
ee4e852a
 				/* Convert it to a space and re-process */
 				*ptr = ' ';
e57fa318
 				continue;
 			}
a92110df
 			if (!binary && *ptr == '\r') {
e57fa318
 				ptr++;
 				continue;
 			}
 			switch (state) {
fc83da82
 			case HTML_SPECIAL_CHAR:
 				cli_dbgmsg("Impossible, special_char can't occur here\n");
 				break;
e57fa318
 			case HTML_BAD_STATE:
 				/* An engine error has occurred */
 				cli_dbgmsg("HTML Engine Error\n");
 				goto abort;
 			case HTML_SKIP_WS:
 				if (isspace(*ptr)) {
 					ptr++;
 				} else {
 					state = next_state;
 					next_state = HTML_BAD_STATE;
 				}
 				break;
 			case HTML_TRIM_WS:
 				if (isspace(*ptr)) {
 					ptr++;
 				} else {
22b961c2
 					if(!in_script)
 						html_output_c(file_buff_o2, ' ');
e57fa318
 					state = next_state;
 					next_state = HTML_BAD_STATE;
 				}
 				break;
13bfb273
                         case HTML_8BIT:
                                 if (*ptr < 0x80 || mbchar >= 0x10000) {
                                     if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
                                         || mbchar == 0xEFB992 ||
3afedd07
                                         (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
13bfb273
                                         /* bb #4097 */
                                         html_output_c(file_buff_o2, '.');
                                         html_output_c(file_buff_text, '.');
4522746e
                                         if (mbchar == 0xA1) {
                                             ptr++;
                                             mbchar = 0;
                                             continue;
                                         }
13bfb273
                                     } else {
                                         uint8_t c0 = mbchar >> 16;
                                         uint8_t c1 = (mbchar >> 8)&0xff;
                                         uint8_t c2 = (mbchar & 0xff);
                                         if (c0) {
                                             html_output_c(file_buff_o2, c0);
                                             html_output_c(file_buff_text, c0);
                                         }
                                         if (c0 || c1) {
                                             html_output_c(file_buff_o2, c1);
                                             html_output_c(file_buff_text, c1);
                                         }
                                         html_output_c(file_buff_o2, c2);
                                         html_output_c(file_buff_text, c1);
                                     }
                                     mbchar = 0;
                                     state = next_state;
                                     next_state = HTML_NORM;
                                 } else {
                                     mbchar = (mbchar << 8) | *ptr;
                                     ptr++;
                                 }
                                 break;
e57fa318
 			case HTML_NORM:
 				if (*ptr == '<') {
f74bc827
 					ptrend=ptr; /* for use by scanContents */
d5a5fef9
 					html_output_c(file_buff_o2, '<');
 					if (!in_script && !text_space_written) {
 						html_output_c(file_buff_text, ' ');
0664128a
 						text_space_written = TRUE;
e57fa318
 					}
333d724d
 					if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
 						/*append this text portion to the contents of <a>*/
7d4b5f16
 						html_tag_contents_append(&contents,href_contents_begin,ptr);
333d724d
 						href_contents_begin=NULL;/*We just encountered another tag inside <a>, so skip it*/
 					}
e57fa318
 					ptr++;
 					state = HTML_SKIP_WS;
 					tag_length=0;
 					next_state = HTML_TAG;
 				} else if (isspace(*ptr)) {
0664128a
 					if(!text_space_written && !in_script) {
d5a5fef9
 						html_output_c(file_buff_text, ' ');
0664128a
 						text_space_written = TRUE;
 					}
e57fa318
 					state = HTML_TRIM_WS;
 					next_state = HTML_NORM;
 				} else if (*ptr == '&') {
0664128a
 					if(!text_space_written && !in_script) {
d5a5fef9
 						html_output_c(file_buff_text, ' ');
0664128a
 						text_space_written = TRUE;
 					}
e57fa318
 					state = HTML_CHAR_REF;
 					next_state = HTML_NORM;
 					ptr++;
13bfb273
                                 } else if (*ptr >= 0x80) {
                                         state = HTML_8BIT;
                                         next_state = HTML_NORM;
                                         mbchar = *ptr;
                                         ptr++;
e57fa318
 				} else {
2354901f
 					unsigned char c = tolower(*ptr);
 					/* normalize ' to " for scripts */
 					if(in_script && c == '\'') c = '"';
 					html_output_c(file_buff_o2, c);
d5a5fef9
 					if (!in_script) {
22b961c2
 						if(*ptr < 0x20) {
 							if(!text_space_written) {
 								html_output_c(file_buff_text, ' ');
 								text_space_written = TRUE;
 							}
 						} else {
2354901f
 							html_output_c(file_buff_text, c);
22b961c2
 							text_space_written = FALSE;
 						}
e57fa318
 					}
 					ptr++;
 				}
 				break;
 			case HTML_TAG:
 				if ((tag_length == 0) && (*ptr == '!')) {
 					/* Comment */
 					if (in_script) {
d5a5fef9
 						/* we still write scripts to nocomment.html */
 						html_output_c(file_buff_o2, '!');
 					} else {
 						/* Need to rewind in the no-comment output stream */
 						if (file_buff_o2 && (file_buff_o2->length > 0)) {
 							file_buff_o2->length--;
 						}
e57fa318
 					}
 					state = HTML_COMMENT;
 					next_state = HTML_BAD_STATE;
 					ptr++;
 				} else if (*ptr == '>') {
d5a5fef9
 					html_output_c(file_buff_o2, '>');
e57fa318
 					ptr++;
 					tag[tag_length] = '\0';
 					state = HTML_SKIP_WS;
 					next_state = HTML_PROCESS_TAG;
 				} else if (!isspace(*ptr)) {
d5a5fef9
 					html_output_c(file_buff_o2, tolower(*ptr));
6f7c0a15
 					/* if we're inside a script we only care for </script>.*/
 					if(in_script && tag_length==0 && *ptr != '/') {
 						state = HTML_NORM;
 					}
e57fa318
 					if (tag_length < HTML_STR_LENGTH) {
 						tag[tag_length++] = tolower(*ptr);
 					}
 					ptr++;
 				}  else {
 					tag[tag_length] = '\0';
 					state = HTML_SKIP_WS;
 					tag_arg_length = 0;
6f7c0a15
 					/* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_script*/
eb0757aa
 					next_state = !in_script ? HTML_TAG_ARG : HTML_PROCESS_TAG;
e57fa318
 				}
 				break;
 			case HTML_TAG_ARG:
 				if (*ptr == '=') {
d5a5fef9
 					html_output_c(file_buff_o2, '=');
e57fa318
 					tag_arg[tag_arg_length] = '\0';
 					ptr++;
 					state = HTML_SKIP_WS;
 					escape = FALSE;
 					quoted = NOT_QUOTED;
 					tag_val_length = 0;
 					next_state = HTML_TAG_ARG_VAL;
 				} else if (isspace(*ptr)) {
 					ptr++;
 					tag_arg[tag_arg_length] = '\0';
 					state = HTML_SKIP_WS;
 					next_state = HTML_TAG_ARG_EQUAL;
 				} else if (*ptr == '>') {
d5a5fef9
 					html_output_c(file_buff_o2, '>');
e57fa318
 					if (tag_arg_length > 0) {
 						tag_arg[tag_arg_length] = '\0';
 						html_tag_arg_add(&tag_args, tag_arg, NULL);
 					}
 					ptr++;
 					state = HTML_PROCESS_TAG;
 					next_state = HTML_BAD_STATE;
 				} else {
 					if (tag_arg_length == 0) {
 						/* Start of new tag - add space */
d5a5fef9
 						html_output_c(file_buff_o2,' ');
e57fa318
 					}
d5a5fef9
 					html_output_c(file_buff_o2, tolower(*ptr));
e57fa318
 					if (tag_arg_length < HTML_STR_LENGTH) {
 						tag_arg[tag_arg_length++] = tolower(*ptr);
 					}
 					ptr++;
 				}
 				break;
 			case HTML_TAG_ARG_EQUAL:
 				if (*ptr == '=') {
d5a5fef9
 					html_output_c(file_buff_o2, '=');
e57fa318
 					ptr++;
 					state = HTML_SKIP_WS;
 					escape = FALSE;
 					quoted = NOT_QUOTED;
 					tag_val_length = 0;
 					next_state = HTML_TAG_ARG_VAL;
 				} else {
 					if (tag_arg_length > 0) {
 						tag_arg[tag_arg_length] = '\0';
 						html_tag_arg_add(&tag_args, tag_arg, NULL);
 					}
 					tag_arg_length=0;
 					state = HTML_TAG_ARG;
 					next_state = HTML_BAD_STATE;
 				}
 				break;
 			case HTML_TAG_ARG_VAL:
a92110df
 				if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
 					/* RFC2397 inline data */
 
7cd9337a
 					/* Rewind one byte so we don't recursive */
a92110df
 					if (file_buff_o2 && (file_buff_o2->length > 0)) {
 						file_buff_o2->length--;
 					}
4e1127c5
 
a92110df
 					if (quoted != NOT_QUOTED) {
d5a5fef9
 						html_output_c(file_buff_o2, '"');
a92110df
 					}
 					tag_val_length = 0;
 					state = HTML_RFC2397_TYPE;
 					next_state = HTML_TAG_ARG;
 				} else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
 					/* RFC2397 inline data */
 
7cd9337a
 					/* Rewind one byte so we don't recursive */
a92110df
 					if (file_buff_o2 && (file_buff_o2->length > 0)) {
 						file_buff_o2->length--;
 					}
4e1127c5
 
a92110df
 					if (quoted != NOT_QUOTED) {
d5a5fef9
 						html_output_c(file_buff_o2, '"');
a92110df
 					}
 
 					tag_val_length = 0;
 					state = HTML_RFC2397_TYPE;
 					next_state = HTML_TAG_ARG;
 				} else if (*ptr == '&') {
e57fa318
 					state = HTML_CHAR_REF;
 					next_state = HTML_TAG_ARG_VAL;
 					ptr++;
 				} else if (*ptr == '\'') {
 					if (tag_val_length == 0) {
 						quoted = SINGLE_QUOTED;
d5a5fef9
 						html_output_c(file_buff_o2, '"');
e57fa318
 						if (tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = '"';
 						}
 						ptr++;
 					} else {
 						if (!escape && (quoted==SINGLE_QUOTED)) {
d5a5fef9
 							html_output_c(file_buff_o2, '"');
e57fa318
 							if (tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '"';
 							}
 							tag_val[tag_val_length] = '\0';
 							html_tag_arg_add(&tag_args, tag_arg, tag_val);
 							ptr++;
 							state = HTML_SKIP_WS;
 							tag_arg_length=0;
 							next_state = HTML_TAG_ARG;
 						} else {
d5a5fef9
 							html_output_c(file_buff_o2, '"');
e57fa318
 							if (tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '"';
 							}
 							ptr++;
 						}
 					}
 				} else if (*ptr == '"') {
 					if (tag_val_length == 0) {
 						quoted = DOUBLE_QUOTED;
d5a5fef9
 						html_output_c(file_buff_o2, '"');
e57fa318
 						if (tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = '"';
 						}
 						ptr++;
 					} else {
4e1127c5
 						if (!escape && (quoted==DOUBLE_QUOTED)) {
d5a5fef9
 							html_output_c(file_buff_o2, '"');
e57fa318
 							if (tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '"';
 							}
 							tag_val[tag_val_length] = '\0';
 							html_tag_arg_add(&tag_args, tag_arg, tag_val);
 							ptr++;
 							state = HTML_SKIP_WS;
 							tag_arg_length=0;
 							next_state = HTML_TAG_ARG;
 						} else {
d5a5fef9
 							html_output_c(file_buff_o2, '"');
e57fa318
 							if (tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '"';
 							}
 							ptr++;
 						}
 					}
 				} else if (isspace(*ptr) || (*ptr == '>')) {
 					if (quoted == NOT_QUOTED) {
 						tag_val[tag_val_length] = '\0';
 						html_tag_arg_add(&tag_args, tag_arg, tag_val);
 						state = HTML_SKIP_WS;
 						tag_arg_length=0;
 						next_state = HTML_TAG_ARG;
 					} else {
d5a5fef9
 						html_output_c(file_buff_o2, *ptr);
e57fa318
 						if (tag_val_length < HTML_STR_LENGTH) {
 							if (isspace(*ptr)) {
 								tag_val[tag_val_length++] = ' ';
 							} else {
 								tag_val[tag_val_length++] = '>';
 							}
 						}
 						state = HTML_SKIP_WS;
 						escape = FALSE;
 						quoted = NOT_QUOTED;
 						next_state = HTML_TAG_ARG_VAL;
 						ptr++;
 					}
 				} else {
4522746e
                                     if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) {
                                         if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E
                                             || mbchar2 == 0xEFB992 ||
3afedd07
                                             (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
4522746e
                                             html_output_c(file_buff_o2, '.');
                                             if (tag_val_length < HTML_STR_LENGTH)
 						tag_val[tag_val_length++] = '.';
                                             if (mbchar2 == 0xA1) {
                                                 ptr++;
                                                 mbchar2 = 0;
                                                 continue;
                                             }
                                         } else {
                                             uint8_t c0 = mbchar2 >> 16;
                                             uint8_t c1 = (mbchar2 >> 8)&0xff;
                                             uint8_t c2 = (mbchar2 & 0xff);
                                             if (c0)
                                                 html_output_c(file_buff_o2, c0);
                                             if (c0 || c1)
                                                 html_output_c(file_buff_o2, c1);
                                             html_output_c(file_buff_o2, c2);
                                             if (c0 && tag_val_length < HTML_STR_LENGTH)
 						tag_val[tag_val_length++] = c0;
                                             if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH)
 						tag_val[tag_val_length++] = c1;
                                             if (tag_val_length < HTML_STR_LENGTH)
 						tag_val[tag_val_length++] = c2;
 					}
                                         mbchar2 = 0;
                                     }
                                     if (*ptr >= 0x80)
                                         mbchar2 = (mbchar2 << 8) | *ptr;
                                     else {
d5a5fef9
 					html_output_c(file_buff_o2, tolower(*ptr));
e57fa318
 					if (tag_val_length < HTML_STR_LENGTH) {
a58f7bc5
 						tag_val[tag_val_length++] = *ptr;
e57fa318
 					}
4522746e
                                     }
 				    ptr++;
e57fa318
 				}
4e1127c5
 
e57fa318
 				if (*ptr == '\\') {
 					escape = TRUE;
 				} else {
 					escape = FALSE;
 				}
 				break;
 			case HTML_COMMENT:
22b961c2
 				if (in_script && !isspace(*ptr)) {
94ec8955
 					unsigned char c = tolower(*ptr);
d5a5fef9
 					/* dump script to nocomment.html, since we no longer have
 					 * comment.html/script.html */
94ec8955
 					if(c == '\'') c = '"';
 					html_output_c(file_buff_o2, c);
e57fa318
 				}
 				if (*ptr == '>') {
 					state = HTML_SKIP_WS;
d5a5fef9
 					next_state = HTML_NORM;
e57fa318
 				}
 				ptr++;
 				break;
 			case HTML_PROCESS_TAG:
4e1127c5
 
e57fa318
 				/* Default to no action for this tag */
 				state = HTML_SKIP_WS;
 				next_state = HTML_NORM;
 				if (tag[0] == '/') {
 					/* End tag */
 					state = HTML_SKIP_WS;
 					next_state = HTML_NORM;
 					if (strcmp(tag, "/script") == 0) {
8be1d5a4
 						in_script = FALSE;
6a53bbdf
 						if(js_state) {
5be3029f
 							js_end = ptr;
6a53bbdf
 							js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
 							js_state = NULL;
 							js_begin = js_end = NULL;
 						}
0f247775
 						/*don't output newlines in nocomment.html
 						 * html_output_c(file_buff_o2, '\n');*/
e57fa318
 					}
333d724d
 					if (hrefs && hrefs->scanContents && in_ahref) {
 						if(strcmp(tag,"/a") == 0) {
7d4b5f16
 							html_tag_contents_done(hrefs,in_ahref, &contents);
333d724d
 							in_ahref=0;/* we are no longer inside an <a href>
 							nesting <a> tags not supported, and shouldn't be supported*/
 						}
 						href_contents_begin=ptr;
 					}
 					if (strcmp(tag, "/form") == 0)  {
462e8e5e
 						if (in_form_action)
 							free(in_form_action);
333d724d
 						in_form_action = NULL;
 					}
e57fa318
 				} else if (strcmp(tag, "script") == 0) {
 					arg_value = html_tag_arg_value(&tag_args, "language");
d5a5fef9
 					/* TODO: maybe we can output all tags only via html_output_tag */
b9b47784
 					if (arg_value && (strcasecmp((const char*)arg_value, "jscript.encode") == 0)) {
e57fa318
 						html_tag_arg_set(&tag_args, "language", "javascript");
 						state = HTML_SKIP_WS;
 						next_state = HTML_JSDECODE;
d5a5fef9
 						/* we already output the old tag, output the new tag now */
 						html_output_tag(file_buff_o2, tag, &tag_args);
b9b47784
 					} else if (arg_value && (strcasecmp((const char*)arg_value, "vbscript.encode") == 0)) {
e57fa318
 						html_tag_arg_set(&tag_args, "language", "vbscript");
 						state = HTML_SKIP_WS;
 						next_state = HTML_JSDECODE;
d5a5fef9
 						/* we already output the old tag, output the new tag now */
 						html_output_tag(file_buff_o2, tag, &tag_args);
6a53bbdf
 					}
 					in_script = TRUE;
 					if(dconf_js && !js_state) {
 						js_state = cli_js_init();
 						if(!js_state) {
1405207a
 							cli_dbgmsg("htmlnorm: Failed to initialize js parser\n");
8be1d5a4
 						}
6a53bbdf
 						js_begin = ptr;
 						js_end = NULL;
e57fa318
 					}
a5f19645
 				} else if(strcmp(tag, "%@") == 0) {
 					arg_value = html_tag_arg_value(&tag_args, "language");
b9b47784
 					if(arg_value && (strcasecmp((const char*)arg_value,"jscript.encode") == 0||
 							strcasecmp((const char*)arg_value, "vbscript.encode") == 0)) {
a5f19645
 
 						saved_next_state = next_state;
 						next_state = state;
e2354bdb
 						look_for_screnc = FALSE;
a5f19645
 						state = HTML_LOOKFOR_SCRENC;
 					}
247bc5c6
 				} else if (hrefs) {
333d724d
 					if(in_ahref && !href_contents_begin)
 						href_contents_begin=ptr;
247bc5c6
 					if (strcmp(tag, "a") == 0) {
 						arg_value = html_tag_arg_value(&tag_args, "href");
b9b47784
 						if (arg_value && strlen((const char*)arg_value) > 0) {
333d724d
 							if (hrefs->scanContents) {
b9b47784
 								char* arg_value_title = html_tag_arg_value(&tag_args,"title");
333d724d
 								/*beginning of an <a> tag*/
 								if (in_ahref)
 									/*we encountered nested <a> tags, pretend previous closed*/
 									if (href_contents_begin) {
7d4b5f16
 										html_tag_contents_append(&contents, href_contents_begin, ptrend);
333d724d
 										/*add pending contents between tags*/
7d4b5f16
 										html_tag_contents_done(hrefs, in_ahref, &contents);
333d724d
 										in_ahref=0;
 										}
 								if (arg_value_title) {
 									/* title is a 'displayed link'*/
 									html_tag_arg_add(hrefs,"href_title",arg_value_title);
b9b47784
 									html_tag_contents_append(&contents,(const unsigned char*)arg_value,
 										(const unsigned char*)arg_value+strlen(arg_value));
7d4b5f16
 									html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 								}
 								if (in_form_action) {
 									/* form action is the real URL, and href is the 'displayed' */
 									html_tag_arg_add(hrefs,"form",arg_value);
7d4b5f16
 									contents.pos = 0;
 									html_tag_contents_append(&contents, in_form_action,
b9b47784
 											in_form_action + strlen((const char*)in_form_action));
7d4b5f16
 									html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 								}
 							}
247bc5c6
 							html_tag_arg_add(hrefs, "href", arg_value);
333d724d
 							if (hrefs->scanContents) {
 								in_ahref=hrefs->count; /* index of this tag (counted from 1) */
 								href_contents_begin=ptr;/* contents begin after <a ..> ends */
7d4b5f16
 								contents.pos = 0;
333d724d
 							}
247bc5c6
 						}
333d724d
 					} else if (strcmp(tag,"form") == 0 && hrefs->scanContents) {
b9b47784
 						const char* arg_action_value = html_tag_arg_value(&tag_args,"action");
ec481027
 						if (arg_action_value) {
4e1127c5
 							if(in_form_action)
 								free(in_form_action);
b9b47784
 							in_form_action = (unsigned char*)cli_strdup(arg_action_value);
ec481027
 						}
247bc5c6
 					} else if (strcmp(tag, "img") == 0) {
 						arg_value = html_tag_arg_value(&tag_args, "src");
 						if (arg_value && strlen(arg_value) > 0) {
 							html_tag_arg_add(hrefs, "src", arg_value);
333d724d
 							if(hrefs->scanContents && in_ahref)
 								/* "contents" of an img tag, is the URL of its parent <a> tag */
b9b47784
 								hrefs->contents[hrefs->count-1] = (unsigned char*)cli_strdup((const char*)hrefs->value[in_ahref-1]);
333d724d
 							if (in_form_action) {
 								/* form action is the real URL, and href is the 'displayed' */
 								html_tag_arg_add(hrefs,"form",arg_value);
7d4b5f16
 								contents.pos = 0;
 								html_tag_contents_append(&contents, in_form_action,
b9b47784
 										in_form_action + strlen((const char*)in_form_action));
7d4b5f16
 								html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 							}
247bc5c6
 						}
 						arg_value = html_tag_arg_value(&tag_args, "dynsrc");
 						if (arg_value && strlen(arg_value) > 0) {
 							html_tag_arg_add(hrefs, "dynsrc", arg_value);
333d724d
 							if(hrefs->scanContents && in_ahref)
 								/* see above */
b9b47784
 								hrefs->contents[hrefs->count-1] = (unsigned char*)cli_strdup((const char*)hrefs->value[in_ahref-1]);
333d724d
 							if (in_form_action) {
 								/* form action is the real URL, and href is the 'displayed' */
 								html_tag_arg_add(hrefs,"form",arg_value);
7d4b5f16
 								contents.pos = 0;
 								html_tag_contents_append(&contents, in_form_action,
b9b47784
 										in_form_action + strlen((const char*)in_form_action));
7d4b5f16
 								html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 							}
247bc5c6
 						}
 					} else if (strcmp(tag, "iframe") == 0) {
 						arg_value = html_tag_arg_value(&tag_args, "src");
 						if (arg_value && strlen(arg_value) > 0) {
 							html_tag_arg_add(hrefs, "iframe", arg_value);
333d724d
 							if(hrefs->scanContents && in_ahref)
 								/* see above */
b9b47784
 								hrefs->contents[hrefs->count-1] = (unsigned char*)cli_strdup((const char*)hrefs->value[in_ahref-1]);
333d724d
 							if (in_form_action) {
 								/* form action is the real URL, and href is the 'displayed' */
 								html_tag_arg_add(hrefs,"form",arg_value);
7d4b5f16
 								contents.pos = 0;
 								html_tag_contents_append(&contents, in_form_action,
b9b47784
 										in_form_action + strlen((const char*)in_form_action));
7d4b5f16
 								html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 							}
247bc5c6
 						}
333d724d
 					} else if (strcmp(tag,"area") == 0) {
 						arg_value = html_tag_arg_value(&tag_args,"href");
 						if (arg_value && strlen(arg_value) > 0) {
 							html_tag_arg_add(hrefs, "area", arg_value);
 							if(hrefs->scanContents && in_ahref)
 								/* see above */
b9b47784
 								hrefs->contents[hrefs->count-1] = (unsigned char*)cli_strdup((const char*)hrefs->value[in_ahref-1]);
333d724d
 							if (in_form_action) {
 								/* form action is the real URL, and href is the 'displayed' */
 								html_tag_arg_add(hrefs,"form",arg_value);
7d4b5f16
 								contents.pos = 0;
 								html_tag_contents_append(&contents, in_form_action,
b9b47784
 									in_form_action + strlen((const char*)in_form_action));
7d4b5f16
 								html_tag_contents_done(hrefs, hrefs->count, &contents);
333d724d
 							}
4e1127c5
 						}
333d724d
 					}
 					/* TODO:imagemaps can have urls too */
ec774193
 				} else if (strcmp(tag, "a") == 0) {
 					/* a/img tags for buff_text can be processed only if we're not processing hrefs */
 					arg_value = html_tag_arg_value(&tag_args, "href");
 					if(arg_value && arg_value[0]) {
b9b47784
 						html_output_str(file_buff_text, (const unsigned char*)arg_value, strlen((const char*)arg_value));
d5a5fef9
 						html_output_c(file_buff_text, ' ');
22b961c2
 						text_space_written = TRUE;
ec774193
 					}
 				} else if (strcmp(tag, "img") == 0) {
 					arg_value = html_tag_arg_value(&tag_args, "src");
 					if(arg_value && arg_value[0]) {
b9b47784
 						html_output_str(file_buff_text, (const unsigned char*)arg_value, strlen((const char*)arg_value));
d5a5fef9
 						html_output_c(file_buff_text, ' ');
22b961c2
 						text_space_written = TRUE;
ec774193
 					}
e57fa318
 				}
 				html_tag_arg_free(&tag_args);
 				break;
 			case HTML_CHAR_REF:
 				if (*ptr == '#') {
 					value = 0;
 					hex = FALSE;
 					state = HTML_CHAR_REF_DECODE;
 					ptr++;
 				} else {
462e8e5e
 					if(dconf_entconv)
 						state = HTML_ENTITY_REF_DECODE;
 					else {
 						if(next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = '&';
 						}
d5a5fef9
 						html_output_c(file_buff_o2, '&');
3506ac49
 
462e8e5e
 						state = next_state;
 						next_state = HTML_BAD_STATE;
 					}
3506ac49
 				}
 				break;
 			case HTML_ENTITY_REF_DECODE:
 				if(*ptr == ';') {
 					size_t i;
b0b8398b
 					const char* normalized;
3506ac49
 					entity_val[entity_val_length] = '\0';
 					normalized = entity_norm(&conv, entity_val);
 					if(normalized) {
 						for(i=0; i < strlen(normalized); i++) {
b0b8398b
 							const unsigned char c = normalized[i]&0xff;
d5a5fef9
 							html_output_c(file_buff_o2, c);
66f7a691
 							if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
3506ac49
 								tag_val[tag_val_length++] = c;
 							}
 						}
 					}
 					else {
d5a5fef9
 						html_output_c(file_buff_o2, '&');
66f7a691
 						if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '&';
4e1127c5
 						}
3506ac49
 						for(i=0; i < entity_val_length; i++) {
 							const char c = tolower(entity_val[i]);
d5a5fef9
 							html_output_c(file_buff_o2, c);
66f7a691
 							if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
3506ac49
 								tag_val[tag_val_length++] = c;
 							}
 						}
66f7a691
 						if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = ';';
 						}
d5a5fef9
 						html_output_c(file_buff_o2, ';');
3506ac49
 					}
 					entity_val_length = 0;
e57fa318
 					state = next_state;
 					next_state = HTML_BAD_STATE;
3506ac49
 					ptr++;
 				}
 				else if ( (isalnum(*ptr) || *ptr=='_' || *ptr==':' || (*ptr=='-')) && entity_val_length < HTML_STR_LENGTH) {
 					entity_val[entity_val_length++] = *ptr++;
 				}
 				else {
 						/* entity too long, or not valid, dump it */
 						size_t i;
66f7a691
 						if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
 								tag_val[tag_val_length++] = '&';
 						}
d5a5fef9
 						html_output_c(file_buff_o2, '&');
3506ac49
 						for(i=0; i < entity_val_length; i++) {
 							const char c = tolower(entity_val[i]);
d5a5fef9
 							html_output_c(file_buff_o2, c);
66f7a691
 							if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
3506ac49
 								tag_val[tag_val_length++] = c;
 							}
 						}
 
 						state = next_state;
 						next_state = HTML_BAD_STATE;
 						entity_val_length = 0;
e57fa318
 				}
 				break;
 			case HTML_CHAR_REF_DECODE:
 				if ((value==0) && ((*ptr == 'x') || (*ptr == 'X'))) {
 					hex=TRUE;
 					ptr++;
 				} else if (*ptr == ';') {
73611293
 					if (next_state==HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
462e8e5e
 							tag_val[tag_val_length++] = value; /* store encoded values too */
73611293
 					}
 					if(dconf_entconv) {
3506ac49
 
462e8e5e
 						if(value < 0x80)
d5a5fef9
 							html_output_c(file_buff_o2, tolower(value));
462e8e5e
 						else {
 							unsigned char buff[10];
8b22c9b5
 							unsigned char* out = u16_normalize_tobuffer(value, buff, 10);
d5a5fef9
 							if(out && out>buff) {
 								html_output_str(file_buff_o2, buff, out-buff-1);
8b22c9b5
 							}
462e8e5e
 						}
 					} else
d5a5fef9
 							html_output_c(file_buff_o2, tolower(value&0xff));
e57fa318
 					state = next_state;
 					next_state = HTML_BAD_STATE;
 					ptr++;
 				} else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
 					if (hex) {
 						value *= 16;
 					} else {
 						value *= 10;
 					}
 					if (isdigit(*ptr)) {
 						value += (*ptr - '0');
 					} else {
 						value += (tolower(*ptr) - 'a' + 10);
 					}
 					ptr++;
 				} else {
d5a5fef9
 					html_output_c(file_buff_o2, value);
e57fa318
 					state = next_state;
 					next_state = HTML_BAD_STATE;
 				}
 				break;
a5f19645
 			case HTML_LOOKFOR_SCRENC:
 				look_for_screnc = TRUE;
b9b47784
 				ptr_screnc = (unsigned char*)strstr((char*)ptr, "#@~^");
a5f19645
 				if(ptr_screnc) {
6a53bbdf
 					ptr_screnc[0] = '/';
 					ptr_screnc[1] = '/';
a5f19645
 					ptr_screnc += 4;
 				}
 				state = next_state;
 				next_state = saved_next_state;
 				break;
e57fa318
 			case HTML_JSDECODE:
 				/* Check for start marker */
b9b47784
 				if (strncmp((const char*)ptr, "#@~^", 4) == 0) {
6a53bbdf
 					ptr[0] = '/';
 					ptr[1] = '/';
e57fa318
 					ptr += 4;
 					state = HTML_JSDECODE_LENGTH;
 					next_state = HTML_BAD_STATE;
 				} else {
d5a5fef9
 					html_output_c(file_buff_o2, tolower(*ptr));
e57fa318
 					ptr++;
 				}
 				break;
 			case HTML_JSDECODE_LENGTH:
b9b47784
 				if (strlen((const char*)ptr) < 8) {
e57fa318
 					state = HTML_NORM;
 					next_state = HTML_BAD_STATE;
 					break;
 				}
a5f19645
 				memset(&screnc_state, 0, sizeof(screnc_state));
 				screnc_state.length = base64_chars[ptr[0]] << 2;
 				screnc_state.length += base64_chars[ptr[1]] >> 4;
 				screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;
 				screnc_state.length += (base64_chars[ptr[2]] >> 2) << 8;
 				screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;
 				screnc_state.length += base64_chars[ptr[3]] << 16;
 				screnc_state.length += (base64_chars[ptr[4]] << 2) << 24;
 				screnc_state.length += (base64_chars[ptr[5]] >> 4) << 24;
e57fa318
 				state = HTML_JSDECODE_DECRYPT;
a5f19645
 				in_screnc = TRUE;
e57fa318
 				next_state = HTML_BAD_STATE;
6a53bbdf
 				/* for JS normalizer */
 				ptr[7] = '\n';
e57fa318
 				ptr += 8;
 				break;
 			case HTML_JSDECODE_DECRYPT:
a5f19645
 				screnc_decode(ptr, &screnc_state);
 				if(!screnc_state.length) {
 					state = HTML_NORM;
 					next_state = HTML_BAD_STATE;
 					in_screnc = FALSE;
e57fa318
 					break;
a5f19645
 				} else {
 					state = HTML_NORM;
 					next_state = HTML_BAD_STATE;
e57fa318
 				}
 				break;
a92110df
 			case HTML_RFC2397_TYPE:
 				if (*ptr == '\'') {
 					if (!escape && (quoted==SINGLE_QUOTED)) {
 						/* Early end of data detected. Error */
 						ptr++;
 						state = HTML_SKIP_WS;
 						tag_arg_length=0;
 						next_state = HTML_TAG_ARG;
 					} else {
 						if (tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = '"';
 						}
 						ptr++;
 					}
 				} else if (*ptr == '"') {
 					if (!escape && (quoted==DOUBLE_QUOTED)) {
 						/* Early end of data detected. Error */
 						ptr++;
 						state = HTML_SKIP_WS;
 						tag_arg_length=0;
 						next_state = HTML_TAG_ARG;
 					} else {
 						if (tag_val_length < HTML_STR_LENGTH) {
 							tag_val[tag_val_length++] = '"';
 						}
 						ptr++;
 					}
 				} else if (isspace(*ptr) || (*ptr == '>')) {
 					if (quoted == NOT_QUOTED) {
 						/* Early end of data detected. Error */
 						state = HTML_SKIP_WS;
 						tag_arg_length=0;
 						next_state = HTML_TAG_ARG;
 					} else {
 						if (tag_val_length < HTML_STR_LENGTH) {
 							if (isspace(*ptr)) {
 								tag_val[tag_val_length++] = ' ';
 							} else {
 								tag_val[tag_val_length++] = '>';
 							}
 						}
 						state = HTML_SKIP_WS;
 						escape = FALSE;
 						quoted = NOT_QUOTED;
 						next_state = HTML_RFC2397_TYPE;
 						ptr++;
 					}
 				} else if (*ptr == ',') {
 					/* Beginning of data */
 					tag_val[tag_val_length] = '\0';
 					state = HTML_RFC2397_INIT;
 					escape = FALSE;
 					next_state = HTML_BAD_STATE;
 					ptr++;
4e1127c5
 
a92110df
 				} else {
 					if (tag_val_length < HTML_STR_LENGTH) {
 						tag_val[tag_val_length++] = tolower(*ptr);
 					}
 					ptr++;
 				}
 				if (*ptr == '\\') {
 					escape = TRUE;
 				} else {
 					escape = FALSE;
 				}
 				break;
 			case HTML_RFC2397_INIT:
45d6cbd9
 				if (dirname) {
 					file_tmp_o1 = (file_buff_t *) cli_malloc(sizeof(file_buff_t));
 					if (!file_tmp_o1) {
241e7eb1
                         cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n");
45d6cbd9
 						goto abort;
 					}
58481352
 					snprintf(filename, 1024, "%s"PATHSEP"rfc2397", dirname);
45d6cbd9
 					tmp_file = cli_gentemp(filename);
5fc380f1
 					if(!tmp_file) {
 						goto abort;
 					}
45d6cbd9
 					cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
 					file_tmp_o1->fd = open(tmp_file, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
 					free(tmp_file);
e357da7b
 					if (file_tmp_o1->fd < 0) {
45d6cbd9
 						cli_dbgmsg("open failed: %s\n", filename);
 						goto abort;
 					}
 					file_tmp_o1->length = 0;
4e1127c5
 
b9b47784
 					html_output_str(file_tmp_o1, (const unsigned char*)"From html-normalise\n", 20);
 					html_output_str(file_tmp_o1, (const unsigned char*)"Content-type: ", 14);
45d6cbd9
 					if ((tag_val_length == 0) && (*tag_val == ';')) {
b9b47784
 						html_output_str(file_tmp_o1, (const unsigned char*)"text/plain\n", 11);
45d6cbd9
 					}
b9b47784
 					html_output_str(file_tmp_o1, (const unsigned char*)tag_val, tag_val_length);
d5a5fef9
 					html_output_c(file_tmp_o1, '\n');
45d6cbd9
 					if (strstr(tag_val, ";base64") != NULL) {
b9b47784
 						html_output_str(file_tmp_o1, (const unsigned char*)"Content-transfer-encoding: base64\n", 34);
45d6cbd9
 					}
d5a5fef9
 					html_output_c(file_tmp_o1, '\n');
45d6cbd9
 				} else {
 					file_tmp_o1 = NULL;
a92110df
 				}
 				state = HTML_RFC2397_DATA;
 				binary = TRUE;
 				break;
 			case HTML_RFC2397_DATA:
 				if (*ptr == '&') {
 					state = HTML_CHAR_REF;
 					next_state = HTML_RFC2397_DATA;
 					ptr++;
 				} else if (*ptr == '%') {
 					length = 0;
 					value = 0;
 					state = HTML_ESCAPE_CHAR;
 					next_state = HTML_RFC2397_ESC;
 					ptr++;
 				} else if (*ptr == '\'') {
 					if (!escape && (quoted==SINGLE_QUOTED)) {
 						state = HTML_RFC2397_FINISH;
 						ptr++;
 					} else {
d5a5fef9
 						html_output_c(file_tmp_o1, *ptr);
a92110df
 						ptr++;
 					}
 				} else if (*ptr == '\"') {
379870fa
 					if (!escape && (quoted==DOUBLE_QUOTED)) {
a92110df
 						state = HTML_RFC2397_FINISH;
 						ptr++;
 					} else {
d5a5fef9
 						html_output_c(file_tmp_o1, *ptr);
a92110df
 						ptr++;
 					}
 				} else if (isspace(*ptr) || (*ptr == '>')) {
 					if (quoted == NOT_QUOTED) {
 						state = HTML_RFC2397_FINISH;
 						ptr++;
 					} else {
d5a5fef9
 						html_output_c(file_tmp_o1, *ptr);
a92110df
 						ptr++;
 					}
 				} else {
d5a5fef9
 					html_output_c(file_tmp_o1, *ptr);
a92110df
 					ptr++;
 				}
 				if (*ptr == '\\') {
 					escape = TRUE;
 				} else {
 					escape = FALSE;
 				}
 				break;
 			case HTML_RFC2397_FINISH:
45d6cbd9
 				if(file_tmp_o1) {
 					html_output_flush(file_tmp_o1);
 					close(file_tmp_o1->fd);
 					free(file_tmp_o1);
4b2400fd
 					file_tmp_o1 = NULL;
45d6cbd9
 				}
a92110df
 				state = HTML_SKIP_WS;
 				escape = FALSE;
 				quoted = NOT_QUOTED;
 				next_state = HTML_TAG_ARG;
 				binary = FALSE;
 				break;
 			case HTML_RFC2397_ESC:
 				if (length == 2) {
d5a5fef9
 					html_output_c(file_tmp_o1, value);
a92110df
 				} else if (length == 1) {
d5a5fef9
 					html_output_c(file_tmp_o1, '%');
 					html_output_c(file_tmp_o1, value+'0');
a92110df
 				} else {
d5a5fef9
 					html_output_c(file_tmp_o1, '%');
a92110df
 				}
 				state = HTML_RFC2397_DATA;
4e1127c5
 				break;
a92110df
 			case HTML_ESCAPE_CHAR:
 				value *= 16;
 				length++;
 				if (isxdigit(*ptr)) {
 					if (isdigit(*ptr)) {
 						value += (*ptr - '0');
 					} else {
 						value += (tolower(*ptr) - 'a' + 10);
 					}
 				} else {
 					state = next_state;
 				}
 				if (length == 2) {
 					state = next_state;
 				}
 				ptr++;
4e1127c5
 				break;
888f5794
 			}
 		}
333d724d
 		if(hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
 			/* end of line, append contents now, resume on next line */
7d4b5f16
 			html_tag_contents_append(&contents,href_contents_begin,ptr);
333d724d
 		ptrend = NULL;
8be1d5a4
 
 		if(js_state) {
6a53bbdf
 			js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
8be1d5a4
 			js_begin = js_end = NULL;
 			if(!in_script) {
 				js_state = NULL;
 			}
 		}
a5f19645
 		if(look_for_screnc && ptr_screnc) {
 			/* start found, and stuff before it already processed */
 			ptr = ptr_screnc;
 			ptr_screnc = NULL;
 			state = HTML_JSDECODE_LENGTH;
 			next_state = HTML_BAD_STATE;
 			continue;
 		}
b3fc7f97
 		free(line);
 		ptr = line = cli_readchunk(stream_in, m_area, 8192);
a5f19645
 		if (in_screnc) {
 			state = HTML_JSDECODE_DECRYPT;
 			next_state = HTML_BAD_STATE;
e2354bdb
 		} else if(look_for_screnc && !ptr_screnc &&
 				state != HTML_LOOKFOR_SCRENC) {
a5f19645
 			saved_next_state = next_state;
 			next_state = state;
 			state = HTML_LOOKFOR_SCRENC;
 		}
e2354bdb
 		if(next_state == state) {
 			/* safeguard against infloop */
 			cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n");
 			next_state = HTML_BAD_STATE;
 		}
e57fa318
 	}
4e1127c5
 
 	if(dconf_entconv) {
7cd9337a
 		/* handle "unfinished" entities */
3506ac49
 		size_t i;
b0b8398b
 		const char* normalized;
3506ac49
 		entity_val[entity_val_length] = '\0';
 		normalized = entity_norm(&conv, entity_val);
 		if(normalized) {
 			for(i=0; i < strlen(normalized); i++)
d5a5fef9
 				html_output_c(file_buff_o2, normalized[i]&0xff);
3506ac49
 		}
 		else {
 			if(entity_val_length) {
d5a5fef9
 				html_output_c(file_buff_o2, '&');
3506ac49
 				for(i=0; i < entity_val_length; i++)
d5a5fef9
 					html_output_c(file_buff_o2, tolower(entity_val[i]));
3506ac49
 			}
 		}
 	}
e57fa318
 	retval = TRUE;
 abort:
ac1d2fba
 	if (line) /* only needed for abort case */
 		free(line);
333d724d
 	if (in_form_action)
 		free(in_form_action);
7d4b5f16
         if (in_ahref) /* tag not closed, force closing */
                 html_tag_contents_done(hrefs, in_ahref, &contents);
462e8e5e
 
8be1d5a4
 	if(js_state) {
fb6208fd
 		/*  output script so far */
 		cli_js_parse_done(js_state);
 		cli_js_output(js_state, dirname);
8be1d5a4
 		cli_js_destroy(js_state);
 		js_state = NULL;
 	}
e57fa318
 	html_tag_arg_free(&tag_args);
 	if (!m_area) {
 		fclose(stream_in);
888f5794
 	}
3d0ca3cf
 	if (file_buff_o2) {
 		html_output_flush(file_buff_o2);
e357da7b
 		if(file_buff_o2->fd != -1)
 			close(file_buff_o2->fd);
3d0ca3cf
 		free(file_buff_o2);
 	}
0664128a
 	if(file_buff_text) {
 		html_output_flush(file_buff_text);
e357da7b
 		if(file_buff_text->fd != -1)
 			close(file_buff_text->fd);
0664128a
 		free(file_buff_text);
ec253fd7
         file_buff_text=NULL;
0664128a
 	}
4b2400fd
 	if(file_tmp_o1) {
 		html_output_flush(file_tmp_o1);
ec253fd7
 		if(file_buff_text && file_buff_text->fd != -1)
44470814
 			close(file_tmp_o1->fd);
4b2400fd
 		free(file_tmp_o1);
 	}
e57fa318
 	return retval;
 }
 
462e8e5e
 /*
  * Normalise an HTML document that is held entirely in a memory buffer.
  *
  * The buffer is described by an m_area_t (start offset 0, no fmap
  * attached) and handed to cli_html_normalise() with -1 as its first
  * argument.  dirname, hrefs and dconf are passed through untouched.
  *
  * Returns whatever cli_html_normalise() returns.
  */
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
 {
 	m_area_t mem_area;
 	int result;
 
 	/* memory-backed input: no map, reading starts at the beginning */
 	mem_area.map = NULL;
 	mem_area.offset = 0;
 	mem_area.length = in_size;
 	mem_area.buffer = in_buff;
 
 	result = cli_html_normalise(-1, &mem_area, dirname, hrefs, dconf);
 	return result;
 }
 
49cc1e3c
 /*
  * Normalise an HTML document that is backed by an fmap.
  *
  * Builds an m_area_t covering the whole map (length map->len, start
  * offset 0) and hands it to cli_html_normalise() with -1 as its first
  * argument.  dirname, hrefs and dconf are passed through untouched.
  *
  * Returns whatever cli_html_normalise() returns.
  */
 int html_normalise_map(fmap_t *map, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf)
 {
 	m_area_t m_area;
 
 	/* Zero the whole descriptor first, as html_screnc_decode() does, so
 	 * that members not assigned below (such as buffer) never carry
 	 * indeterminate values into the callee. */
 	memset(&m_area, 0, sizeof(m_area));
 	m_area.length = map->len;
 	m_area.offset = 0;
 	m_area.map = map;
 
 	return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
 }
 
32f7e1d7
 /*
  * Decode a Script Encoder ("#@~^") section found in an fmap.
  *
  * The mapped data is read chunk by chunk until the "#@~^" start marker is
  * found.  The 8 characters following the marker are consumed; the first 6
  * of them are translated through base64_chars[] and combined into the
  * length of the encoded data.  The data is then run through
  * screnc_decode() and written, wrapped in <script>...</script>, to the
  * file dirname PATHSEP "screnc.html".
  *
  * map:     input data to search and decode
  * dirname: directory in which screnc.html is created
  *
  * Returns TRUE once the marker and length header have been read and
  * decoding has run (even if the input ended before the announced length
  * was reached; that case is only logged).  Returns FALSE if the output
  * file cannot be opened, the marker is not found, or the input ends while
  * the length header is being read.
  */
 int html_screnc_decode(fmap_t *map, const char *dirname)
 {
 	int count, retval=FALSE;
 	unsigned int i, b64[6];
 	unsigned char *line = NULL, tmpstr[6];
 	unsigned char *ptr, filename[1024];
 	int ofd;
 	struct screnc_state screnc_state;
 	m_area_t m_area;
 
 	memset(&m_area, 0, sizeof(m_area));
 	m_area.length = map->len;
 	m_area.offset = 0;
 	m_area.map = map;
 
 	snprintf((char*)filename, sizeof(filename), "%s"PATHSEP"screnc.html", dirname);
 	ofd = open((const char*)filename, O_WRONLY|O_CREAT|O_TRUNC, S_IWUSR|S_IRUSR);
 
 	if (ofd < 0) {
 		cli_dbgmsg("open failed: %s\n", filename);
 		return FALSE;
 	}
 
 	/* Read chunks until one contains the start marker */
 	while ((line = cli_readchunk(NULL, &m_area, 8192)) != NULL) {
 		ptr = (unsigned char*)strstr((char*)line, "#@~^");
 		if (ptr) {
 			break;
 		}
 		free(line);
 		line = NULL;
 	}
 	if (!line) {
 		goto abort;
 	}
 
 	/* Calculate the length of the encoded string */
 	ptr += 4;
 	count = 0;
 	do {
 		if (! *ptr) {
 			/* header continues in the next chunk */
 			free(line);
 			ptr = line = cli_readchunk(NULL, &m_area, 8192);
 			if (!line) {
 				goto abort;
 			}
 		}
 		/* 8 characters are consumed, only the first 6 are kept */
 		if(count < 6)
 			tmpstr[count] = *ptr;
 		count++;
 		ptr++;
 	} while (count < 8);
 
 	/*
 	 * The header bytes come straight from the scanned file.  Shifting a
 	 * negative table entry, or shifting a signed int into its sign bit,
 	 * is undefined behaviour, so translate each byte once, treat negative
 	 * lookups as 0 and do all the arithmetic on unsigned values.
 	 * NOTE(review): assumes base64_chars[] marks bytes outside the
 	 * alphabet with a negative value - confirm against its definition.
 	 */
 	for (i = 0; i < 6; i++) {
 		int v = base64_chars[tmpstr[i]];
 		b64[i] = (v < 0) ? 0 : (unsigned int)v;
 	}
 
 	memset(&screnc_state, 0, sizeof(screnc_state));
 	screnc_state.length = b64[0] << 2;
 	screnc_state.length += b64[1] >> 4;
 	screnc_state.length += (b64[1] & 0x0f) << 12;
 	screnc_state.length += (b64[2] >> 2) << 8;
 	screnc_state.length += (b64[2] & 0x03) << 22;
 	screnc_state.length += b64[3] << 16;
 	screnc_state.length += (b64[4] << 2) << 24;
 	screnc_state.length += (b64[5] >> 4) << 24;
 
 	cli_writen(ofd, "<script>",strlen("<script>"));
 	while (screnc_state.length && line) {
 		/* decode the rest of the current chunk in place and emit it */
 		screnc_decode(ptr, &screnc_state);
 		cli_writen(ofd, ptr, strlen((const char*)ptr));
 		free(line);
 		line = NULL;
 		if (screnc_state.length) {
 			/* more encoded data expected: fetch the next chunk */
 			ptr = line = cli_readchunk(NULL, &m_area, 8192);
 		}
 	}
 	cli_writen(ofd, "</script>",strlen("</script>"));
 	if(screnc_state.length)
 		cli_dbgmsg("html_screnc_decode: missing %u bytes\n",screnc_state.length);
 	retval = TRUE;
 
 abort:
 	close(ofd);
 	free(line);
 	return retval;
 }