libclamav/htmlnorm.c
2fe19b26
 /*
  *  Copyright (C) 2004 Trog <trog@clamav.net>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
 #include <stdio.h>
f91f55e0
 #include <string.h>
 #include <ctype.h>
2fe19b26
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 
 #include "others.h"
 
 #define FALSE (0)
 #define TRUE (1)
 
 /* Normalize an HTML buffer using the following rules:
 	o Remove multiple contiguous spaces
 	o Remove spaces around '<' and '>' in tags
 	o Remove spaces around '=' in tags
 	o Replace single quote with double quote in tags
 	o Convert to lowercase
 	o Convert all white space to a space character
 */
 
 unsigned char *html_normalize(unsigned char *in_buff, off_t in_size)
 {
 	unsigned char *out_buff;
 	off_t out_size=0, i;
 	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
 
 	out_buff = (unsigned char *) cli_malloc(in_size+1);
 	if (!out_buff) {
 		cli_dbgmsg("html_normalize(): malloc failed\n");
 		return NULL;
 	}
 
 	for (i=0 ; i < in_size ; i++) {
 		if (in_buff[i] == '<') {
 			out_buff[out_size++] = '<';
 			tag_depth++;
 			if (tag_depth == 1) {
 				had_space=TRUE; /* consume spaces */
 			}
 		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
 			/* Remove preceeding spaces */
 			while ((out_size > 0) &&
 				(out_buff[out_size-1] == ' ')) {
 				out_size--;
 			}
 			out_buff[out_size++] = '=';
 			had_space=TRUE;
 		} else if (isspace(in_buff[i])) {
 			if (!had_space) {
 				out_buff[out_size++] = ' ';
 				had_space=TRUE;
 			}
 		} else if (in_buff[i] == '>') {
 			/* Remove preceeding spaces */
 			if (tag_depth == 1) {
 				while ((out_size > 0) &&
 					(out_buff[out_size-1] == ' ')) {
 					out_size--;
 				}
 			}
 			out_buff[out_size++] = '>';
 			tag_depth--;	
 		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
 			/* Convert single quotes to double quotes */
 			if (in_quote || out_buff[out_size-1] == '=') {
 				out_buff[out_size++] = '\"';
 				in_quote = !in_quote;
 			} else {
 				out_buff[out_size++] = '\'';
 			}
 		} else {
 			out_buff[out_size++] = tolower(in_buff[i]);
 			had_space=FALSE;
 		}
 	}
 	out_buff[out_size] = '\0';
 	return out_buff;
 }
 
 /* Remove HTML style comments from buffer */
 unsigned char *remove_html_comments(unsigned char *line)
 {
 	unsigned char *newline, *newcurrent;
 	int in_comment=FALSE;
 	
 	if (!line) {
 		return NULL;
 	}
 	
 	newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);
 	if (!newline) {
 		return NULL;
 	}
 	
 	while(line) {
 		if (!(in_comment)) {
 			while (*line && *line != '<') {
 				*newcurrent = *line;
 				newcurrent++;
 				line++;
 			}
 			if (! *line) {
 				break;
 			}
 			if (!line[1]) {
 				*newcurrent = *line;
 				newcurrent++;
 				line++;
 				continue;
 			}
 			if (line[1] == '!') {
 				in_comment = TRUE;
 				line += 1;
 			} else {
 				*newcurrent = *line;
 				newcurrent++;
 				line++;
 			}
 		} else {
 			while (*line && *line != '>') {
 				line++;
 			}
 			if (! *line) {
 				break;
 			}
 			in_comment = FALSE;
 			line++;
 		}
 	}
 	*newcurrent = '\0';
 	return newline;
 }
 
 /* Decode an HTML escape character into it's character value */
 unsigned int decode_html_char_ref(unsigned char *cref,
                                     unsigned char *dest)
 {
 
 	unsigned int hex=FALSE, value=0, count=0;
 	
 	if (!cref[0] || !cref[1]) {
 		return 0;
 	}
 	
 	if (((*cref == 'x') || (*cref == 'X')) && isxdigit(cref[1])) {
 		hex=TRUE;
 		cref++;
 		count++;
 	}
 	
 	while (isdigit(*cref) || (hex && isxdigit(*cref))) {
 		if (hex) {
 			value *= 16;
 		} else {
 			value *= 10;
 		}
43b45e8a
 		if (isdigit(*cref)) {
 			value += (*cref - '0');
 		} else {
 			value += (tolower(*cref) - 'a' + 10);
 		}
2fe19b26
 		cref++;
 		count++;
 	}
 	if (*cref == ';') {
 		cref++;
 		count++;
 	}
 	
 	*dest = value;
 	
 	return count;
 }
 
 /* Remove HTML character escape sequences from buffer */
 unsigned char *remove_html_char_ref(unsigned char *line)
 {
 	unsigned char *newline, *newcurrent;
 	unsigned char *linepos, count;
 	
 	if (!line) {
 		return NULL;
 	}
 	
 	newcurrent = newline = (unsigned char *) cli_malloc(strlen(line) + 1);
 	if (!newline) {
 		return NULL;
 	}
 	while (line) {
 		linepos = strchr(line, '&');
 		if (!linepos) {
 			strcpy(newcurrent, line);
 			return newline;
 		}
 		strncpy(newcurrent, line, linepos-line);
 		newcurrent += linepos-line;
 
 		if (!linepos[1] || !linepos[2]) {
 			*newcurrent = '&';
 			newcurrent++;
 			line = linepos+1;
 			continue;
 		}
 		switch (linepos[1]) {
 		case '#':
 			count = decode_html_char_ref(linepos+2,
 					newcurrent);
 			if (count > 0) {
 				newcurrent++;
 				linepos += count+2;
 			} else {
 				*newcurrent = '&';
 				newcurrent++;
 				linepos++;
 			}
 			break;
 		/* TODO: character entities, &amp; etc. */
 		default:
 			*newcurrent = '&';
 			newcurrent++;
 			linepos++;
 		}
 		line = linepos;
 	}
 	*newcurrent = '\0';
 	return newline;
 }
 
 int char2hex(unsigned char c)
 {
 	if ((c-'0') <= 9) {
 		return (c-'0');
 	} else if ((c-'A') <= 5) {
 		return (c-'A'+10);
 	}
 	return (c-'a'+10);
 }
 
 char *quoted_decode(unsigned char *line, off_t in_size)
 {
 	unsigned char *newline, *newcurrent, *line_end;
 	
 	newcurrent = newline = (unsigned char *) cli_malloc(in_size + 1);
 	if (!newline) {
 		return NULL;
 	}
 	
 	line_end = line+in_size;
 	while (line <= line_end) {
 		while ((line < line_end) && *line != '=') {
 			*newcurrent = *line;
 			line++;
 			newcurrent++;
 		}
 		if ((line < line_end) && isspace(line[1])) {
 			line++;
 			while ((line < line_end) && isspace(*line)) {
 				line++;
 			}
 			continue;
 		}
 		if ((line+2) <= line_end) {
 			if (isxdigit(line[1]) && isxdigit(line[2])) {
 				*newcurrent = 	(char2hex(line[1]) * 16) +
 						char2hex(line[2]);
 				newcurrent++;
 				line += 3;
 				continue;
 			}
 		}
 		line++;	
 	}
 	*newcurrent = '\0';
 	return newline;
 }