GitList

libclamav/htmlnorm.c

2fe19b26	/* * Copyright (C) 2004 Trog <trog@clamav.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #if HAVE_CONFIG_H #include "clamav-config.h" #endif #include <stdio.h>
f91f55e0	#include <string.h> #include <ctype.h>
2fe19b26	#include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include "others.h" #define FALSE (0) #define TRUE (1) /* Normalize an HTML buffer using the following rules: o Remove multiple contiguous spaces o Remove spaces around '<' and '>' in tags o Remove spaces around '=' in tags o Replace single quote with double quote in tags o Convert to lowercase o Convert all white space to a space character / unsigned char html_normalize(unsigned char in_buff, off_t in_size) { unsigned char out_buff; off_t out_size=0, i; int had_space=FALSE, tag_depth=0, in_quote=FALSE; out_buff = (unsigned char ) cli_malloc(in_size+1); if (!out_buff) { cli_dbgmsg("html_normalize(): malloc failed\n"); return NULL; } for (i=0 ; i < in_size ; i++) { if (in_buff[i] == '<') { out_buff[out_size++] = '<'; tag_depth++; if (tag_depth == 1) { had_space=TRUE; / consume spaces / } } else if ((in_buff[i] == '=') && (tag_depth == 1)) { / Remove preceeding spaces / while ((out_size > 0) && (out_buff[out_size-1] == ' ')) { out_size--; } out_buff[out_size++] = '='; had_space=TRUE; } else if (isspace(in_buff[i])) { if (!had_space) { out_buff[out_size++] = ' '; had_space=TRUE; } } else if (in_buff[i] == '>') { / Remove preceeding spaces / if (tag_depth == 1) { while ((out_size > 0) && (out_buff[out_size-1] == ' ')) { out_size--; } } out_buff[out_size++] = '>'; tag_depth--; } else if ((in_buff[i] == '\'') && (tag_depth==1)) { / Convert single quotes to double quotes / if (in_quote \|\| out_buff[out_size-1] == '=') { out_buff[out_size++] = '\"'; in_quote = !in_quote; } else { out_buff[out_size++] = '\''; } } else { out_buff[out_size++] = tolower(in_buff[i]); had_space=FALSE; } } out_buff[out_size] = '\0'; return out_buff; } / Remove HTML style comments from buffer / unsigned char remove_html_comments(unsigned char line) { unsigned char newline, newcurrent; int in_comment=FALSE; if (!line) { return NULL; } newcurrent = newline = (unsigned char ) cli_malloc(strlen(line) + 1); if (!newline) { return NULL; } while(line) { if (!(in_comment)) { while (line && line != '<') { newcurrent = line; newcurrent++; line++; } if (! line) { break; } if (!line[1]) { newcurrent = line; newcurrent++; line++; continue; } if (line[1] == '!') { in_comment = TRUE; line += 1; } else { newcurrent = line; newcurrent++; line++; } } else { while (line && line != '>') { line++; } if (! line) { break; } in_comment = FALSE; line++; } } newcurrent = '\0'; return newline; } / Decode an HTML escape character into it's character value / unsigned int decode_html_char_ref(unsigned char cref, unsigned char dest) { unsigned int hex=FALSE, value=0, count=0; if (!cref[0] \|\| !cref[1]) { return 0; } if (((cref == 'x') \|\| (cref == 'X')) && isxdigit(cref[1])) { hex=TRUE; cref++; count++; } while (isdigit(cref) \|\| (hex && isxdigit(cref))) { if (hex) { value = 16; } else { value *= 10; }
43b45e8a	if (isdigit(cref)) { value += (cref - '0'); } else { value += (tolower(*cref) - 'a' + 10); }
2fe19b26	cref++; count++; } if (cref == ';') { cref++; count++; } dest = value; return count; } /* Remove HTML character escape sequences from buffer / unsigned char remove_html_char_ref(unsigned char line) { unsigned char newline, newcurrent; unsigned char linepos, count; if (!line) { return NULL; } newcurrent = newline = (unsigned char ) cli_malloc(strlen(line) + 1); if (!newline) { return NULL; } while (line) { linepos = strchr(line, '&'); if (!linepos) { strcpy(newcurrent, line); return newline; } strncpy(newcurrent, line, linepos-line); newcurrent += linepos-line; if (!linepos[1] \|\| !linepos[2]) { newcurrent = '&'; newcurrent++; line = linepos+1; continue; } switch (linepos[1]) { case '#': count = decode_html_char_ref(linepos+2, newcurrent); if (count > 0) { newcurrent++; linepos += count+2; } else { newcurrent = '&'; newcurrent++; linepos++; } break; / TODO: character entities, & etc. / default: newcurrent = '&'; newcurrent++; linepos++; } line = linepos; } newcurrent = '\0'; return newline; } int char2hex(unsigned char c) { if ((c-'0') <= 9) { return (c-'0'); } else if ((c-'A') <= 5) { return (c-'A'+10); } return (c-'a'+10); } char quoted_decode(unsigned char line, off_t in_size) { unsigned char newline, newcurrent, line_end; newcurrent = newline = (unsigned char ) cli_malloc(in_size + 1); if (!newline) { return NULL; } line_end = line+in_size; while (line <= line_end) { while ((line < line_end) && line != '=') { newcurrent = line; line++; newcurrent++; } if ((line < line_end) && isspace(line[1])) { line++; while ((line < line_end) && isspace(line)) { line++; } continue; } if ((line+2) <= line_end) { if (isxdigit(line[1]) && isxdigit(line[2])) { newcurrent = (char2hex(line[1]) * 16) + char2hex(line[2]); newcurrent++; line += 3; continue; } } line++; } *newcurrent = '\0'; return newline; }