libclamav/textnorm.c
3064a542
 /*
  *  Generic text normalizer.
  *
c442ca9c
  *  Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  *  Copyright (C) 2008-2013 Sourcefire, Inc.
2023340a
  *
  *  Authors: Török Edvin
3064a542
  *
  *  This program is free software; you can redistribute it and/or modify
2023340a
  *  it under the terms of the GNU General Public License version 2 as
3064a542
  *  published by the Free Software Foundation.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  */
 
c77ebdab
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
3064a542
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
 #include "clamav.h"
 #include "textnorm.h"
c1bc49e7
 #include "bignum_fast.h"
3064a542
 
 /* Attaches the caller-supplied output buffer @out (capacity @out_len bytes)
  * to @state and clears the write position and the whitespace flag.
  * Returns CL_ENULLARG if @state is NULL, CL_SUCCESS otherwise. */
 int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
 {
 	if(state == NULL)
 		return CL_ENULLARG;

 	/* nothing written yet, no whitespace pending */
 	state->space_written = 0;
 	state->out_pos = 0;

 	/* remember where normalized output goes and how much room there is */
 	state->out_len = out_len;
 	state->out = out;

 	return CL_SUCCESS;
 }
 
 /* Rewinds @state so the next normalization starts writing at the beginning
  * of the output buffer, with no whitespace pending. The buffer pointer and
  * its length are left untouched. A NULL @state is ignored, matching the
  * NULL tolerance of text_normalize_init(). */
 void text_normalize_reset(struct text_norm_state* state)
 {
 	if(!state) {
 		return;
 	}
 	state->out_pos = 0;
 	state->space_written = 0;
 }
 
 /* What the normalizer does with a single input byte. */
 enum normalize_action {
 	NORMALIZE_COPY = 0,          /* emit the byte unchanged */
 	NORMALIZE_SKIP = 1,          /* drop the byte */
 	NORMALIZE_AS_WHITESPACE = 2, /* emit one ' ' for a whole run of such bytes */
 	NORMALIZE_ADD_32 = 3         /* emit the byte plus 32 (uppercase to lowercase) */
 };
 
c1bc49e7
 
3064a542
 /* Short aliases for the normalize_action values, so that each row of the
  * table below fits on one line. */
 #define IGN NORMALIZE_SKIP
 #define WSP NORMALIZE_AS_WHITESPACE
 #define A32 NORMALIZE_ADD_32
 #define NOP NORMALIZE_COPY
 
 /*
  * Per-byte normalization action, indexed by the input byte value
  * (16 entries per row):
  * whitespace (WSP): \t, \n, \v, \f, \r (0x09 - 0x0d) and the space character (0x20)
  * tolowercase (A32): the uppercase letters 'A' - 'Z' (0x41 - 0x5a)
  * nop (NOP): every other byte in 0x21 - 0x7f, copied unchanged
  * ignore (IGN): control bytes below 0x20 that are not whitespace, and all bytes above 0x7f
  */
 
 static const enum normalize_action char_action[256] = {
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, WSP, WSP, WSP, WSP, WSP, IGN, IGN, /* 0x00 - 0x0f */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x10 - 0x1f */
 	WSP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x20 - 0x2f */
 	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x30 - 0x3f */
 	NOP, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, /* 0x40 - 0x4f: '@', 'A' - 'O' */
         A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, NOP, NOP, NOP, NOP, NOP, /* 0x50 - 0x5f: 'P' - 'Z', then punctuation */
 	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x60 - 0x6f */
 	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x70 - 0x7f */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x80 - 0x8f */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x90 - 0x9f */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xa0 - 0xaf */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xb0 - 0xbf */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xc0 - 0xcf */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xd0 - 0xdf */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xe0 - 0xef */
 	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN /* 0xf0 - 0xff */
 };
 
 /* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
  * Stores the normalized text in @state's buffer. 
  * Returns how many bytes it consumed of the input. */
bb1e844c
 size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
3064a542
 {
 	size_t i;
 	const unsigned char *out_end = state->out + state->out_len;
 	unsigned char *p = state->out + state->out_pos;
 
 	for(i=0; i < buf_len && p < out_end; i++) {
 		unsigned char c = buf[i];
 		switch(char_action[c]) {
 			case NORMALIZE_SKIP:
 				continue;
 			case NORMALIZE_AS_WHITESPACE:
 				/* convert consecutive whitespaces to a single space */
 				if(!state->space_written) {
 					*p++ = ' ';
 				}
 				state->space_written = 1;
 				continue;
 			case NORMALIZE_ADD_32:
 				/* aka uppercase to lowercase */
 				c += 32;
 				/* fall through */
 			case NORMALIZE_COPY:
 				state->space_written = 0;
 				*p++ = c;
 		}
 	}
 	state->out_pos = p - state->out;
bb1e844c
 	return i;
3064a542
 }
 
c1bc49e7
 /* Normalizes the contents of @map, starting at byte @offset, into @state's
  * output buffer, handing text_normalize_buffer() at most map->pgsz bytes
  * per step.
  *
  * The loop ends when the end of the map is reached, when the number of
  * consumed bytes reaches the output buffer length, when the map data cannot
  * be obtained, or when text_normalize_buffer() consumes nothing.
  *
  * Returns the number of input bytes consumed from @map, i.e. the sum of the
  * text_normalize_buffer() return values; the output write position is
  * tracked in @state->out_pos. Returns 0 if @state or @map is NULL, or if
  * @offset is at or past the end of the map. */
 size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
 {
 	const unsigned char *map_loc;
 	unsigned int map_pgsz;
 	uint64_t map_len;
 	size_t buff_len;
 	size_t acc;
 	size_t acc_total;
 	size_t acc_len;

 	if(!state || !map) {
 		return 0;
 	}

 	map_len = map->len;
 	map_pgsz = map->pgsz;
 	buff_len = state->out_len;

 	/* map_len - offset is unsigned and would wrap around for an offset
 	 * past the end of the map, so reject that case up front. Inside the
 	 * loop offset only grows by at most map_len - offset, so it can never
 	 * overtake map_len afterwards. */
 	if(offset >= map_len) {
 		return 0;
 	}

 	acc_total = 0;

 	while (1) {
 		/* Break out if we've reached the end of the map or our buffer. */
 		if(!(acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total))) break;

 		/* If map_loc is NULL, then there's nothing left to do but recover. */
 		if(!(map_loc = fmap_need_off_once(map, offset, acc_len))) break;
 		offset += acc_len;

 		/* If nothing was consumed, no need to update values, just break out. */
 		if(!(acc = text_normalize_buffer(state, map_loc, acc_len))) break;
 		acc_total += acc;
 	}

 	return acc_total;
 }