libclamav/textnorm.c
3064a542
 /*
  *  Generic text normalizer.
  *
e1cbc270
  *  Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  *  Copyright (C) 2008-2013 Sourcefire, Inc.
2023340a
  *
  *  Authors: Török Edvin
3064a542
  *
  *  This program is free software; you can redistribute it and/or modify
2023340a
  *  it under the terms of the GNU General Public License version 2 as
3064a542
  *  published by the Free Software Foundation.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  */
 
c77ebdab
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
3064a542
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
 #include "clamav.h"
 #include "textnorm.h"
c1bc49e7
 #include "bignum_fast.h"
3064a542
 
 int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
 {
288057e9
     if (!state) {
         return CL_ENULLARG;
     }
     state->out           = out;
     state->out_len       = out_len;
     state->out_pos       = 0;
     state->space_written = 0;
     return CL_SUCCESS;
3064a542
 }
 
288057e9
 void text_normalize_reset(struct text_norm_state *state)
3064a542
 {
288057e9
     state->out_pos       = 0;
     state->space_written = 0;
3064a542
 }
 
 /* What text_normalize_buffer() does with one input byte; looked up per byte
  * value in the char_action table. */
 enum normalize_action {
288057e9
     NORMALIZE_COPY,          /* store the byte unchanged */
     NORMALIZE_SKIP,          /* drop the byte, nothing is stored */
     NORMALIZE_AS_WHITESPACE, /* store a single ' ' for a whole run of such bytes */
     NORMALIZE_ADD_32         /* store the byte plus 32 (uppercase to lowercase) */
3064a542
 };
 
 /* Use shorter names in the char_action table, so that each row of 16 entries
  * fits on one line:
  *   IGN - NORMALIZE_SKIP
  *   WSP - NORMALIZE_AS_WHITESPACE
  *   A32 - NORMALIZE_ADD_32
  *   NOP - NORMALIZE_COPY */
 #define IGN NORMALIZE_SKIP
 #define WSP NORMALIZE_AS_WHITESPACE
 #define A32 NORMALIZE_ADD_32
 #define NOP NORMALIZE_COPY
 
 /*
  * WSP (whitespace): \t, \n, \v, \f, \r and the space character
  * NOP (copy): all characters 0x20 < c < 0x80 that are neither A32 nor WSP
  * A32 (to lowercase): all uppercase ASCII letters
  * IGN (ignore): control characters < 0x20 that are not whitespace, and all bytes > 0x7f
  */
 
 /* Action to take for every possible byte value; indexed by the byte itself.
  * Each row below covers 16 consecutive byte values. */
 static const enum normalize_action char_action[256] = {
288057e9
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, WSP, WSP, WSP, WSP, WSP, IGN, IGN, /* 0x00 - 0x0f: 0x09 - 0x0d (\t \n \v \f \r) are whitespace */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x10 - 0x1f */
     WSP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x20 - 0x2f: 0x20 (space) is whitespace */
     NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x30 - 0x3f */
     NOP, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, /* 0x40 - 0x4f: 'A' - 'O' are lowercased */
     A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, NOP, NOP, NOP, NOP, NOP, /* 0x50 - 0x5f: 'P' - 'Z' are lowercased */
     NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x60 - 0x6f */
     NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x70 - 0x7f */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x80 - 0x8f */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x90 - 0x9f */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xa0 - 0xaf */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xb0 - 0xbf */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xc0 - 0xcf */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xd0 - 0xdf */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xe0 - 0xef */
     IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN}; /* 0xf0 - 0xff */
3064a542
 
 /* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
  * Stores the normalized text in @state's buffer. 
  * Returns how many bytes it consumed of the input. */
bb1e844c
 size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
3064a542
 {
288057e9
     size_t i;
     const unsigned char *out_end = state->out + state->out_len;
     unsigned char *p             = state->out + state->out_pos;
 
     for (i = 0; i < buf_len && p < out_end; i++) {
         unsigned char c = buf[i];
         switch (char_action[c]) {
             case NORMALIZE_SKIP:
                 continue;
             case NORMALIZE_AS_WHITESPACE:
                 /* convert consecutive whitespaces to a single space */
                 if (!state->space_written) {
                     *p++ = ' ';
                 }
                 state->space_written = 1;
                 continue;
             case NORMALIZE_ADD_32:
                 /* aka uppercase to lowercase */
                 c += 32;
                 /* fall through */
             case NORMALIZE_COPY:
                 state->space_written = 0;
                 *p++                 = c;
         }
     }
     state->out_pos = p - state->out;
     return i;
3064a542
 }
 
c1bc49e7
 /* Normalizes the contents of @map, starting at @offset, and stores the
  * result in @state's buffer. The map is read in chunks of at most
  * map->pgsz bytes; reading stops at the end of the map, when a chunk cannot
  * be obtained from the map, or when text_normalize_buffer() consumes nothing.
  *
  * Returns the number of input bytes consumed from the map, i.e. the sum of
  * the text_normalize_buffer() return values. This is NOT the number of bytes
  * stored: skipped bytes and collapsed whitespace are consumed without
  * producing output (the write position is tracked in state->out_pos).
  * Returns 0 if @state or @map is NULL, or if @offset is at/after the end of
  * the map. */
 size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
 {
     const unsigned char *map_loc;
     unsigned int map_pgsz;
     uint64_t map_len;
     size_t buff_len;
     size_t acc;
     size_t acc_total = 0;
     size_t acc_len;

     if (!state || !map) {
         return 0;
     }

     map_len  = map->len;
     map_pgsz = map->pgsz;
     buff_len = state->out_len;

     /* map_len - offset is computed in unsigned arithmetic: an offset past the
      * end of the map would wrap around to a huge remaining length instead of
      * going negative, so reject it explicitly. */
     if (offset >= map_len) {
         return 0;
     }

     while (1) {
         /* Break out if we've reached the end of the map or our buffer.
          * Note that the budget is out_len *input* bytes: consumed input is
          * what gets subtracted, not the number of bytes stored. */
         acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total);
         if (!acc_len) {
             break;
         }

         /* If map_loc is NULL, then there's nothing left to do but recover. */
         map_loc = fmap_need_off_once(map, offset, acc_len);
         if (!map_loc) {
             break;
         }
         offset += acc_len;

         /* If we didn't normalize anything, no need to update values, just break out. */
         acc = text_normalize_buffer(state, map_loc, acc_len);
         if (!acc) {
             break;
         }
         acc_total += acc;
     }

     return acc_total;
 }