/*
 *  Generic text normalizer.
 *
 *  Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *  Copyright (C) 2008-2013 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "clamav.h"
#include "textnorm.h"
#include "bignum_fast.h"

int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
{
    if (!state) {
        return CL_ENULLARG;
    }
    state->out           = out;
    state->out_len       = out_len;
    state->out_pos       = 0;
    state->space_written = 0;
    return CL_SUCCESS;
}

void text_normalize_reset(struct text_norm_state *state)
{
    state->out_pos       = 0;
    state->space_written = 0;
}

enum normalize_action {
    NORMALIZE_COPY,
    NORMALIZE_SKIP,
    NORMALIZE_AS_WHITESPACE,
    NORMALIZE_ADD_32
};

/* use shorter names in the table */
#define IGN NORMALIZE_SKIP
#define WSP NORMALIZE_AS_WHITESPACE
#define A32 NORMALIZE_ADD_32
#define NOP NORMALIZE_COPY

/*
 * whitespace: \t, \n, \f, \v, \r, [ ]
 * nop: all characters 0x20 < c < 0x80, that are not A32 and WSP
 * tolowercase: all uppercase characters
 * ignore: control character < 0x20 that are not whitespace, and all > 0x7f
 */

static const enum normalize_action char_action[256] = {
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, WSP, WSP, WSP, WSP, WSP, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    WSP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x20 - 0x2f */
    NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,
    NOP, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32,
    A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, NOP, NOP, NOP, NOP, NOP,
    NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,
    NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x70 - 0x7f */
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,
    IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN};

/* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
 * Stores the normalized text in @state's buffer. 
 * Returns how many bytes it consumed of the input. */
size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
{
    size_t i;
    const unsigned char *out_end = state->out + state->out_len;
    unsigned char *p             = state->out + state->out_pos;

    for (i = 0; i < buf_len && p < out_end; i++) {
        unsigned char c = buf[i];
        switch (char_action[c]) {
            case NORMALIZE_SKIP:
                continue;
            case NORMALIZE_AS_WHITESPACE:
                /* convert consecutive whitespaces to a single space */
                if (!state->space_written) {
                    *p++ = ' ';
                }
                state->space_written = 1;
                continue;
            case NORMALIZE_ADD_32:
                /* aka uppercase to lowercase */
                c += 32;
                /* fall through */
            case NORMALIZE_COPY:
                state->space_written = 0;
                *p++                 = c;
        }
    }
    state->out_pos = p - state->out;
    return i;
}

/* Normalizes the text in @fmap and stores the result in @state's buffer.
 * Returns number of characters written to buffer. */
size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
{
    const unsigned char *map_loc;
    unsigned int map_pgsz;
    uint64_t map_len;
    size_t buff_len;
    size_t acc;
    size_t acc_total;
    size_t acc_len;

    map_len  = map->len;
    map_pgsz = map->pgsz;
    buff_len = state->out_len;

    acc_total = 0;
    acc       = 0;

    while (1) {
        /* Break out if we've reached the end of the map or our buffer. */
        if (!(acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total))) break;

        /* If map_loc is NULL, then there's nothing left to do but recover. */
        if (!(map_loc = fmap_need_off_once(map, offset, acc_len))) break;
        offset += acc_len;

        /* If we didn't normalize anything, no need to update values, just break out. */
        if (!(acc = text_normalize_buffer(state, map_loc, acc_len))) break;
        acc_total += acc;
    }

    return acc_total;
}