libclamav/msxml.c
a7ad8e7a
 /*
  * Extract component parts of MS XML files (e.g. MS Office 2003 XML Documents)
  * 
c442ca9c
  * Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
a7ad8e7a
  * Copyright (C) 2007-2013 Sourcefire, Inc.
  * 
  * Authors: Kevin Lin
  * 
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  * 
  * This program is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  * 
  * You should have received a copy of the GNU General Public License along with
  * this program; if not, write to the Free Software Foundation, Inc., 51
  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
4823482e
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
3da7c00a
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 
a7ad8e7a
 #include "clamav.h"
 #include "others.h"
3da7c00a
 #include "conv.h"
4823482e
 #include "json_api.h"
 #include "msxml.h"
5994bee6
 #include "msxml_parser.h"
4823482e
 
 #if HAVE_LIBXML2
 #include <libxml/xmlreader.h>
 
e014b623
 /* Compile-time switch: when non-zero, cli_msxmlmsg() tracing is forwarded to
  * cli_dbgmsg(); when zero, the tracing calls compile to nothing. */
 #define MSXML_VERBIOSE 0
616dbf8e
 /* Trace macro used by the read callback. Expands to cli_dbgmsg() when
  * MSXML_VERBIOSE is non-zero and to nothing otherwise. */
 #if MSXML_VERBIOSE
 #define cli_msxmlmsg(...) cli_dbgmsg(__VA_ARGS__)
 #else
 #define cli_msxmlmsg(...)
 #endif
 
40bda57a
 /* Upper bound on the number of bytes requested from the fmap for one window. */
 #define MSXML_READBUFF SCANBUFF
 
5994bee6
 static const struct key_entry msxml_keys[] = {
feb32f42
     { "worddocument",       "WordDocument",       MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
     { "workbook",           "Workbook",           MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
 
     { "bindata",            "BinaryData",         MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT },
5994bee6
     { "documentproperties", "DocumentProperties", MSXML_JSON_ROOT },
feb32f42
     { "author",             "Author",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lastauthor",         "LastAuthor",         MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "revision",           "Revision",           MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "totaltime",          "TotalTime",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "created",            "Created",            MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lastsaved",          "LastSaved",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "pages",              "Pages",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "words",              "Words",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "characters",         "Characters",         MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lines",              "Lines",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "paragraph",          "Paragraph",          MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "characterswithspaces", "CharactersWithSpaces", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "version",            "Version",            MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
 
b8004b5d
     { "allowpng",           "AllowPNG",           MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
feb32f42
 
     { "fonts",              "Fonts",              MSXML_IGNORE_ELEM },
     { "styles",             "Styles",             MSXML_IGNORE_ELEM }
5994bee6
 };
 static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
4823482e
 
d49a7dba
 static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
40bda57a
 {
     const unsigned char *new_window = NULL;
     off_t new_mappos;
     size_t bytes;
 
     if (cbdata->mappos == cbdata->map->len) {
616dbf8e
         cli_msxmlmsg("msxml_read_cb: fmap REALLY EOF\n");
40bda57a
         return 0;
     }
 
     new_mappos = cbdata->mappos + cbdata->winsize;
     bytes = MIN(cbdata->map->len - new_mappos, MSXML_READBUFF);
     if (!bytes) {
         cbdata->window = NULL;
         cbdata->winpos = 0;
         cbdata->mappos = cbdata->map->len;
         cbdata->winsize = 0;
 
616dbf8e
         cli_msxmlmsg("msxml_read_cb: fmap EOF\n");
40bda57a
         return 0;
     }
 
     new_window = fmap_need_off_once(cbdata->map, new_mappos, bytes);
     if (!new_window) {
         cli_errmsg("msxml_read_cb: cannot acquire new window for fmap\n");
         return -1;
     }
 
     cbdata->window = new_window;
     cbdata->winpos = 0;
     cbdata->mappos = new_mappos;
     cbdata->winsize = bytes;
 
616dbf8e
     cli_msxmlmsg("msxml_read_cb: acquired new window @ [%llu(+%llu)(max:%llu)]\n",
                  (long long unsigned)cbdata->mappos, (long long unsigned)(cbdata->mappos + cbdata->winsize),
                  (long long unsigned)cbdata->map->len);
40bda57a
 
     return bytes;
 }
 
 int msxml_read_cb(void *ctx, char *buffer, int len)
 {
     struct msxml_cbdata *cbdata = (struct msxml_cbdata *)ctx;
143cf9b6
     size_t wbytes, rbytes;
     int winret;
40bda57a
 
616dbf8e
     cli_msxmlmsg("msxml_read_cb called\n");
40bda57a
 
     /* initial iteration */
     if (!cbdata->window) {
         if ((winret = msxml_read_cb_new_window(cbdata)) <= 0)
             return winret;
     }
 
616dbf8e
     cli_msxmlmsg("msxml_read_cb: requested %d bytes from offset %llu\n", len, (long long unsigned)(cbdata->mappos+cbdata->winpos));
40bda57a
 
     wbytes = 0;
     rbytes = cbdata->winsize - cbdata->winpos;
 
d349c61d
     /* copying loop with preprocessing */
40bda57a
     while (wbytes < len) {
d349c61d
         const unsigned char *read_from;
         char *write_to = buffer + wbytes;
         enum msxml_state *state;
 #if MSXML_VERBIOSE
         size_t written;
 #endif
40bda57a
 
         if (!rbytes) {
             if ((winret = msxml_read_cb_new_window(cbdata)) < 0)
                 return winret;
             if (winret == 0) {
616dbf8e
                 cli_msxmlmsg("msxml_read_cb: propagating fmap EOF [%llu]\n", (long long unsigned)wbytes);
40bda57a
                 return (int)wbytes;
             }
 
             rbytes = cbdata->winsize;
         }
 
d349c61d
 #if MSXML_VERBIOSE
40bda57a
         written = MIN(rbytes, len - wbytes);
d349c61d
         cli_msxmlmsg("msxml_read_cb: copying from window [%llu(+%llu)] %llu->~%llu\n",
616dbf8e
                      (long long unsigned)(cbdata->winsize - rbytes), (long long unsigned)cbdata->winsize,
                      (long long unsigned)cbdata->winpos, (long long unsigned)(cbdata->winpos + written));
d349c61d
 #endif
40bda57a
 
d349c61d
         read_from = cbdata->window + cbdata->winpos;
         state = &(cbdata->state);
 
         while (rbytes > 0 && wbytes < len) {
             switch (*state) {
             case MSXML_STATE_NORMAL:
                 if ((*read_from) == '&')
                     *state = MSXML_STATE_ENTITY_START_1;
                 break;
             case MSXML_STATE_ENTITY_START_1:
                 if ((*read_from) == '#')
                     *state = MSXML_STATE_ENTITY_START_2;
                 else
                     *state = MSXML_STATE_NORMAL;
                 break;
             case MSXML_STATE_ENTITY_START_2:
                 if ((*read_from) == 'x')
                     *state = MSXML_STATE_ENTITY_HEX;
                 else if (((*read_from) >= '0') && ((*read_from) <= '9'))
                     *state = MSXML_STATE_ENTITY_DEC;
                 else
                     *state = MSXML_STATE_NORMAL;
                 break;
             case MSXML_STATE_ENTITY_HEX:
                 if ((((*read_from) >= '0') && ((*read_from) <= '9')) ||
                     (((*read_from) >= 'a') && ((*read_from) <= 'f')) ||
                     (((*read_from) >= 'A') && ((*read_from) <= 'F'))) {}
                 else
                     *state = MSXML_STATE_ENTITY_CLOSE;
                 break;
             case MSXML_STATE_ENTITY_DEC:
                 if (((*read_from) >= '0') && ((*read_from) <= '9')) {}
                 else
                     *state = MSXML_STATE_ENTITY_CLOSE;
                 break;
             default:
                 cli_errmsg("unknown *state: %d\n", *state);
             }
40bda57a
 
d349c61d
             if (*state == MSXML_STATE_ENTITY_CLOSE) {
                 if ((*read_from) != ';') {
                     cli_msxmlmsg("msxml_read_cb: detected unterminated character entity @ winoff %d\n",
                                  (int)(read_from - cbdata->window));
                     (*write_to++) = ';';
                     wbytes++;
                 }
                 *state = MSXML_STATE_NORMAL;
                 if (wbytes >= len)
                     break;
             }
 
             *(write_to++) = *(read_from++);
             rbytes--;
             wbytes++;
         }
40bda57a
     }
 
     cbdata->winpos = cbdata->winsize - rbytes;
     return (int)wbytes;
 }
acdf9a80
 #endif
 
4823482e
 /**
  * Scan the current fmap of `ctx` as an MS Office 2003 XML document.
  *
  * With libxml2 available, an xmlTextReader is created on top of
  * msxml_read_cb() and the document is handed to cli_msxml_parse_document()
  * together with the msxml_keys table. Without libxml2 the function only logs
  * that scanning is unavailable.
  *
  * @param ctx  scan context; must not be NULL when libxml2 is available.
  * @return CL_ENULLARG if `ctx` is NULL (libxml2 builds only); the result of
  *         cli_msxml_parse_document() when the reader could be created. If the
  *         reader cannot be created, the result of cli_json_parse_error() when
  *         built with HAVE_JSON, otherwise CL_SUCCESS. Always CL_SUCCESS when
  *         built without libxml2.
  */
 int cli_scanmsxml(cli_ctx *ctx)
 {
 #if HAVE_LIBXML2
     struct msxml_cbdata cbdata;
     xmlTextReaderPtr reader = NULL;
     int ret                 = CL_SUCCESS;

     cli_dbgmsg("in cli_scanmsxml()\n");

     if (!ctx)
         return CL_ENULLARG;

     /* Zeroed state: no window yet, map position 0, initial parser state. */
     memset(&cbdata, 0, sizeof(cbdata));
     cbdata.map = *ctx->fmap;

     reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "msxml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
     if (!reader) {
         cli_dbgmsg("cli_scanmsxml: cannot initialize xmlReader\n");

 #if HAVE_JSON
         ret = cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_IO");
 #endif
         return ret; /* libxml2 failed! */
     }

     ret = cli_msxml_parse_document(ctx, reader, msxml_keys, num_msxml_keys, 1, NULL);

     xmlTextReaderClose(reader);
     xmlFreeTextReader(reader);
     return ret;
 #else
     UNUSEDPARAM(ctx);
     cli_dbgmsg("in cli_scanmsxml()\n");
     cli_dbgmsg("cli_scanmsxml: scanning msxml documents requires libxml2!\n");

     return CL_SUCCESS;
 #endif
 }