libclamav/msxml.c
a7ad8e7a
 /*
  * Extract component parts of MS XML files (e.g. MS Office 2003 XML Documents)
02840644
  *
e1cbc270
  * Copyright (C) 2013-2019 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
a7ad8e7a
  * Copyright (C) 2007-2013 Sourcefire, Inc.
02840644
  *
a7ad8e7a
  * Authors: Kevin Lin
02840644
  *
a7ad8e7a
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
02840644
  *
a7ad8e7a
  * This program is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
02840644
  *
a7ad8e7a
  * You should have received a copy of the GNU General Public License along with
  * this program; if not, write to the Free Software Foundation, Inc., 51
  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
4823482e
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
3da7c00a
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 
a7ad8e7a
 #include "clamav.h"
 #include "others.h"
3da7c00a
 #include "conv.h"
4823482e
 #include "json_api.h"
 #include "msxml.h"
5994bee6
 #include "msxml_parser.h"
4823482e
 
 #if HAVE_LIBXML2
 #include <libxml/xmlreader.h>
 
e014b623
 /* Compile-time switch for the cli_msxmlmsg() trace macro defined below:
  * non-zero forwards its messages to cli_dbgmsg(), zero compiles them out. */
 #define MSXML_VERBIOSE 0
616dbf8e
 /* cli_msxmlmsg(): trace logging for the read callback in this file.
  * It forwards its arguments to cli_dbgmsg() when MSXML_VERBIOSE is non-zero
  * and expands to nothing otherwise, so its arguments are not evaluated in
  * non-verbose builds. */
 #if MSXML_VERBIOSE
 #define cli_msxmlmsg(...) cli_dbgmsg(__VA_ARGS__)
 #else
 #define cli_msxmlmsg(...)
 #endif
 
40bda57a
 /* Upper bound, in bytes, on the size of a single fmap window requested by
  * msxml_read_cb_new_window(). SCANBUFF is defined outside this file. */
 #define MSXML_READBUFF SCANBUFF
 
27948a03
 // clang-format off
feb32f42
 
27948a03
 /*
  * Element table handed to cli_msxml_parse_document() by cli_scanmsxml().
  *
  * Each row is one struct key_entry initializer: two strings followed by a
  * combination of MSXML_* flags.
  * NOTE(review): the strings look like (lower-cased element name, name used
  * in the output) -- confirm against struct key_entry in msxml_parser.h.
  */
 static const struct key_entry msxml_keys[] = {
     { "worddocument",         "WordDocument",           MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
     { "workbook",             "Workbook",               MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
     { "bindata",              "BinaryData",             MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT },
     { "documentproperties",   "DocumentProperties",     MSXML_JSON_ROOT },
     { "author",               "Author",                 MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lastauthor",           "LastAuthor",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "revision",             "Revision",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "totaltime",            "TotalTime",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "created",              "Created",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lastsaved",            "LastSaved",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "pages",                "Pages",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "words",                "Words",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "characters",           "Characters",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "lines",                "Lines",                  MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "paragraph",            "Paragraph",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "characterswithspaces", "CharactersWithSpaces",   MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "version",              "Version",                MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "allowpng",             "AllowPNG",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
     { "fonts",                "Fonts",                  MSXML_IGNORE_ELEM },
     { "styles",               "Styles",                 MSXML_IGNORE_ELEM },
 };

 /* Number of rows in msxml_keys, derived from the array itself. */
 static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(msxml_keys[0]);
4823482e
 
27948a03
 // clang-format on
 
d49a7dba
 static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
40bda57a
 {
     const unsigned char *new_window = NULL;
     off_t new_mappos;
     size_t bytes;
 
02840644
     if ((size_t)cbdata->mappos == cbdata->map->len) {
616dbf8e
         cli_msxmlmsg("msxml_read_cb: fmap REALLY EOF\n");
40bda57a
         return 0;
     }
 
     new_mappos = cbdata->mappos + cbdata->winsize;
288057e9
     bytes      = MIN(cbdata->map->len - new_mappos, MSXML_READBUFF);
40bda57a
     if (!bytes) {
288057e9
         cbdata->window  = NULL;
         cbdata->winpos  = 0;
         cbdata->mappos  = cbdata->map->len;
40bda57a
         cbdata->winsize = 0;
 
616dbf8e
         cli_msxmlmsg("msxml_read_cb: fmap EOF\n");
40bda57a
         return 0;
     }
 
     new_window = fmap_need_off_once(cbdata->map, new_mappos, bytes);
     if (!new_window) {
         cli_errmsg("msxml_read_cb: cannot acquire new window for fmap\n");
         return -1;
     }
 
288057e9
     cbdata->window  = new_window;
     cbdata->winpos  = 0;
     cbdata->mappos  = new_mappos;
40bda57a
     cbdata->winsize = bytes;
 
616dbf8e
     cli_msxmlmsg("msxml_read_cb: acquired new window @ [%llu(+%llu)(max:%llu)]\n",
                  (long long unsigned)cbdata->mappos, (long long unsigned)(cbdata->mappos + cbdata->winsize),
                  (long long unsigned)cbdata->map->len);
40bda57a
 
     return bytes;
 }
 
02840644
 int msxml_read_cb(void *ctx, char *buffer, int buffer_len)
40bda57a
 {
     struct msxml_cbdata *cbdata = (struct msxml_cbdata *)ctx;
143cf9b6
     size_t wbytes, rbytes;
     int winret;
02840644
     size_t len = (size_t)buffer_len;
40bda57a
 
616dbf8e
     cli_msxmlmsg("msxml_read_cb called\n");
40bda57a
 
     /* initial iteration */
     if (!cbdata->window) {
         if ((winret = msxml_read_cb_new_window(cbdata)) <= 0)
             return winret;
     }
 
02840644
     cli_msxmlmsg("msxml_read_cb: requested %zu bytes from offset %llu\n", len, (long long unsigned)(cbdata->mappos + cbdata->winpos));
40bda57a
 
     wbytes = 0;
     rbytes = cbdata->winsize - cbdata->winpos;
 
d349c61d
     /* copying loop with preprocessing */
40bda57a
     while (wbytes < len) {
d349c61d
         const unsigned char *read_from;
         char *write_to = buffer + wbytes;
         enum msxml_state *state;
 #if MSXML_VERBIOSE
         size_t written;
 #endif
40bda57a
 
         if (!rbytes) {
             if ((winret = msxml_read_cb_new_window(cbdata)) < 0)
                 return winret;
             if (winret == 0) {
616dbf8e
                 cli_msxmlmsg("msxml_read_cb: propagating fmap EOF [%llu]\n", (long long unsigned)wbytes);
40bda57a
                 return (int)wbytes;
             }
 
             rbytes = cbdata->winsize;
         }
 
d349c61d
 #if MSXML_VERBIOSE
40bda57a
         written = MIN(rbytes, len - wbytes);
d349c61d
         cli_msxmlmsg("msxml_read_cb: copying from window [%llu(+%llu)] %llu->~%llu\n",
616dbf8e
                      (long long unsigned)(cbdata->winsize - rbytes), (long long unsigned)cbdata->winsize,
                      (long long unsigned)cbdata->winpos, (long long unsigned)(cbdata->winpos + written));
d349c61d
 #endif
40bda57a
 
d349c61d
         read_from = cbdata->window + cbdata->winpos;
288057e9
         state     = &(cbdata->state);
d349c61d
 
02840644
         while ((rbytes > 0) && (wbytes < len)) {
d349c61d
             switch (*state) {
288057e9
                 case MSXML_STATE_NORMAL:
                     if ((*read_from) == '&')
                         *state = MSXML_STATE_ENTITY_START_1;
                     break;
                 case MSXML_STATE_ENTITY_START_1:
                     if ((*read_from) == '#')
                         *state = MSXML_STATE_ENTITY_START_2;
                     else
                         *state = MSXML_STATE_NORMAL;
                     break;
                 case MSXML_STATE_ENTITY_START_2:
                     if ((*read_from) == 'x')
                         *state = MSXML_STATE_ENTITY_HEX;
                     else if (((*read_from) >= '0') && ((*read_from) <= '9'))
                         *state = MSXML_STATE_ENTITY_DEC;
                     else
                         *state = MSXML_STATE_NORMAL;
                     break;
                 case MSXML_STATE_ENTITY_HEX:
                     if ((((*read_from) >= '0') && ((*read_from) <= '9')) ||
                         (((*read_from) >= 'a') && ((*read_from) <= 'f')) ||
                         (((*read_from) >= 'A') && ((*read_from) <= 'F'))) {
                     } else
                         *state = MSXML_STATE_ENTITY_CLOSE;
                     break;
                 case MSXML_STATE_ENTITY_DEC:
                     if (((*read_from) >= '0') && ((*read_from) <= '9')) {
                     } else
                         *state = MSXML_STATE_ENTITY_CLOSE;
                     break;
                 default:
                     cli_errmsg("unknown *state: %d\n", *state);
d349c61d
             }
40bda57a
 
d349c61d
             if (*state == MSXML_STATE_ENTITY_CLOSE) {
                 if ((*read_from) != ';') {
                     cli_msxmlmsg("msxml_read_cb: detected unterminated character entity @ winoff %d\n",
                                  (int)(read_from - cbdata->window));
                     (*write_to++) = ';';
                     wbytes++;
                 }
                 *state = MSXML_STATE_NORMAL;
                 if (wbytes >= len)
                     break;
             }
 
             *(write_to++) = *(read_from++);
             rbytes--;
             wbytes++;
         }
40bda57a
     }
 
     cbdata->winpos = cbdata->winsize - rbytes;
     return (int)wbytes;
 }
acdf9a80
 #endif
 
102cd430
 /**
  * @brief Scan an MS XML document held in the context's fmap.
  *
  * With libxml2 available, an xmlTextReader is created on top of
  * msxml_read_cb() and the document is walked by cli_msxml_parse_document()
  * using the msxml_keys table. If the reader cannot be created, the failure
  * is recorded through cli_json_parse_error() when JSON support is built in;
  * otherwise CL_SUCCESS is returned.
  *
  * Without libxml2 the function only logs and returns CL_SUCCESS.
  *
  * @param ctx  Scan context; must not be NULL when libxml2 is available.
  *
  * @return CL_ENULLARG for a NULL context, otherwise the status described
  *         above.
  */
 cl_error_t cli_scanmsxml(cli_ctx *ctx)
 {
 #if HAVE_LIBXML2
     struct msxml_cbdata io_state;
     xmlTextReaderPtr xml_reader = NULL;
     cl_error_t status           = CL_SUCCESS;

     cli_dbgmsg("in cli_scanmsxml()\n");

     if (NULL == ctx) {
         return CL_ENULLARG;
     }

     /* Start with an empty window positioned at the beginning of the map. */
     memset(&io_state, 0, sizeof(io_state));
     io_state.map = *ctx->fmap;

     xml_reader = xmlReaderForIO(msxml_read_cb, NULL, &io_state, "msxml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
     if (NULL != xml_reader) {
         status = cli_msxml_parse_document(ctx, xml_reader, msxml_keys, num_msxml_keys, 1, NULL);

         xmlTextReaderClose(xml_reader);
         xmlFreeTextReader(xml_reader);
     } else {
         /* libxml2 failed! */
         cli_dbgmsg("cli_scanmsxml: cannot initialize xmlReader\n");

 #if HAVE_JSON
         status = cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_IO");
 #endif
     }

     return status;
 #else
     UNUSEDPARAM(ctx);
     cli_dbgmsg("in cli_scanmsxml()\n");
     cli_dbgmsg("cli_scanmsxml: scanning msxml documents requires libxml2!\n");

     return CL_SUCCESS;
 #endif
 }