libclamav/ooxml.c
25556519
 /*
  * OOXML JSON Internals
  * 
  * Copyright (C) 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  * 
  * Authors: Kevin Lin
  * 
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  * 
  * This program is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  * 
  * You should have received a copy of the GNU General Public License along with
  * this program; if not, write to the Free Software Foundation, Inc., 51
  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif

 #include <errno.h>
 
60d8d2c3
 #include "clamav.h"
25556519
 #include "cltypes.h"
 #include "others.h"
 #include "unzip.h"
e91e4383
 #if HAVE_JSON
51f8cc3c
 #include "json.h"
e91e4383
 #endif
25556519
 #include "json_api.h"
 
cd94be7a
 #include "ooxml.h"
 
25556519
 #if HAVE_LIBXML2
 #ifdef _WIN32
 #ifndef LIBXML_WRITER_ENABLED
 #define LIBXML_WRITER_ENABLED 1
 #endif
 #endif
 #include <libxml/xmlreader.h>
 #endif
 
871b862e
 /* Set to 1 to log the XML nodes that ooxml_parse_element does not handle. */
 #define OOXML_DEBUG 0
 
a41009bc
 #if HAVE_LIBXML2 && HAVE_JSON
 
25556519
 /* NOTE(review): not referenced in the visible part of this file; the limit
  * actually enforced is OOXML_JSON_RECLEVEL_MAX - confirm before removing. */
 #define OOXML_JSON_RECLEVEL 16
afd00a72
 /* Nesting depth at which ooxml_parse_element stops descending and skips the subtree. */
 #define OOXML_JSON_RECLEVEL_MAX 5
 /* Element names of this many characters or more are rejected by ooxml_check_key. */
 #define OOXML_JSON_STRLEN_MAX 100
25556519
 
6c951663
 /*
  * Translate a libxml2 text-reader status into an early return from the
  * enclosing function:
  *   -1 -> warn and return CL_EPARSE (reader error)
  *    0 -> return CL_BREAK (no more nodes to read)
  * Any other value falls through.  The argument is expanded twice, so pass a
  * plain variable rather than an expression with side effects.
  */
 #define check_state(state)                                              \
     do {                                                                \
         if ((state) == -1) {                                            \
             cli_warnmsg("check_state: CL_EPARSE @ ln%d\n", __LINE__);   \
             return CL_EPARSE;                                           \
         }                                                               \
         else if ((state) == 0) {                                        \
             cli_dbgmsg("check_state: CL_BREAK @ ln%d\n", __LINE__);     \
             return CL_BREAK;                                            \
         }                                                               \
     } while(0)
 
d48fd4bb
 /*
  * Decide whether the first `len` bytes of `value` are, in their entirety, a
  * base-10 integer that fits in an int32_t.
  *
  * value - NUL-terminated text to examine
  * len   - number of bytes of `value` that must be consumed by the number
  * val   - receives the parsed number on success; untouched otherwise
  *
  * Returns 1 when the text is an in-range integer, 0 otherwise (empty input,
  * no digits, trailing characters, or a value outside the int32_t range).
  */
 static int ooxml_is_int(const char *value, size_t len, int32_t *val)
 {
     long parsed;
     char *endptr = NULL;

     if (value == NULL || val == NULL || len == 0) {
         return 0;
     }

     errno = 0;
     parsed = strtol(value, &endptr, 10);

     /* reject input with no digits at all, or with anything after the number */
     if (endptr == value || endptr != value + len) {
         return 0;
     }

     /* errno catches overflow where long is 32 bits wide; the explicit bounds
      * catch values that fit in a wider long but not in an int32_t */
     if (errno == ERANGE || parsed > 2147483647L || parsed < (-2147483647L - 1L)) {
         return 0;
     }

     *val = (int32_t)parsed;

     return 1;
 }
 
2fd832e5
 /*
  * Append one text value to the JSON array named `arrname` inside `wrkptr`.
  *
  * The text is stored as a JSON integer when ooxml_is_int accepts it, as a
  * JSON boolean when it is exactly "true" or "false", and as a JSON string
  * otherwise.
  *
  * Returns CL_SUCCESS on success, CL_ENULLARG when there is no text to store,
  * and CL_EMEM when the array or the value cannot be created or added.
  */
 static int ooxml_parse_value(json_object *wrkptr, const char *arrname, const xmlChar *node_value)
 {
     json_object *newobj, *arrobj;
     int32_t val;

     /* a NULL value would otherwise be handed straight to strtol */
     if (node_value == NULL) {
         return CL_ENULLARG;
     }

     arrobj = cli_jsonarray(wrkptr, arrname);
     if (arrobj == NULL) {
         return CL_EMEM;
     }

     if (ooxml_is_int((const char *)node_value, xmlStrlen(node_value), &val)) {
         newobj = json_object_new_int(val);
     }
     else if (!xmlStrcmp(node_value, (const xmlChar *)"true")) {
         newobj = json_object_new_boolean(1);
     }
     else if (!xmlStrcmp(node_value, (const xmlChar *)"false")) {
         newobj = json_object_new_boolean(0);
     }
     else {
         newobj = json_object_new_string((const char *)node_value);
     }

     if (NULL == newobj) {
         cli_errmsg("ooxml_parse_value: no memory for json value for [%s]\n", arrname);
         return CL_EMEM;
     }

     /* the array only takes ownership on success; release the value otherwise */
     if (json_object_array_add(arrobj, newobj) != 0) {
         json_object_put(newobj);
         return CL_EMEM;
     }

     return CL_SUCCESS;
 }
 
afd00a72
 /*
  * Element names that ooxml_check_key recognizes, in lower case.  Entry i here
  * corresponds to entry i of ooxml_json_keys, so the two tables must be kept
  * in the same order and at the same length.
  */
 static const char *ooxml_keys[] = {
     "coreproperties",
     "title",
     "subject",
     "creator",
     "keywords",
     "comments",
     "description",
     "lastmodifiedby",
     "revision",
     "created",
     "modified",
     "category",
     "contentstatus",

     "properties",
     "application",
     "appversion",
     "characters",
     "characterswithspaces",
     "company",
     "digsig",
     "docsecurity",
     //"headingpairs",
     "hiddenslides",
     "hlinks",
     "hyperlinkbase",
     "hyperlinkschanged",
     "lines",
     "linksuptodate",
     "manager",
     "mmclips",
     "notes",
     "pages",
     "paragraphs",
     "presentationformat",
     "properties", /* duplicate of the entry above; the first match wins */
     "scalecrop",
     "shareddoc",
     "slides",
     "template",
     //"titlesofparts",
     "totaltime",
     "words"
 };

 /* JSON key emitted for the element name at the same index of ooxml_keys. */
 static const char *ooxml_json_keys[] = {
     "CoreProperties",
     "Title",
     "Subject",
     "Author",
     "Keywords",
     "Comments",
     "Description",
     "LastAuthor",
     "Revision",
     "Created",
     "Modified",
     "Category",
     "ContentStatus",

     "ExtendedProperties",
     "Application",
     "AppVersion",
     "Characters",
     "CharactersWithSpaces",
     "Company",
     "DigSig",
     "DocSecurity",
     //"HeadingPairs",
     "HiddenSlides",
     "HLinks",
     "HyperlinkBase",
     "HyperlinksChanged",
     "Lines",
     "LinksUpToDate",
     "Manager",
     "MultimediaClips",
     "Notes",
     "Pages",
     "Paragraphs",
     "PresentationFormat",
     "Properties",
     "ScaleCrop",
     "SharedDoc",
     "Slides",
     "Template",
     //"TitlesOfParts",
     "TotalTime",
     "Words"
 };

 /* Compile-time guard: the build fails if the two tables differ in length. */
 typedef char ooxml_key_tables_must_match[(sizeof(ooxml_keys) == sizeof(ooxml_json_keys)) ? 1 : -1];

 /* Number of entries in each table, derived so it cannot drift from them. */
 static const size_t num_ooxml_keys = sizeof(ooxml_keys) / sizeof(ooxml_keys[0]);
 
 static const char *ooxml_check_key(const char* key, size_t keylen)
 {
     unsigned i;
 
     if (keylen > OOXML_JSON_STRLEN_MAX-1) {
         cli_dbgmsg("ooxml_check_key: key name too long\n");
         return NULL;
     }
 
     for (i = 0; i < num_ooxml_keys; ++i) {
         //cli_dbgmsg("%d %d %s %s %s %s\n", keylen, strlen(ooxml_keys[i]), key, keycmp, ooxml_keys[i], ooxml_json_keys[i]);
871b862e
         if (keylen == strlen(ooxml_keys[i]) && !strncasecmp(key, ooxml_keys[i], keylen)) {
afd00a72
             return ooxml_json_keys[i];
         }
     }
 
     return NULL;
 }
a41009bc
 
3827b2cc
 /*
  * Convert the XML element the reader is positioned on, together with its
  * attributes, text and nested elements, into JSON attached to `wrkptr`.
  *
  * ctx    - scan context, used for the JSON timeout check
  * reader - libxml2 text reader positioned on an element start tag
  * wrkptr - JSON object that receives the object created for this element
  * rlvl   - nesting depth of this call; 0 for the outermost element
  * root   - JSON object of the outermost element, which receives the
  *          "HitRecursiveLimit" flag; at rlvl 0 the passed value is replaced
  *          by the object created here
  *
  * Elements whose local name is not listed in ooxml_keys, and elements at or
  * beyond OOXML_JSON_RECLEVEL_MAX, are skipped together with their subtree.
  *
  * Returns CL_SUCCESS once the element has been consumed, CL_BREAK when the
  * reader runs out of nodes, CL_ETIMEOUT when the timeout check fails, and
  * CL_EPARSE / CL_EFORMAT / CL_EMEM (or CL_ENULLARG from ooxml_parse_value)
  * on failure.
  */
 static int ooxml_parse_element(cli_ctx *ctx, xmlTextReaderPtr reader, json_object *wrkptr, int rlvl, json_object *root)
 {
     const char *element_tag = NULL, *end_tag = NULL;
     const xmlChar *node_name = NULL, *node_value = NULL;
     json_object *thisjobj = NULL;
     int node_type, ret = CL_SUCCESS, endtag = 0, toval = 0, state = 1;

     cli_dbgmsg("in ooxml_parse_element @ layer %d\n", rlvl);

     /* check recursion level */
     if (rlvl >= OOXML_JSON_RECLEVEL_MAX) {
         cli_dbgmsg("ooxml_parse_element: reached ooxml json recursion limit\n");
         cli_jsonbool(root, "HitRecursiveLimit", 1);
         /* skip it */
         state = xmlTextReaderNext(reader);
         check_state(state);
         return CL_SUCCESS;
     }

     /* acquire element type */
     node_type = xmlTextReaderNodeType(reader);
     if (node_type == -1)
         return CL_EPARSE;

     if (node_type != XML_READER_TYPE_ELEMENT) {
         cli_dbgmsg("ooxml_parse_element: first node typed %d, not %d\n", node_type, XML_READER_TYPE_ELEMENT);
         return CL_EPARSE; /* first type is not an element */
     }

     node_name = xmlTextReaderConstLocalName(reader);
     if (!node_name) {
         cli_dbgmsg("ooxml_parse_element: element tag node nameless\n");
         return CL_EPARSE; /* no name, nameless */
     }
     element_tag = ooxml_check_key((const char *)node_name, xmlStrlen(node_name));
     if (!element_tag) {
         cli_dbgmsg("ooxml_parse_element: invalid element tag [%s]\n", node_name);
         /* skip it */
         state = xmlTextReaderNext(reader);
         check_state(state);
         return CL_SUCCESS;
     }

     /* generate json object */
     thisjobj = cli_jsonobj(wrkptr, element_tag);
     if (!thisjobj) {
         return CL_EPARSE;
     }
     cli_dbgmsg("ooxml_parse_element: generated json object [%s]\n", element_tag);

     if (rlvl == 0)
         root = thisjobj;

     /* handle attributes */
     state = xmlTextReaderHasAttributes(reader);
     if (state == 1) {
         json_object *attributes;

         attributes = cli_jsonobj(thisjobj, "Attributes");
         if (!attributes) {
             return CL_EPARSE;
         }
         cli_dbgmsg("ooxml_parse_element: retrieved json object [Attributes]\n");

         while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
             const xmlChar *name, *value;
             name = xmlTextReaderConstLocalName(reader);
             value = xmlTextReaderConstValue(reader);
             if (name == NULL || value == NULL) continue;

             cli_dbgmsg("%s: %s\n", name, value);

             cli_jsonstr(attributes, (const char *)name, (const char *)value);
         }

         /* Step back from the last attribute to the element itself: while the
          * reader sits on an attribute, xmlTextReaderIsEmptyElement reports 0
          * and an empty element would be parsed as if it had content. */
         if (xmlTextReaderMoveToElement(reader) == -1)
             return CL_EPARSE;
     }
     else if (state == -1)
         return CL_EPARSE;

     /* an empty element (<tag/>) has no content and no end tag to wait for */
     state = xmlTextReaderIsEmptyElement(reader);
     if (state == 1) {
         state = xmlTextReaderNext(reader);
         check_state(state);
         return CL_SUCCESS;
     }
     else if (state == -1)
         return CL_EPARSE;

     /* advance to first content node */
     state = xmlTextReaderRead(reader);
     check_state(state);

     /* parse until the end element tag */
     while (!endtag) {
         if (cli_json_timeout_cycle_check(ctx, &toval) != CL_SUCCESS) {
             return CL_ETIMEOUT;
         }

         node_type = xmlTextReaderNodeType(reader);
         if (node_type == -1)
             return CL_EPARSE;

         switch (node_type) {
         case XML_READER_TYPE_ELEMENT:
             /* the recursive call leaves the reader just past the child */
             ret = ooxml_parse_element(ctx, reader, thisjobj, rlvl+1, root);
             if (ret != CL_SUCCESS) {
                 return ret;
             }
             break;

         case XML_READER_TYPE_END_ELEMENT:
             cli_dbgmsg("in ooxml_parse_element @ layer %d closed\n", rlvl);
             node_name = xmlTextReaderConstLocalName(reader);
             if (!node_name) {
                 cli_dbgmsg("ooxml_parse_element: element end tag node nameless\n");
                 return CL_EPARSE; /* no name, nameless */
             }

             end_tag = ooxml_check_key((const char *)node_name, xmlStrlen(node_name));
             if (!end_tag) {
                 cli_dbgmsg("ooxml_parse_element: invalid element end tag [%s]\n", node_name);
                 return CL_EFORMAT; /* unrecognized element tag */
             }
             /* full comparison, so that one key being a prefix of another
              * (e.g. Characters / CharactersWithSpaces) is not a match */
             if (strcmp(element_tag, end_tag)) {
                 cli_dbgmsg("ooxml_parse_element: element tag does not match end tag\n");
                 return CL_EFORMAT;
             }

             /* advance to next element tag */
             state = xmlTextReaderRead(reader);
             check_state(state);

             endtag = 1;
             break;

         case XML_READER_TYPE_TEXT:
             node_value = xmlTextReaderConstValue(reader);
             if (!node_value) {
                 cli_dbgmsg("ooxml_parse_element: text node without a value\n");
                 return CL_EPARSE;
             }

             ret = ooxml_parse_value(thisjobj, "Value", node_value);
             if (ret != CL_SUCCESS)
                 return ret;

             cli_dbgmsg("ooxml_parse_element: added json value [%s: %s]\n", element_tag, node_value);

             /* advance to next element tag */
             state = xmlTextReaderRead(reader);
             check_state(state);

             break;

         default:
 #if OOXML_DEBUG
             node_name = xmlTextReaderConstLocalName(reader);
             node_value = xmlTextReaderConstValue(reader);

             cli_dbgmsg("ooxml_parse_element: unhandled xml node %s [%d]: %s\n", node_name, node_type, node_value);
 #endif
             /* Skip the unhandled node (whitespace, comment, ...) and keep
              * going: returning here would abandon this element before its
              * end tag and hand the rest of its content to the parent. */
             state = xmlTextReaderNext(reader);
             check_state(state);
             break;
         }
     }

     return CL_SUCCESS;
 }
 
 /*
  * Parse one properties XML document read from `fd` and add its contents to
  * ctx->wrkproperty.
  *
  * A failure to create the reader, or to read the first node, is treated as
  * non-fatal and reported as CL_SUCCESS.  Otherwise the result of
  * ooxml_parse_element is returned; any result other than CL_SUCCESS,
  * CL_ETIMEOUT or CL_BREAK also sets "ParseError" on ctx->wrkproperty.
  */
 static int ooxml_parse_document(int fd, cli_ctx *ctx)
 {
     int ret = CL_SUCCESS;
     xmlTextReaderPtr reader = NULL;

     cli_dbgmsg("in ooxml_parse_document\n");

     reader = xmlReaderForFd(fd, "properties.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
     if (reader == NULL) {
         cli_dbgmsg("ooxml_parse_document: xmlReaderForFd error\n");
         return CL_SUCCESS; // internal error from libxml2
     }

     /* move reader to first element */
     if (xmlTextReaderRead(reader) != 1) {
         /* libxml2 failed; the reader still has to be released below */
         ret = CL_SUCCESS;
         goto ooxml_document_exit;
     }

     ret = ooxml_parse_element(ctx, reader, ctx->wrkproperty, 0, NULL);

     if (ret != CL_SUCCESS && ret != CL_ETIMEOUT && ret != CL_BREAK) {
         cli_warnmsg("ooxml_parse_document: encountered issue in parsing properties document\n");
         cli_jsonbool(ctx->wrkproperty, "ParseError", 1);
     }

  ooxml_document_exit:
     xmlTextReaderClose(reader);
     xmlFreeTextReader(reader);
     return ret;
 }
25556519
 
 /* Callback for the core properties part: parses the document read from fd. */
 static int ooxml_core_cb(int fd, cli_ctx *ctx)
 {
     int status;

     cli_dbgmsg("in ooxml_core_cb\n");

     status = ooxml_parse_document(fd, ctx);
     return status;
 }
 
 /* Callback for the extended properties part: parses the document read from fd. */
 static int ooxml_extn_cb(int fd, cli_ctx *ctx)
 {
     int status;

     cli_dbgmsg("in ooxml_extn_cb\n");

     status = ooxml_parse_document(fd, ctx);
     return status;
 }
 
 /*
  * Callback for "[Content_Types].xml": walks its <Override> entries, locates
  * the core, extended and custom properties parts by content type, and hands
  * the core and extended parts to their parsing callbacks.  Digital signature
  * entries are only counted.
  *
  * Only the first core / extended / custom part that is found in the archive
  * is processed.  Counts of found and missing parts are recorded on
  * ctx->wrkproperty when the whole document has been read without error.
  *
  * Returns CL_SUCCESS when libxml2 cannot create a reader and when the scan
  * completes; otherwise the first non-success result (CL_ETIMEOUT from the
  * timeout check or unzip_search, or whatever unzip_single_internal returned).
  */
 static int ooxml_content_cb(int fd, cli_ctx *ctx)
 {
     int ret = CL_SUCCESS, tmp, toval = 0;
     int core=0, extn=0, cust=0, dsig=0;
     int mcore=0, mextn=0, mcust=0;
     int have_pn;
     const xmlChar *name, *value, *CT, *PN;
     xmlTextReaderPtr reader = NULL;
     uint32_t loff = 0;

     cli_dbgmsg("in ooxml_content_cb\n");

     reader = xmlReaderForFd(fd, "[Content_Types].xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
     if (reader == NULL) {
         cli_dbgmsg("ooxml_content_cb: xmlReaderForFd error for ""[Content_Types].xml""\n");
         return CL_SUCCESS; // libxml2 failed!
     }

     /* locate core-properties, extended-properties, and custom-properties (optional)  */
     while (xmlTextReaderRead(reader) == 1) {
         if (cli_json_timeout_cycle_check(ctx, &toval) != CL_SUCCESS) {
             ret = CL_ETIMEOUT;
             goto ooxml_content_exit;
         }

         name = xmlTextReaderConstLocalName(reader);
         if (name == NULL) continue;

         if (strcmp((const char *)name, "Override")) continue;

         if (xmlTextReaderHasAttributes(reader) != 1) continue;

         CT = PN = NULL;
         while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
             name = xmlTextReaderConstLocalName(reader);
             value = xmlTextReaderConstValue(reader);
             if (name == NULL || value == NULL) continue;

             if (!xmlStrcmp(name, (const xmlChar *)"ContentType")) {
                 CT = value;
             }
             else if (!xmlStrcmp(name, (const xmlChar *)"PartName")) {
                 PN = value;
             }

             cli_dbgmsg("%s: %s\n", name, value);
         }

         /* nothing can match without a content type */
         if (!CT) continue;

         /* The searches below skip the first character of the part name, so a
          * missing or empty PartName must never reach them. */
         have_pn = (PN != NULL && PN[0] != '\0');

         if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-package.core-properties+xml")) {
             if (!core && have_pn) {
                 /* default: /docProps/core.xml*/
                 tmp = unzip_search(ctx, (const char *)(PN+1), xmlStrlen(PN)-1, &loff);
                 if (tmp == CL_ETIMEOUT) {
                     ret = tmp;
                 }
                 else if (tmp != CL_VIRUS) {
                     cli_dbgmsg("cli_process_ooxml: failed to find core properties file \"%s\"!\n", PN);
                     mcore++;
                 }
                 else {
                     cli_dbgmsg("ooxml_content_cb: found core properties file \"%s\" @ %x\n", PN, loff);
                     ret = unzip_single_internal(ctx, loff, ooxml_core_cb);
                     core++;
                 }
             }
         }
         else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-officedocument.extended-properties+xml")) {
             if (!extn && have_pn) {
                 /* default: /docProps/app.xml */
                 tmp = unzip_search(ctx, (const char *)(PN+1), xmlStrlen(PN)-1, &loff);
                 if (tmp == CL_ETIMEOUT) {
                     ret = tmp;
                 }
                 else if (tmp != CL_VIRUS) {
                     cli_dbgmsg("cli_process_ooxml: failed to find extended properties file \"%s\"!\n", PN);
                     mextn++;
                 }
                 else {
                     cli_dbgmsg("ooxml_content_cb: found extended properties file \"%s\" @ %x\n", PN, loff);
                     ret = unzip_single_internal(ctx, loff, ooxml_extn_cb);
                     extn++;
                 }
             }
         }
         else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-officedocument.custom-properties+xml")) {
             if (!cust && have_pn) {
                 /* default: /docProps/custom.xml */
                 tmp = unzip_search(ctx, (const char *)(PN+1), xmlStrlen(PN)-1, &loff);
                 if (tmp == CL_ETIMEOUT) {
                     ret = tmp;
                 }
                 else if (tmp != CL_VIRUS) {
                     cli_dbgmsg("cli_process_ooxml: failed to find custom properties file \"%s\"!\n", PN);
                     mcust++;
                 }
                 else {
                     /* custom properties are located and counted, not parsed */
                     cli_dbgmsg("ooxml_content_cb: found custom properties file \"%s\" @ %x\n", PN, loff);
                     cust++;
                 }
             }
         }
         else if (!xmlStrcmp(CT, (const xmlChar *)"application/vnd.openxmlformats-package.digital-signature-xmlsignature+xml")) {
             dsig++;
         }

         if (ret != CL_SUCCESS)
             goto ooxml_content_exit;
     }

     if (core)
         cli_jsonint(ctx->wrkproperty, "CorePropertiesFileCount", core);
     else if (!mcore)
         cli_dbgmsg("cli_process_ooxml: file does not contain core properties file\n");
     if (mcore)
         cli_jsonint(ctx->wrkproperty, "CorePropertiesMissingFileCount", mcore);

     if (extn)
         cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesFileCount", extn);
     else if (!mextn)
         cli_dbgmsg("cli_process_ooxml: file does not contain extended properties file\n");
     if (mextn)
         cli_jsonint(ctx->wrkproperty, "ExtendedPropertiesMissingFileCount", mextn);

     if (cust)
         cli_jsonint(ctx->wrkproperty, "CustomPropertiesFileCount", cust);
     else if (!mcust)
         cli_dbgmsg("cli_process_ooxml: file does not contain custom properties file\n");
     if (mcust)
         cli_jsonint(ctx->wrkproperty, "CustomPropertiesMissingFileCount", mcust);

     if (dsig) {
         cli_jsonint(ctx->wrkproperty, "DigitalSignaturesCount", dsig);
     }

  ooxml_content_exit:
     xmlTextReaderClose(reader);
     xmlFreeTextReader(reader);
     return ret;
 }
20b45621
 #endif /* HAVE_LIBXML2 && HAVE_JSON */
25556519
 
 /*
  * Entry point for OOXML property extraction: locates "[Content_Types].xml"
  * in the archive and processes it with ooxml_content_cb.
  *
  * Returns CL_ENULLARG when ctx is NULL, CL_ETIMEOUT when the search times
  * out, CL_EFORMAT when "[Content_Types].xml" is not found, and otherwise the
  * result of unzip_single_internal.  When libxml2 or JSON support is not
  * compiled in, it only logs that fact and returns CL_SUCCESS.
  */
 int cli_process_ooxml(cli_ctx *ctx)
 {
 #if HAVE_LIBXML2 && HAVE_JSON
     /* kept in an array so the length passed to unzip_search always matches */
     static const char content_types[] = "[Content_Types].xml";
     uint32_t loff = 0;
     int tmp = CL_SUCCESS;

     cli_dbgmsg("in cli_processooxml\n");
     if (!ctx) {
         return CL_ENULLARG;
     }

     /* find "[Content_Types].xml" */
     tmp = unzip_search(ctx, content_types, sizeof(content_types) - 1, &loff);
     if (tmp == CL_ETIMEOUT) {
         return CL_ETIMEOUT;
     }
     else if (tmp != CL_VIRUS) {
         /* unzip_search reports a hit as CL_VIRUS; anything else is a miss */
         cli_dbgmsg("cli_process_ooxml: failed to find ""[Content_Types].xml""!\n");
         return CL_EFORMAT;
     }
     cli_dbgmsg("cli_process_ooxml: found ""[Content_Types].xml"" @ %x\n", loff);

     return unzip_single_internal(ctx, loff, ooxml_content_cb);
 #else
     UNUSEDPARAM(ctx);
     cli_dbgmsg("in cli_processooxml\n");
 #if !HAVE_LIBXML2
     cli_dbgmsg("cli_process_ooxml: libxml2 needs to be enabled!\n");
 #endif
 #if !HAVE_JSON
     cli_dbgmsg("cli_process_ooxml: libjson needs to be enabled!\n");
 #endif
     return CL_SUCCESS;
 #endif
 }