Browse code

HWPML: added hwpml_keys for hwpml parsing

Kevin Lin authored on 2015/12/16 03:01:40
Showing 5 changed files
... ...
@@ -23,8 +23,13 @@
23 23
 #include "clamav-config.h"
24 24
 #endif
25 25
 
26
-#if HAVE_ICONV
27
-#include <iconv.h>
26
+#if HAVE_LIBXML2
27
+#ifdef _WIN32
28
+#ifndef LIBXML_WRITER_ENABLED
29
+#define LIBXML_WRITER_ENABLED 1
30
+#endif
31
+#endif
32
+#include <libxml/xmlreader.h>
28 33
 #endif
29 34
 
30 35
 #include <sys/types.h>
... ...
@@ -39,6 +44,8 @@
39 39
 #include "str.h"
40 40
 #include "others.h"
41 41
 #include "scanners.h"
42
+#include "msxml_parser.h"
43
+#include "msxml.h"
42 44
 #include "json_api.h"
43 45
 #include "hwp.h"
44 46
 #if HAVE_JSON
... ...
@@ -47,6 +54,7 @@
47 47
 
48 48
 #define HWP5_DEBUG 0
49 49
 #define HWP3_DEBUG 1
50
+#define HWPML_DEBUG 1
50 51
 #if HWP5_DEBUG
51 52
 #define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__)
52 53
 #else
... ...
@@ -57,6 +65,11 @@
57 57
 #else
58 58
 #define hwp3_debug(...) ;
59 59
 #endif
60
+#if HWPML_DEBUG
61
+#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
62
+#else
63
+#define hwpml_debug(...) ;
64
+#endif
60 65
 
61 66
 typedef int (*hwp_cb )(void *cbdata, int fd, cli_ctx *ctx);
62 67
 static int decompress_and_callback(cli_ctx *ctx, fmap_t *input, off_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata)
... ...
@@ -488,8 +501,8 @@ static inline int parsehwp3_docsummary(cli_ctx *ctx, off_t offset)
488 488
             return ret;
489 489
     }
490 490
 #else
491
-    UNUSED(ctx);
492
-    UNUSED(offset);
491
+    UNUSEDPARAM(ctx);
492
+    UNUSEDPARAM(offset);
493 493
 #endif
494 494
     return CL_SUCCESS;
495 495
 }
... ...
@@ -637,3 +650,80 @@ int cli_scanhwp3(cli_ctx *ctx)
637 637
 
638 638
     return ret;
639 639
 }
640
+
641
+/*** HWPML (hijacking the msxml parser) ***/
642
+
643
+static const struct key_entry hwpml_keys[] = {
644
+    { "hwpml",              "HWPML",              MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
645
+
646
+    /* HEAD - Document Properties */
647
+    { "head",               "Head",               MSXML_JSON_ROOT },
648
+    { "docsummary",         "DocumentProperties", MSXML_JSON_WRKPTR },
649
+    { "title",              "Title",              MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
650
+    { "author",             "Author",             MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
651
+    { "date",               "Date",               MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
652
+    { "docsetting",         "DocumentSettings",   MSXML_JSON_WRKPTR },
653
+    { "beginnumber",        "BeginNumber",        MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
654
+    { "caretpos",           "CaretPos",           MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
655
+    { "bindatalist",        "BinDataList",        MSXML_JSON_WRKPTR },
656
+    { "binitem",            "BinItem",            MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
657
+    { "facenamelist",       "FaceNameList",       MSXML_IGNORE_ELEM }, /* fonts list */
658
+    { "borderfilllist",     "BorderFillList",     MSXML_IGNORE_ELEM }, /* borders list */
659
+    { "charshapelist",      "CharShapeList",      MSXML_IGNORE_ELEM }, /* character shapes */
660
+    { "tabdeflist",         "TableDefList",       MSXML_IGNORE_ELEM }, /* table defs */
661
+    { "numberinglist",      "NumberingList",      MSXML_IGNORE_ELEM }, /* numbering list */
662
+    { "parashapelist",      "ParagraphShapeList", MSXML_IGNORE_ELEM }, /* paragraph shapes */
663
+    { "stylelist",          "StyleList",          MSXML_IGNORE_ELEM }, /* styles */
664
+    { "compatibledocument", "WordCompatibility",  MSXML_IGNORE_ELEM }, /* word compatibility data */
665
+
666
+    /* BODY - Document Contents */
667
+    { "body",               "Body",               MSXML_IGNORE_ELEM }, /* document contents (we could build a document contents summary) */
668
+
669
+    /* TAIL - Document Attachments */
670
+    { "tail",               "Tail",               MSXML_JSON_ROOT },
671
+    { "bindatastorage",     "BinaryDataStorage",  MSXML_JSON_WRKPTR },
672
+    { "bindata",            "BinaryData",         MSXML_SCAN_B64 | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
673
+    { "scriptcode",         "ScriptCodeStorage",  MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
674
+    { "scriptheader",       "ScriptHeader",       MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
675
+    { "scriptsource",       "ScriptSource",       MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }
676
+};
677
+static size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(struct key_entry);
678
+
679
+int cli_scanhwpml(cli_ctx *ctx)
680
+{
681
+#if HAVE_LIBXML2
682
+    struct msxml_cbdata cbdata;
683
+    xmlTextReaderPtr reader = NULL;
684
+    int state, ret = CL_SUCCESS;
685
+
686
+    cli_dbgmsg("in cli_scanhwpml()\n");
687
+
688
+    if (!ctx)
689
+        return CL_ENULLARG;
690
+
691
+    memset(&cbdata, 0, sizeof(cbdata));
692
+    cbdata.map = *ctx->fmap;
693
+
694
+    reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
695
+    if (!reader) {
696
+        cli_dbgmsg("cli_scanhwpml: cannot intialize xmlReader\n");
697
+
698
+#if HAVE_JSON
699
+        ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO");
700
+#endif
701
+        return ret; // libxml2 failed!
702
+    }
703
+
704
+    ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1);
705
+
706
+    xmlTextReaderClose(reader);
707
+    xmlFreeTextReader(reader);
708
+    return ret;
709
+#else
710
+    UNUSEDPARAM(ctx);
711
+    cli_dbgmsg("in cli_scanhwpml()\n");
712
+    cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n");
713
+
714
+    return CL_SUCCESS;
715
+#endif
716
+}
... ...
@@ -51,4 +51,7 @@ int cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd);
51 51
 /* HWP 3.0 - UNIQUE FORMAT */
52 52
 int cli_scanhwp3(cli_ctx *ctx);
53 53
 
54
+/* HWPML - SINGLE XML DOCUMENT (similar to MSXML) */
55
+int cli_scanhwpml(cli_ctx *ctx);
56
+
54 57
 #endif /* __HWP_H__ */
... ...
@@ -79,24 +79,6 @@ static const struct key_entry msxml_keys[] = {
79 79
 };
80 80
 static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
81 81
 
82
-enum msxml_state {
83
-    MSXML_STATE_NORMAL = 0,
84
-    MSXML_STATE_ENTITY_START_1,
85
-    MSXML_STATE_ENTITY_START_2,
86
-    MSXML_STATE_ENTITY_HEX,
87
-    MSXML_STATE_ENTITY_DEC,
88
-    MSXML_STATE_ENTITY_CLOSE,
89
-    MSXML_STATE_ENTITY_NONE
90
-};
91
-
92
-struct msxml_cbdata {
93
-    enum msxml_state state;
94
-    fmap_t *map;
95
-    const unsigned char *window;
96
-    off_t winpos, mappos;
97
-    size_t winsize;
98
-};
99
-
100 82
 static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
101 83
 {
102 84
     const unsigned char *new_window = NULL;
... ...
@@ -30,6 +30,25 @@
30 30
 
31 31
 #include "others.h"
32 32
 
33
+enum msxml_state {
34
+    MSXML_STATE_NORMAL = 0,
35
+    MSXML_STATE_ENTITY_START_1,
36
+    MSXML_STATE_ENTITY_START_2,
37
+    MSXML_STATE_ENTITY_HEX,
38
+    MSXML_STATE_ENTITY_DEC,
39
+    MSXML_STATE_ENTITY_CLOSE,
40
+    MSXML_STATE_ENTITY_NONE
41
+};
42
+
43
+struct msxml_cbdata {
44
+    enum msxml_state state;
45
+    fmap_t *map;
46
+    const unsigned char *window;
47
+    off_t winpos, mappos;
48
+    size_t winsize;
49
+};
50
+
51
+int msxml_read_cb(void *ctx, char *buffer, int len);
33 52
 int cli_scanmsxml(cli_ctx *ctx);
34 53
 
35 54
 #endif /* __MSXML_H */
... ...
@@ -2275,6 +2275,9 @@ static int cli_scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_file_
2275 2275
                 case CL_TYPE_XML_XL:
2276 2276
                     ret = cli_scanmsxml(ctx);
2277 2277
                     break;
2278
+                case CL_TYPE_XML_HWP:
2279
+                    ret = cli_scanhwpml(ctx);
2280
+                    break;
2278 2281
                 case CL_TYPE_RARSFX:
2279 2282
                     if(type != CL_TYPE_RAR && have_rar && SCAN_ARCHIVE && (DCONF_ARCH & ARCH_CONF_RAR)) {
2280 2283
                         char *tmpname = NULL;
... ...
@@ -2681,7 +2684,8 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
2681 2681
                 type == CL_TYPE_OOXML_XL ||
2682 2682
                 type == CL_TYPE_XML_WORD ||
2683 2683
                 type == CL_TYPE_XML_XL ||
2684
-                type == CL_TYPE_HWP3) {
2684
+                type == CL_TYPE_HWP3 ||
2685
+                type == CL_TYPE_XML_HWP) {
2685 2686
                 ctx->properties = json_object_new_object();
2686 2687
                 if (NULL == ctx->properties) {
2687 2688
                     cli_errmsg("magic_scandesc: no memory for json properties object\n");
... ...
@@ -2851,6 +2855,10 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type)
2851 2851
         ret = cli_scanmsxml(ctx);
2852 2852
         break;
2853 2853
 
2854
+    case CL_TYPE_XML_HWP:
2855
+        ret = cli_scanhwpml(ctx);
2856
+        break;
2857
+
2854 2858
     case CL_TYPE_XDP:
2855 2859
         ret = cli_scanxdp(ctx);
2856 2860
         break;