... | ... |
@@ -23,8 +23,13 @@ |
23 | 23 |
#include "clamav-config.h" |
24 | 24 |
#endif |
25 | 25 |
|
26 |
-#if HAVE_ICONV |
|
27 |
-#include <iconv.h> |
|
26 |
+#if HAVE_LIBXML2 |
|
27 |
+#ifdef _WIN32 |
|
28 |
+#ifndef LIBXML_WRITER_ENABLED |
|
29 |
+#define LIBXML_WRITER_ENABLED 1 |
|
30 |
+#endif |
|
31 |
+#endif |
|
32 |
+#include <libxml/xmlreader.h> |
|
28 | 33 |
#endif |
29 | 34 |
|
30 | 35 |
#include <sys/types.h> |
... | ... |
@@ -39,6 +44,8 @@ |
39 | 39 |
#include "str.h" |
40 | 40 |
#include "others.h" |
41 | 41 |
#include "scanners.h" |
42 |
+#include "msxml_parser.h" |
|
43 |
+#include "msxml.h" |
|
42 | 44 |
#include "json_api.h" |
43 | 45 |
#include "hwp.h" |
44 | 46 |
#if HAVE_JSON |
... | ... |
@@ -47,6 +54,7 @@ |
47 | 47 |
|
48 | 48 |
#define HWP5_DEBUG 0 |
49 | 49 |
#define HWP3_DEBUG 1 |
50 |
+#define HWPML_DEBUG 1 |
|
50 | 51 |
#if HWP5_DEBUG |
51 | 52 |
#define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__) |
52 | 53 |
#else |
... | ... |
@@ -57,6 +65,11 @@ |
57 | 57 |
#else |
58 | 58 |
#define hwp3_debug(...) ; |
59 | 59 |
#endif |
60 |
/*
 * hwpml_debug(): debug tracing for the HWPML scanner.
 *
 * When HWPML_DEBUG is non-zero the arguments are forwarded unchanged to
 * cli_dbgmsg(); otherwise the call compiles to an empty statement.
 */
#if HWPML_DEBUG
#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
/*
 * do { } while (0) instead of a bare ';' so that the usual call form
 * "hwpml_debug(...);" stays a single statement and
 * "if (x) hwpml_debug(...); else ..." still parses.
 */
#define hwpml_debug(...) do { } while (0)
#endif
|
60 | 65 |
|
61 | 66 |
typedef int (*hwp_cb )(void *cbdata, int fd, cli_ctx *ctx); |
62 | 67 |
static int decompress_and_callback(cli_ctx *ctx, fmap_t *input, off_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata) |
... | ... |
@@ -488,8 +501,8 @@ static inline int parsehwp3_docsummary(cli_ctx *ctx, off_t offset) |
488 | 488 |
return ret; |
489 | 489 |
} |
490 | 490 |
#else |
491 |
- UNUSED(ctx); |
|
492 |
- UNUSED(offset); |
|
491 |
+ UNUSEDPARAM(ctx); |
|
492 |
+ UNUSEDPARAM(offset); |
|
493 | 493 |
#endif |
494 | 494 |
return CL_SUCCESS; |
495 | 495 |
} |
... | ... |
@@ -637,3 +650,80 @@ int cli_scanhwp3(cli_ctx *ctx) |
637 | 637 |
|
638 | 638 |
return ret; |
639 | 639 |
} |
640 |
+ |
|
641 |
+/*** HWPML (hijacking the msxml parser) ***/ |
|
642 |
+ |
|
643 |
+static const struct key_entry hwpml_keys[] = { |
|
644 |
+ { "hwpml", "HWPML", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB }, |
|
645 |
+ |
|
646 |
+ /* HEAD - Document Properties */ |
|
647 |
+ { "head", "Head", MSXML_JSON_ROOT }, |
|
648 |
+ { "docsummary", "DocumentProperties", MSXML_JSON_WRKPTR }, |
|
649 |
+ { "title", "Title", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }, |
|
650 |
+ { "author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }, |
|
651 |
+ { "date", "Date", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }, |
|
652 |
+ { "docsetting", "DocumentSettings", MSXML_JSON_WRKPTR }, |
|
653 |
+ { "beginnumber", "BeginNumber", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB }, |
|
654 |
+ { "caretpos", "CaretPos", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB }, |
|
655 |
+ { "bindatalist", "BinDataList", MSXML_JSON_WRKPTR }, |
|
656 |
+ { "binitem", "BinItem", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB }, |
|
657 |
+ { "facenamelist", "FaceNameList", MSXML_IGNORE_ELEM }, /* fonts list */ |
|
658 |
+ { "borderfilllist", "BorderFillList", MSXML_IGNORE_ELEM }, /* borders list */ |
|
659 |
+ { "charshapelist", "CharShapeList", MSXML_IGNORE_ELEM }, /* character shapes */ |
|
660 |
+ { "tabdeflist", "TableDefList", MSXML_IGNORE_ELEM }, /* table defs */ |
|
661 |
+ { "numberinglist", "NumberingList", MSXML_IGNORE_ELEM }, /* numbering list */ |
|
662 |
+ { "parashapelist", "ParagraphShapeList", MSXML_IGNORE_ELEM }, /* paragraph shapes */ |
|
663 |
+ { "stylelist", "StyleList", MSXML_IGNORE_ELEM }, /* styles */ |
|
664 |
+ { "compatibledocument", "WordCompatibility", MSXML_IGNORE_ELEM }, /* word compatibility data */ |
|
665 |
+ |
|
666 |
+ /* BODY - Document Contents */ |
|
667 |
+ { "body", "Body", MSXML_IGNORE_ELEM }, /* document contents (we could build a document contents summary */ |
|
668 |
+ |
|
669 |
+ /* TAIL - Document Attachments */ |
|
670 |
+ { "tail", "Tail", MSXML_JSON_ROOT }, |
|
671 |
+ { "bindatastorage", "BinaryDataStorage", MSXML_JSON_WRKPTR }, |
|
672 |
+ { "bindata", "BinaryData", MSXML_SCAN_B64 | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB }, |
|
673 |
+ { "scriptcode", "ScriptCodeStorage", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB }, |
|
674 |
+ { "scriptheader", "ScriptHeader", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE }, |
|
675 |
+ { "scriptsource", "ScriptSource", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE } |
|
676 |
+}; |
|
677 |
+static size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(struct key_entry); |
|
678 |
+ |
|
679 |
+int cli_scanhwpml(cli_ctx *ctx) |
|
680 |
+{ |
|
681 |
+#if HAVE_LIBXML2 |
|
682 |
+ struct msxml_cbdata cbdata; |
|
683 |
+ xmlTextReaderPtr reader = NULL; |
|
684 |
+ int state, ret = CL_SUCCESS; |
|
685 |
+ |
|
686 |
+ cli_dbgmsg("in cli_scanhwpml()\n"); |
|
687 |
+ |
|
688 |
+ if (!ctx) |
|
689 |
+ return CL_ENULLARG; |
|
690 |
+ |
|
691 |
+ memset(&cbdata, 0, sizeof(cbdata)); |
|
692 |
+ cbdata.map = *ctx->fmap; |
|
693 |
+ |
|
694 |
+ reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS); |
|
695 |
+ if (!reader) { |
|
696 |
+ cli_dbgmsg("cli_scanhwpml: cannot intialize xmlReader\n"); |
|
697 |
+ |
|
698 |
+#if HAVE_JSON |
|
699 |
+ ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO"); |
|
700 |
+#endif |
|
701 |
+ return ret; // libxml2 failed! |
|
702 |
+ } |
|
703 |
+ |
|
704 |
+ ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1); |
|
705 |
+ |
|
706 |
+ xmlTextReaderClose(reader); |
|
707 |
+ xmlFreeTextReader(reader); |
|
708 |
+ return ret; |
|
709 |
+#else |
|
710 |
+ UNUSEDPARAM(ctx); |
|
711 |
+ cli_dbgmsg("in cli_scanhwpml()\n"); |
|
712 |
+ cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n"); |
|
713 |
+ |
|
714 |
+ return CL_SUCCESS; |
|
715 |
+#endif |
|
716 |
+} |
... | ... |
@@ -51,4 +51,7 @@ int cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd); |
51 | 51 |
/* HWP 3.0 - UNIQUE FORMAT */ |
52 | 52 |
int cli_scanhwp3(cli_ctx *ctx); |
53 | 53 |
|
54 |
+/* HWPML - SINGLE XML DOCUMENT (similar to MSXML) */ |
|
55 |
+int cli_scanhwpml(cli_ctx *ctx); |
|
56 |
+ |
|
54 | 57 |
#endif /* __HWP_H__ */ |
... | ... |
@@ -79,24 +79,6 @@ static const struct key_entry msxml_keys[] = { |
79 | 79 |
}; |
80 | 80 |
static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry); |
81 | 81 |
|
82 |
-enum msxml_state { |
|
83 |
- MSXML_STATE_NORMAL = 0, |
|
84 |
- MSXML_STATE_ENTITY_START_1, |
|
85 |
- MSXML_STATE_ENTITY_START_2, |
|
86 |
- MSXML_STATE_ENTITY_HEX, |
|
87 |
- MSXML_STATE_ENTITY_DEC, |
|
88 |
- MSXML_STATE_ENTITY_CLOSE, |
|
89 |
- MSXML_STATE_ENTITY_NONE |
|
90 |
-}; |
|
91 |
- |
|
92 |
-struct msxml_cbdata { |
|
93 |
- enum msxml_state state; |
|
94 |
- fmap_t *map; |
|
95 |
- const unsigned char *window; |
|
96 |
- off_t winpos, mappos; |
|
97 |
- size_t winsize; |
|
98 |
-}; |
|
99 |
- |
|
100 | 82 |
static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata) |
101 | 83 |
{ |
102 | 84 |
const unsigned char *new_window = NULL; |
... | ... |
@@ -30,6 +30,25 @@ |
30 | 30 |
|
31 | 31 |
#include "others.h" |
32 | 32 |
|
33 |
/*
 * State values stored in msxml_cbdata.state.
 * NOTE(review): the names suggest a small state machine that tracks
 * entity sequences while reading - confirm against msxml_read_cb().
 */
enum msxml_state {
    MSXML_STATE_NORMAL         = 0,
    MSXML_STATE_ENTITY_START_1 = 1,
    MSXML_STATE_ENTITY_START_2 = 2,
    MSXML_STATE_ENTITY_HEX     = 3,
    MSXML_STATE_ENTITY_DEC     = 4,
    MSXML_STATE_ENTITY_CLOSE   = 5,
    MSXML_STATE_ENTITY_NONE    = 6
};
|
42 |
+ |
|
43 |
+struct msxml_cbdata { |
|
44 |
+ enum msxml_state state; |
|
45 |
+ fmap_t *map; |
|
46 |
+ const unsigned char *window; |
|
47 |
+ off_t winpos, mappos; |
|
48 |
+ size_t winsize; |
|
49 |
+}; |
|
50 |
+ |
|
51 |
+int msxml_read_cb(void *ctx, char *buffer, int len); |
|
33 | 52 |
int cli_scanmsxml(cli_ctx *ctx); |
34 | 53 |
|
35 | 54 |
#endif /* __MSXML_H */ |
... | ... |
@@ -2275,6 +2275,9 @@ static int cli_scanraw(cli_ctx *ctx, cli_file_t type, uint8_t typercg, cli_file_ |
2275 | 2275 |
case CL_TYPE_XML_XL: |
2276 | 2276 |
ret = cli_scanmsxml(ctx); |
2277 | 2277 |
break; |
2278 |
+ case CL_TYPE_XML_HWP: |
|
2279 |
+ ret = cli_scanhwpml(ctx); |
|
2280 |
+ break; |
|
2278 | 2281 |
case CL_TYPE_RARSFX: |
2279 | 2282 |
if(type != CL_TYPE_RAR && have_rar && SCAN_ARCHIVE && (DCONF_ARCH & ARCH_CONF_RAR)) { |
2280 | 2283 |
char *tmpname = NULL; |
... | ... |
@@ -2681,7 +2684,8 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type) |
2681 | 2681 |
type == CL_TYPE_OOXML_XL || |
2682 | 2682 |
type == CL_TYPE_XML_WORD || |
2683 | 2683 |
type == CL_TYPE_XML_XL || |
2684 |
- type == CL_TYPE_HWP3) { |
|
2684 |
+ type == CL_TYPE_HWP3 || |
|
2685 |
+ type == CL_TYPE_XML_HWP) { |
|
2685 | 2686 |
ctx->properties = json_object_new_object(); |
2686 | 2687 |
if (NULL == ctx->properties) { |
2687 | 2688 |
cli_errmsg("magic_scandesc: no memory for json properties object\n"); |
... | ... |
@@ -2851,6 +2855,10 @@ static int magic_scandesc(cli_ctx *ctx, cli_file_t type) |
2851 | 2851 |
ret = cli_scanmsxml(ctx); |
2852 | 2852 |
break; |
2853 | 2853 |
|
2854 |
+ case CL_TYPE_XML_HWP: |
|
2855 |
+ ret = cli_scanhwpml(ctx); |
|
2856 |
+ break; |
|
2857 |
+ |
|
2854 | 2858 |
case CL_TYPE_XDP: |
2855 | 2859 |
ret = cli_scanxdp(ctx); |
2856 | 2860 |
break; |