Browse code

added new source file for shared code between ooxml and msxml

Kevin Lin authored on 2015/03/13 08:58:16
Showing 4 changed files
... ...
@@ -447,7 +447,9 @@ libclamav_la_SOURCES = \
447 447
 	msdoc.c \
448 448
 	msdoc.h \
449 449
 	msxml.c \
450
-	msxml.h
450
+	msxml.h \
451
+	msxml_parser.c \
452
+	msxml_parser.h
451 453
 
452 454
 libclamav_la_SOURCES += bignum.h\
453 455
 	bignum_fast.h\
... ...
@@ -32,6 +32,7 @@
32 32
 #include "conv.h"
33 33
 #include "json_api.h"
34 34
 #include "msxml.h"
35
+#include "msxml_parser.h"
35 36
 
36 37
 #if HAVE_LIBXML2
37 38
 #ifdef _WIN32
... ...
@@ -48,24 +49,14 @@
48 48
 #define cli_msxmlmsg(...)
49 49
 #endif
50 50
 
51
-//#define MSXML_RECLEVEL 16
52
-#define MSXML_RECLEVEL_MAX 20
53
-#define MSXML_JSON_STRLEN_MAX 100
54
-
55 51
 #define MSXML_READBUFF SCANBUFF
56 52
 
57
-#define check_state(state)                                              \
58
-    do {                                                                \
59
-        if (state == -1) {                                              \
60
-            cli_warnmsg("check_state[msxml]: CL_EPARSE @ ln%d\n", __LINE__); \
61
-            return CL_EPARSE;                                           \
62
-        }                                                               \
63
-        else if (state == 0) {                                          \
64
-            cli_dbgmsg("check_state[msxml]: CL_BREAK @ ln%d\n", __LINE__); \
65
-            return CL_BREAK;                                            \
66
-        }                                                               \
67
-    } while(0)
68
-
53
+static const struct key_entry msxml_keys[] = {
54
+    { "documentproperties", "DocumentProperties", MSXML_JSON_ROOT },
55
+    { "worddocument",       "WordDocument",       MSXML_JSON_ROOT },
56
+    { "bindata",            "BinaryData",         MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT }
57
+};
58
+static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
69 59
 
70 60
 struct msxml_cbdata {
71 61
     fmap_t *map;
... ...
@@ -162,213 +153,6 @@ int msxml_read_cb(void *ctx, char *buffer, int len)
162 162
     cbdata->winpos = cbdata->winsize - rbytes;
163 163
     return (int)wbytes;
164 164
 }
165
-
166
-static int msxml_parse_element(cli_ctx *ctx, xmlTextReaderPtr reader, int rlvl)
167
-{
168
-    const xmlChar *element_name = NULL;
169
-    const xmlChar *node_name = NULL, *node_value = NULL;
170
-    int ret, state, node_type, endtag = 0;
171
-
172
-    cli_msxmlmsg("in msxml_parse_element @ layer %d\n", rlvl);
173
-
174
-    /* check recursion level */
175
-    if (rlvl >= MSXML_RECLEVEL_MAX) {
176
-        cli_dbgmsg("msxml_parse_element: reached msxml json recursion limit\n");
177
-        //cli_jsonbool(root, "HitRecursiveLimit", 1);
178
-        /* skip it */
179
-        state = xmlTextReaderNext(reader);
180
-        check_state(state);
181
-        return CL_SUCCESS;
182
-    }
183
-
184
-    /* acquire element type */
185
-    node_type = xmlTextReaderNodeType(reader);
186
-    if (node_type == -1)
187
-        return CL_EPARSE;
188
-
189
-    node_name = xmlTextReaderConstLocalName(reader);
190
-    node_value = xmlTextReaderConstValue(reader);
191
-
192
-    /* branch on node type */
193
-    switch (node_type) {
194
-    case XML_READER_TYPE_ELEMENT:
195
-        cli_msxmlmsg("msxml_parse_element: ELEMENT %s [%d]: %s\n", node_name, node_type, node_value);
196
-
197
-        /* storing the element name for verification/collection */
198
-        element_name = xmlTextReaderConstLocalName(reader);
199
-        if (!node_name) {
200
-            cli_dbgmsg("msxml_parse_element: element tag node nameless\n");
201
-            return CL_EPARSE; /* no name, nameless */
202
-        }
203
-
204
-        /* handle attributes */
205
-        state = xmlTextReaderHasAttributes(reader);
206
-        if (state == 1) {
207
-            while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
208
-                const xmlChar *name, *value;
209
-                name = xmlTextReaderConstLocalName(reader);
210
-                value = xmlTextReaderConstValue(reader);
211
-
212
-                cli_msxmlmsg("\t%s: %s\n", name, value);
213
-            }
214
-        }
215
-        else if (state == -1)
216
-            return CL_EPARSE;
217
-
218
-        /* check self-containment */
219
-        state = xmlTextReaderMoveToElement(reader);
220
-        if (state == -1)
221
-            return CL_EPARSE;
222
-
223
-        state = xmlTextReaderIsEmptyElement(reader);
224
-        if (state == 1) {
225
-            cli_msxmlmsg("msxml_parse_element: SELF-CLOSING\n");
226
-
227
-            state = xmlTextReaderNext(reader);
228
-            check_state(state);
229
-            return CL_SUCCESS;
230
-        } else if (state == -1)
231
-            return CL_EPARSE;
232
-
233
-        /* advance to first content node */
234
-        state = xmlTextReaderRead(reader);
235
-        check_state(state);
236
-
237
-        while (!endtag) {
238
-            node_type = xmlTextReaderNodeType(reader);
239
-            if (node_type == -1)
240
-                return CL_EPARSE;
241
-
242
-            switch (node_type) {
243
-            case XML_READER_TYPE_ELEMENT:
244
-                ret = msxml_parse_element(ctx, reader, rlvl+1);
245
-                if (ret != CL_SUCCESS) {
246
-                    return ret;
247
-                }
248
-                break;
249
-
250
-            case XML_READER_TYPE_TEXT:
251
-                node_value = xmlTextReaderConstValue(reader);
252
-
253
-                cli_msxmlmsg("TEXT: %s\n", node_value);
254
-
255
-                if (!strncmp(element_name, "binData", strlen(element_name))) {
256
-                    char name[1024];
257
-                    char *decoded, *tempfile = name;
258
-                    size_t decodedlen;
259
-                    int of;
260
-
261
-                    cli_msxmlmsg("BINARY DATA!\n");
262
-
263
-                    decoded = cl_base64_decode((char *)node_value, strlen((const char *)node_value), NULL, &decodedlen, 0);
264
-                    if (!decoded) {
265
-                        cli_warnmsg("msxml_parse_element: failed to decode base64-encoded binary data\n");
266
-                        state = xmlTextReaderRead(reader);
267
-                        check_state(state);
268
-                        break;
269
-                    }
270
-
271
-                    if(!(tempfile = cli_gentemp(ctx->engine->tmpdir))) {
272
-                        free(decoded);
273
-                        return CL_EMEM;
274
-                    }
275
-
276
-                    if((of = open(tempfile, O_RDWR|O_CREAT|O_TRUNC|O_BINARY, S_IRUSR|S_IWUSR))==-1) {
277
-                        cli_warnmsg("msxml_parse_element: failed to create temporary file %s\n", tempfile);
278
-                        free(decoded);
279
-                        return CL_ECREAT;
280
-                    }
281
-
282
-                    if(cli_writen(of, decoded, decodedlen) != (int)decodedlen) {
283
-                        free(decoded);
284
-                        close(of);
285
-                        return CL_EWRITE;
286
-                    }
287
-                    free(decoded);
288
-
289
-                    cli_dbgmsg("msxml_parse_element: extracted binary data to %s\n", tempfile);
290
-
291
-                    ret = cli_magic_scandesc(of, ctx);
292
-                    close(of);
293
-                    if (ret != CL_SUCCESS || (!SCAN_ALL && ret == CL_VIRUS)) {
294
-                        return ret;
295
-                    }
296
-
297
-                    /*
298
-                    ret = cli_mem_scandesc(decoded, decodedlen, ctx);
299
-                    free(decoded);
300
-                    if (ret != CL_SUCCESS) {
301
-                        return ret;
302
-                        }*/
303
-                }
304
-
305
-                /*
306
-                  ret = ooxml_parse_value(thisjobj, "Value", node_value);
307
-                  if (ret != CL_SUCCESS)
308
-                  return ret;
309
-
310
-                  cli_dbgmsg("ooxml_parse_element: added json value [%s: %s]\n", element_tag, node_value);
311
-                */
312
-
313
-                /* advance to next node */
314
-                state = xmlTextReaderRead(reader);
315
-                check_state(state);
316
-                break;
317
-
318
-            case XML_READER_TYPE_SIGNIFICANT_WHITESPACE:
319
-                /* advance to next node */
320
-                state = xmlTextReaderRead(reader);
321
-                check_state(state);
322
-                break;
323
-
324
-            case XML_READER_TYPE_END_ELEMENT:
325
-                cli_msxmlmsg("in msxml_parse_element @ layer %d closed\n", rlvl);
326
-                node_name = xmlTextReaderConstLocalName(reader);
327
-                if (!node_name) {
328
-                    cli_dbgmsg("msxml_parse_element: element end tag node nameless\n");
329
-                    return CL_EPARSE; /* no name, nameless */
330
-                }
331
-
332
-                if (strncmp(element_name, node_name, strlen(element_name))) {
333
-                    cli_dbgmsg("msxml_parse_element: element tag does not match end tag %s != %s\n", element_name, node_name);
334
-                    return CL_EFORMAT;
335
-                }
336
-
337
-                /* advance to next element tag */
338
-                state = xmlTextReaderRead(reader);
339
-                check_state(state);
340
-
341
-                endtag = 1;
342
-                break;
343
-
344
-            default:
345
-                node_name = xmlTextReaderConstLocalName(reader);
346
-                node_value = xmlTextReaderConstValue(reader);
347
-
348
-                cli_dbgmsg("msxml_parse_element: unhandled xml secondary node %s [%d]: %s\n", node_name, node_type, node_value);
349
-
350
-                state = xmlTextReaderNext(reader);
351
-                check_state(state);
352
-                return CL_SUCCESS;
353
-            }
354
-        }
355
-
356
-        break;
357
-    case XML_READER_TYPE_PROCESSING_INSTRUCTION:
358
-        cli_msxmlmsg("msxml_parse_element: PROCESSING INSTRUCTION %s [%d]: %s\n", node_name, node_type, node_value);
359
-        break;
360
-    case XML_READER_TYPE_SIGNIFICANT_WHITESPACE:
361
-        cli_msxmlmsg("msxml_parse_element: SIGNIFICANT WHITESPACE %s [%d]: %s\n", node_name, node_type, node_value);
362
-        break;
363
-    case XML_READER_TYPE_END_ELEMENT:
364
-        cli_msxmlmsg("msxml_parse_element: END ELEMENT %s [%d]: %s\n", node_name, node_type, node_value);
365
-        return CL_SUCCESS;
366
-    default:
367
-        cli_dbgmsg("msxml_parse_element: unhandled xml primary node %s [%d]: %s\n", node_name, node_type, node_value);
368
-    }
369
-
370
-    return CL_SUCCESS;
371
-}
372 165
 #endif
373 166
 
374 167
 int cli_scanmsxml(cli_ctx *ctx)
... ...
@@ -394,22 +178,7 @@ int cli_scanmsxml(cli_ctx *ctx)
394 394
         return CL_SUCCESS; // libxml2 failed!
395 395
     }
396 396
 
397
-    /* Main Processing Loop */
398
-    while ((state = xmlTextReaderRead(reader)) == 1) {
399
-        ret = msxml_parse_element(ctx, reader, 0);
400
-
401
-        if (ret != CL_SUCCESS && ret != CL_ETIMEOUT && ret != CL_BREAK) {
402
-            cli_warnmsg("cli_scanmsxml: encountered issue in parsing properties document\n");
403
-            break;
404
-        }
405
-    }
406
-
407
-    /* non-critical return suppression */
408
-    if (ret == CL_ETIMEOUT || ret == CL_BREAK)
409
-        ret = CL_SUCCESS;
410
-
411
-    if (state == -1)
412
-        ret = CL_EPARSE;
397
+    ret = cli_msxml_parse_document(ctx, reader, msxml_keys, num_msxml_keys, 1);
413 398
 
414 399
     xmlTextReaderClose(reader);
415 400
     xmlFreeTextReader(reader);
416 401
new file mode 100644
... ...
@@ -0,0 +1,381 @@
0
+/*
1
+ * Extract component parts of MS XML files (e.g. MS Office 2003 XML Documents)
2
+ *
3
+ * Copyright (C) 2007-2013 Sourcefire, Inc.
4
+ *
5
+ * Authors: Kevin Lin
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify it under
8
+ * the terms of the GNU General Public License version 2 as published by the
9
+ * Free Software Foundation.
10
+ *
11
+ * This program is distributed in the hope that it will be useful, but WITHOUT
12
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14
+ * more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License along with
17
+ * this program; if not, write to the Free Software Foundation, Inc., 51
18
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
+ */
20
+
21
+#if HAVE_CONFIG_H
22
+#include "clamav-config.h"
23
+#endif
24
+
25
+#include <sys/types.h>
26
+#include <sys/stat.h>
27
+#include <fcntl.h>
28
+
29
+#include "clamav.h"
30
+#include "others.h"
31
+#include "conv.h"
32
+#include "json_api.h"
33
+#include "msxml_parser.h"
34
+
35
+#if HAVE_LIBXML2
36
+#ifdef _WIN32
37
+#ifndef LIBXML_WRITER_ENABLED
38
+#define LIBXML_WRITER_ENABLED 1
39
+#endif
40
+#endif
41
+#include <libxml/xmlreader.h>
42
+
43
+#define MSXML_VERBIOSE 1
44
+#if MSXML_VERBIOSE
45
+#define cli_msxmlmsg(...) cli_dbgmsg(__VA_ARGS__)
46
+#else
47
+#define cli_msxmlmsg(...)
48
+#endif
49
+
50
+#define check_state(state)                                              \
51
+    do {                                                                \
52
+        if (state == -1) {                                              \
53
+            cli_warnmsg("check_state[msxml]: CL_EPARSE @ ln%d\n", __LINE__); \
54
+            return CL_EPARSE;                                           \
55
+        }                                                               \
56
+        else if (state == 0) {                                          \
57
+            cli_dbgmsg("check_state[msxml]: CL_BREAK @ ln%d\n", __LINE__); \
58
+            return CL_BREAK;                                            \
59
+        }                                                               \
60
+    } while(0)
61
+
62
+
63
+struct key_entry blank_key = { NULL, NULL, 0 };
64
+
65
+static const struct key_entry *msxml_check_key(struct msxml_ctx *mxctx, const char *key, size_t keylen)
66
+{
67
+    unsigned i;
68
+
69
+    if (keylen > MSXML_JSON_STRLEN_MAX-1) {
70
+        cli_dbgmsg("msxml_check_key: key name too long\n");
71
+        return &blank_key;
72
+    }
73
+
74
+    for (i = 0; i < mxctx->num_keys; ++i) {
75
+        //cli_dbgmsg("%d %d %s %s %s %s\n", keylen, strlen(ooxml_keys[i]), key, keycmp, ooxml_keys[i], ooxml_json_keys[i]);
76
+        if (keylen == strlen(mxctx->keys[i].key) && !strncasecmp(key, mxctx->keys[i].key, keylen)) {
77
+            return &mxctx->keys[i];
78
+        }
79
+    }
80
+
81
+    return &blank_key;
82
+}
83
+
84
+static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader, int rlvl)
85
+{
86
+    const xmlChar *element_name = NULL;
87
+    const xmlChar *node_name = NULL, *node_value = NULL;
88
+    const struct key_entry *keyinfo;
89
+    int ret, state, node_type, endtag = 0;
90
+    cli_ctx *ctx = mxctx->ctx;
91
+#if HAVE_JSON
92
+    json_object *parent = mxctx->wrkptr;
93
+    json_object *thisjobj = NULL;
94
+#endif
95
+
96
+    cli_msxmlmsg("in msxml_parse_element @ layer %d\n", rlvl);
97
+
98
+    /* check recursion level */
99
+    if (rlvl >= MSXML_RECLEVEL_MAX) {
100
+        cli_dbgmsg("msxml_parse_element: reached msxml json recursion limit\n");
101
+        //cli_jsonbool(root, "HitRecursiveLimit", 1);
102
+        /* skip it */
103
+        state = xmlTextReaderNext(reader);
104
+        check_state(state);
105
+        return CL_SUCCESS;
106
+    }
107
+
108
+    /* acquire element type */
109
+    node_type = xmlTextReaderNodeType(reader);
110
+    if (node_type == -1)
111
+        return CL_EPARSE;
112
+
113
+    node_name = xmlTextReaderConstLocalName(reader);
114
+    node_value = xmlTextReaderConstValue(reader);
115
+
116
+    /* branch on node type */
117
+    switch (node_type) {
118
+    case XML_READER_TYPE_ELEMENT:
119
+        cli_msxmlmsg("msxml_parse_element: ELEMENT %s [%d]: %s\n", node_name, node_type, node_value);
120
+
121
+        /* storing the element name for verification/collection */
122
+        element_name = xmlTextReaderConstLocalName(reader);
123
+        if (!node_name) {
124
+            cli_dbgmsg("msxml_parse_element: element tag node nameless\n");
125
+            return CL_EPARSE; /* no name, nameless */
126
+        }
127
+
128
+        /* determine if the element is interesting */
129
+        keyinfo = msxml_check_key(mxctx, element_name, strlen(element_name));
130
+
131
+        cli_msxmlmsg("key:  %s\n", keyinfo->key);
132
+        cli_msxmlmsg("name: %s\n", keyinfo->name);
133
+        cli_msxmlmsg("type: %d\n", keyinfo->type);
134
+
135
+#if HAVE_JSON
136
+        if (keyinfo->type & MSXML_JSON_TRACK) {
137
+            if (MSXML_JSON_ROOT)
138
+                thisjobj = cli_jsonobj(mxctx->root, keyinfo->name);
139
+            else if (MSXML_JSON_WRKPTR)
140
+                thisjobj = cli_jsonobj(parent, keyinfo->name);
141
+
142
+            if (!thisjobj) {
143
+                return CL_EMEM;
144
+            }
145
+            cli_dbgmsg("msxml_parse_element: generated json object [%s]\n", keyinfo->name);
146
+
147
+            /* count this element */
148
+            if (thisjobj && keyinfo->type & MSXML_JSON_COUNT) {
149
+                json_object *counter;
150
+
151
+                if (!json_object_object_get_ex(thisjobj, "Count", &counter)) { /* object not found */
152
+                    cli_jsonint(thisjobj, "Count", 1);
153
+                    if (!counter) {
154
+                        return CL_EPARSE;
155
+                    }
156
+                } else {
157
+                    int value = json_object_get_int(counter);
158
+                    cli_jsonint(thisjobj, "Count", value+1);
159
+                }
160
+                cli_dbgmsg("msxml_parse_element: retrieved json object [Count]\n");
161
+            }
162
+
163
+            /* handle attributes */
164
+            state = xmlTextReaderHasAttributes(reader);
165
+            if (state == 1) {
166
+                json_object *attributes;
167
+
168
+                attributes = cli_jsonobj(thisjobj, "Attributes");
169
+                if (!attributes) {
170
+                    return CL_EPARSE;
171
+                }
172
+                cli_dbgmsg("msxml_parse_element: retrieved json object [Attributes]\n");
173
+
174
+                while (xmlTextReaderMoveToNextAttribute(reader) == 1) {
175
+                    const xmlChar *name, *value;
176
+                    name = xmlTextReaderConstLocalName(reader);
177
+                    value = xmlTextReaderConstValue(reader);
178
+
179
+                    cli_dbgmsg("\t%s: %s\n", name, value);
180
+                    cli_jsonstr(attributes, name, (const char *)value);
181
+                }
182
+            }
183
+            else if (state == -1)
184
+                return CL_EPARSE;
185
+        }
186
+#endif
187
+
188
+        /* check self-containment */
189
+        state = xmlTextReaderMoveToElement(reader);
190
+        if (state == -1)
191
+            return CL_EPARSE;
192
+
193
+        state = xmlTextReaderIsEmptyElement(reader);
194
+        if (state == 1) {
195
+            cli_msxmlmsg("msxml_parse_element: SELF-CLOSING\n");
196
+
197
+            state = xmlTextReaderNext(reader);
198
+            check_state(state);
199
+            return CL_SUCCESS;
200
+        } else if (state == -1)
201
+            return CL_EPARSE;
202
+
203
+        /* advance to first content node */
204
+        state = xmlTextReaderRead(reader);
205
+        check_state(state);
206
+
207
+        while (!endtag) {
208
+            node_type = xmlTextReaderNodeType(reader);
209
+            if (node_type == -1)
210
+                return CL_EPARSE;
211
+
212
+            switch (node_type) {
213
+            case XML_READER_TYPE_ELEMENT:
214
+                ret = msxml_parse_element(mxctx, reader, rlvl+1);
215
+                if (ret != CL_SUCCESS) {
216
+                    return ret;
217
+                }
218
+                break;
219
+
220
+            case XML_READER_TYPE_TEXT:
221
+                node_value = xmlTextReaderConstValue(reader);
222
+
223
+                cli_msxmlmsg("TEXT: %s\n", node_value);
224
+
225
+                /*
226
+                  ret = ooxml_parse_value(thisjobj, "Value", node_value);
227
+                  if (ret != CL_SUCCESS)
228
+                  return ret;
229
+
230
+                  cli_dbgmsg("ooxml_parse_element: added json value [%s: %s]\n", element_tag, node_value);
231
+                */
232
+
233
+
234
+                /* scanning protocol for embedded objects encoded in base64 */
235
+                if (keyinfo->type & MSXML_SCAN_B64) {
236
+                    char name[1024];
237
+                    char *decoded, *tempfile = name;
238
+                    size_t decodedlen;
239
+                    int of;
240
+
241
+                    cli_msxmlmsg("BINARY DATA!\n");
242
+
243
+                    decoded = cl_base64_decode((char *)node_value, strlen((const char *)node_value), NULL, &decodedlen, 0);
244
+                    if (!decoded) {
245
+                        cli_warnmsg("msxml_parse_element: failed to decode base64-encoded binary data\n");
246
+                        state = xmlTextReaderRead(reader);
247
+                        check_state(state);
248
+                        break;
249
+                    }
250
+
251
+                    if(!(tempfile = cli_gentemp(ctx->engine->tmpdir))) {
252
+                        free(decoded);
253
+                        return CL_EMEM;
254
+                    }
255
+
256
+                    if((of = open(tempfile, O_RDWR|O_CREAT|O_TRUNC|O_BINARY, S_IRUSR|S_IWUSR))==-1) {
257
+                        cli_warnmsg("msxml_parse_element: failed to create temporary file %s\n", tempfile);
258
+                        free(decoded);
259
+                        return CL_ECREAT;
260
+                    }
261
+
262
+                    if(cli_writen(of, decoded, decodedlen) != (int)decodedlen) {
263
+                        free(decoded);
264
+                        close(of);
265
+                        return CL_EWRITE;
266
+                    }
267
+                    free(decoded);
268
+
269
+                    cli_dbgmsg("msxml_parse_element: extracted binary data to %s\n", tempfile);
270
+
271
+                    ret = cli_magic_scandesc(of, ctx);
272
+                    close(of);
273
+                    if (ret != CL_SUCCESS || (!SCAN_ALL && ret == CL_VIRUS)) {
274
+                        return ret;
275
+                    }
276
+
277
+                    /*
278
+                    ret = cli_mem_scandesc(decoded, decodedlen, ctx);
279
+                    free(decoded);
280
+                    if (ret != CL_SUCCESS) {
281
+                        return ret;
282
+                        }*/
283
+                }
284
+
285
+                /* advance to next node */
286
+                state = xmlTextReaderRead(reader);
287
+                check_state(state);
288
+                break;
289
+
290
+            case XML_READER_TYPE_SIGNIFICANT_WHITESPACE:
291
+                /* advance to next node */
292
+                state = xmlTextReaderRead(reader);
293
+                check_state(state);
294
+                break;
295
+
296
+            case XML_READER_TYPE_END_ELEMENT:
297
+                cli_msxmlmsg("in msxml_parse_element @ layer %d closed\n", rlvl);
298
+                node_name = xmlTextReaderConstLocalName(reader);
299
+                if (!node_name) {
300
+                    cli_dbgmsg("msxml_parse_element: element end tag node nameless\n");
301
+                    return CL_EPARSE; /* no name, nameless */
302
+                }
303
+
304
+                if (strncmp(element_name, node_name, strlen(element_name))) {
305
+                    cli_dbgmsg("msxml_parse_element: element tag does not match end tag %s != %s\n", element_name, node_name);
306
+                    return CL_EFORMAT;
307
+                }
308
+
309
+                /* advance to next element tag */
310
+                state = xmlTextReaderRead(reader);
311
+                check_state(state);
312
+
313
+                endtag = 1;
314
+                break;
315
+
316
+            default:
317
+                node_name = xmlTextReaderConstLocalName(reader);
318
+                node_value = xmlTextReaderConstValue(reader);
319
+
320
+                cli_dbgmsg("msxml_parse_element: unhandled xml secondary node %s [%d]: %s\n", node_name, node_type, node_value);
321
+
322
+                state = xmlTextReaderNext(reader);
323
+                check_state(state);
324
+                return CL_SUCCESS;
325
+            }
326
+        }
327
+
328
+        break;
329
+    case XML_READER_TYPE_PROCESSING_INSTRUCTION:
330
+        cli_msxmlmsg("msxml_parse_element: PROCESSING INSTRUCTION %s [%d]: %s\n", node_name, node_type, node_value);
331
+        break;
332
+    case XML_READER_TYPE_SIGNIFICANT_WHITESPACE:
333
+        cli_msxmlmsg("msxml_parse_element: SIGNIFICANT WHITESPACE %s [%d]: %s\n", node_name, node_type, node_value);
334
+        break;
335
+    case XML_READER_TYPE_END_ELEMENT:
336
+        cli_msxmlmsg("msxml_parse_element: END ELEMENT %s [%d]: %s\n", node_name, node_type, node_value);
337
+        return CL_SUCCESS;
338
+    default:
339
+        cli_dbgmsg("msxml_parse_element: unhandled xml primary node %s [%d]: %s\n", node_name, node_type, node_value);
340
+    }
341
+
342
+    return CL_SUCCESS;
343
+}
344
+
345
+/* reader initialization and closing handled by caller */
346
+int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode)
347
+{
348
+    struct msxml_ctx mxctx;
349
+    int state, ret = CL_SUCCESS;
350
+
351
+    mxctx.ctx = ctx;
352
+    mxctx.keys = keys;
353
+    mxctx.num_keys = num_keys;
354
+#if HAVE_JSON
355
+    if (mode) {
356
+        mxctx.root = ctx->wrkproperty;
357
+        mxctx.wrkptr = ctx->wrkproperty;
358
+    }
359
+#endif
360
+
361
+    /* Main Processing Loop */
362
+    while ((state = xmlTextReaderRead(reader)) == 1) {
363
+        msxml_parse_element(&mxctx, reader, 0);
364
+        if (ret != CL_SUCCESS && ret != CL_ETIMEOUT && ret != CL_BREAK) {
365
+            cli_warnmsg("cli_msxml_parse_document: encountered issue in parsing xml document\n");
366
+            break;
367
+        }
368
+    }
369
+
370
+    if (state == -1)
371
+        return CL_EPARSE;
372
+
373
+    /* non-critical return suppression */
374
+    if (ret == CL_ETIMEOUT || ret == CL_BREAK)
375
+        return CL_SUCCESS;
376
+
377
+    return ret;
378
+}
379
+
380
+#endif /* HAVE_LIBXML2 */
0 381
new file mode 100644
... ...
@@ -0,0 +1,76 @@
0
+/*
1
+ * Extract component parts of MS XML files (e.g. MS Office 2003 XML Documents)
2
+ * 
3
+ * Copyright (C) 2007-2013 Sourcefire, Inc.
4
+ * 
5
+ * Authors: Kevin Lin
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify it under
8
+ * the terms of the GNU General Public License version 2 as published by the
9
+ * Free Software Foundation.
10
+ * 
11
+ * This program is distributed in the hope that it will be useful, but WITHOUT
12
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14
+ * more details.
15
+ * 
16
+ * You should have received a copy of the GNU General Public License along with
17
+ * this program; if not, write to the Free Software Foundation, Inc., 51
18
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
+ */
20
+
21
+#ifndef __MSXML_PARSER_H
22
+#define __MSXML_PARSER_H
23
+
24
+#if HAVE_LIBXML2
25
+
26
+#if HAVE_CONFIG_H
27
+#include "clamav-config.h"
28
+#endif
29
+
30
+#include "others.h"
31
+#include "json_api.h"
32
+
33
+#ifdef _WIN32
34
+#ifndef LIBXML_WRITER_ENABLED
35
+#define LIBXML_WRITER_ENABLED 1
36
+#endif
37
+#endif
38
+#include <libxml/xmlreader.h>
39
+
40
+
41
+#define MSXML_RECLEVEL_MAX 20
42
+#define MSXML_JSON_STRLEN_MAX 128
43
+
44
+struct key_entry {
45
+#define MSXML_IGNORE       0x00
46
+#define MSXML_IGNORE_ELEM  0x01
47
+#define MSXML_SCAN_B64     0x02
48
+#define MSXML_JSON_ROOT    0x04
49
+#define MSXML_JSON_WRKPTR  0x08
50
+#define MSXML_JSON_COUNT   0x10
51
+#define MSXML_JSON_VALUE   0x20
52
+
53
+#define MSXML_JSON_TRACK (MSXML_JSON_ROOT | MSXML_JSON_WRKPTR)
54
+
55
+    const char *key;
56
+    const char *name;
57
+    int type;
58
+};
59
+
60
+struct msxml_ctx {
61
+    cli_ctx *ctx;
62
+    const struct key_entry *keys;
63
+    size_t num_keys;
64
+
65
+#if HAVE_JSON
66
+    json_object *root;
67
+    json_object *wrkptr;
68
+#endif
69
+};
70
+
71
+int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode);
72
+
73
+#endif /* HAVE_LIBXML2 */
74
+
75
+#endif /* __MSXML_PARSER_H */