Browse code

msxml_parser: add callback-based scanning mechanism

Kevin Lin authored on 2015/12/17 06:12:20
Showing 5 changed files
... ...
@@ -714,7 +714,7 @@ int cli_scanhwpml(cli_ctx *ctx)
714 714
         return ret; // libxml2 failed!
715 715
     }
716 716
 
717
-    ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1);
717
+    ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, 1, hwpml_binary_cb);
718 718
 
719 719
     xmlTextReaderClose(reader);
720 720
     xmlFreeTextReader(reader);
... ...
@@ -253,7 +253,7 @@ int cli_scanmsxml(cli_ctx *ctx)
253 253
         return ret; // libxml2 failed!
254 254
     }
255 255
 
256
-    ret = cli_msxml_parse_document(ctx, reader, msxml_keys, num_msxml_keys, 1);
256
+    ret = cli_msxml_parse_document(ctx, reader, msxml_keys, num_msxml_keys, 1, NULL);
257 257
 
258 258
     xmlTextReaderClose(reader);
259 259
     xmlFreeTextReader(reader);
... ...
@@ -156,12 +156,14 @@ static int msxml_parse_value(json_object *wrkptr, const char *arrname, const xml
156 156
 }
157 157
 #endif /* HAVE_JSON */
158 158
 
159
+#define MAX_ATTRIBS 20
159 160
 static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader, int rlvl, void *jptr)
160 161
 {
161 162
     const xmlChar *element_name = NULL;
162 163
     const xmlChar *node_name = NULL, *node_value = NULL;
163 164
     const struct key_entry *keyinfo;
164
-    int ret, virus = 0, state, node_type, endtag = 0;
165
+    struct attrib_entry attribs[MAX_ATTRIBS];
166
+    int ret, virus = 0, state, node_type, endtag = 0, num_attribs = 0;
165 167
     cli_ctx *ctx = mxctx->ctx;
166 168
 #if HAVE_JSON
167 169
     json_object *parent = (json_object *)jptr;
... ...
@@ -284,6 +286,35 @@ static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader,
284 284
             }
285 285
         }
286 286
 #endif
287
+        /* populate attributes for scanning callback - BROKEN, probably because the reader is still positioned on an attribute left over from the earlier attribute parsing */
288
+        if ((keyinfo->type & MSXML_SCAN_CB) && mxctx->scan_cb) {
289
+            state = xmlTextReaderHasAttributes(reader);
290
+            if (state == 0) {
291
+                state = xmlTextReaderMoveToFirstAttribute(reader);
292
+                if (state == 1) {
293
+                    /* read first attribute (current head) */
294
+                    attribs[num_attribs].key = xmlTextReaderConstLocalName(reader);
295
+                    attribs[num_attribs].value = xmlTextReaderConstValue(reader);
296
+                    num_attribs++;
297
+                } else if (state == -1) {
298
+                    return CL_EPARSE;
299
+                }
300
+            }
301
+
302
+            /* start reading attributes or read remainder of attributes */
303
+            if (state == 1) {
304
+                cli_msxmlmsg("msxml_parse_element: adding attributes to scanning context\n");
305
+
306
+                while ((num_attribs < MAX_ATTRIBS) && (xmlTextReaderMoveToNextAttribute(reader) == 1)) {
307
+                    attribs[num_attribs].key = xmlTextReaderConstLocalName(reader);
308
+                    attribs[num_attribs].value = xmlTextReaderConstValue(reader);
309
+                    num_attribs++;
310
+                }
311
+            }
312
+            else if (state == -1) {
313
+                return CL_EPARSE;
314
+            }
315
+        }
287 316
 
288 317
         /* check self-containment */
289 318
         state = xmlTextReaderMoveToElement(reader);
... ...
@@ -339,9 +370,40 @@ static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader,
339 339
                     cli_msxmlmsg("msxml_parse_element: added json value [%s: %s]\n", keyinfo->name, (const char *)node_value);
340 340
                 }
341 341
 #endif
342
+                /* callback-based scanning mechanism for embedded objects (used by HWPML) */
343
+                if ((keyinfo->type & MSXML_SCAN_CB) && mxctx->scan_cb) {
344
+                    char name[1024];
345
+                    char *tempfile = name;
346
+                    int of;
347
+                    size_t vlen = strlen((const char *)node_value);
348
+
349
+                    cli_msxmlmsg("BINARY CALLBACK DATA!\n");
350
+
351
+                    if ((ret = cli_gentempfd(ctx->engine->tmpdir, &tempfile, &of)) != CL_SUCCESS) {
352
+                        cli_warnmsg("msxml_parse_element: failed to create temporary file %s\n", tempfile);
353
+                        return ret;
354
+                    }
342 355
 
343
-                /* scanning protocol for embedded objects encoded in base64 */
344
-                if ((keyinfo->type & MSXML_SCAN_B64) || (keyinfo->type & MSXML_SCAN_B64_TRIM4)) {
356
+                    if (cli_writen(of, (char *)node_value, vlen) != vlen) {
357
+                        close(of);
358
+                        return CL_EWRITE;
359
+                    }
360
+
361
+                    cli_dbgmsg("msxml_parse_element: extracted binary data to %s\n", tempfile);
362
+
363
+                    ret = mxctx->scan_cb(of, ctx, num_attribs, attribs);
364
+                    if (!(ctx->engine->keeptmp))
365
+                        cli_unlink(tempfile);
366
+                    free(tempfile);
367
+                    if (ret != CL_SUCCESS && (ret != CL_VIRUS || (!SCAN_ALL && ret == CL_VIRUS))) {
368
+                        return ret;
369
+                    } else if (SCAN_ALL && ret == CL_VIRUS) {
370
+                        virus = 1;
371
+                    }
372
+                }
373
+
374
+                /* scanning protocol for embedded objects encoded in base64 (used by MSXML) */
375
+                if (keyinfo->type & MSXML_SCAN_B64) {
345 376
                     char name[1024];
346 377
                     char *decoded, *tempfile = name;
347 378
                     size_t decodedlen;
... ...
@@ -372,31 +434,7 @@ static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader,
372 372
 
373 373
                     cli_dbgmsg("msxml_parse_element: extracted binary data to %s\n", tempfile);
374 374
 
375
-                    if (keyinfo->type & MSXML_SCAN_B64_TRIM4) {
376
-                        STATBUF statbuf;
377
-                        fmap_t *map;
378
-
379
-                        cli_dbgmsg("msxml_parse_element: trimming 4-byte prefix from binary stream\n");
380
-
381
-                        if (FSTAT(of, &statbuf) == -1) {
382
-                            cli_errmsg("msxml_parse_element: cannot stat file descriptor\n");
383
-                            close(of);
384
-                            return CL_ESTAT;
385
-                        }
386
-
387
-                        map = fmap(of, 0, statbuf.st_size);
388
-                        if (!map) {
389
-                            cli_errmsg("msxml_parse_element: failed to fmap binary data\n");
390
-                            close(of);
391
-                            return CL_EMAP;
392
-                        }
393
-
394
-                        ret = cli_map_scandesc(map, 4, 0, ctx, CL_TYPE_ANY);
395
-                        funmap(map);
396
-                    } else {
397
-                        ret = cli_magic_scandesc(of, ctx);
398
-                    }
399
-
375
+                    ret = cli_magic_scandesc(of, ctx);
400 376
                     close(of);
401 377
                     if (!(ctx->engine->keeptmp))
402 378
                         cli_unlink(tempfile);
... ...
@@ -469,7 +507,7 @@ static int msxml_parse_element(struct msxml_ctx *mxctx, xmlTextReaderPtr reader,
469 469
 }
470 470
 
471 471
 /* reader initialization and closing handled by caller */
472
-int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode)
472
+int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode, msxml_scan_cb scan_cb)
473 473
 {
474 474
     struct msxml_ctx mxctx;
475 475
     int state, virus = 0, ret = CL_SUCCESS;
... ...
@@ -478,6 +516,7 @@ int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct
478 478
         return CL_ENULLARG;
479 479
 
480 480
     mxctx.ctx = ctx;
481
+    mxctx.scan_cb = scan_cb;
481 482
     mxctx.keys = keys;
482 483
     mxctx.num_keys = num_keys;
483 484
 #if HAVE_JSON
... ...
@@ -43,12 +43,17 @@
43 43
 #define MSXML_RECLEVEL_MAX 20
44 44
 #define MSXML_JSON_STRLEN_MAX 128
45 45
 
46
+struct attrib_entry {
47
+    const char *key;
48
+    const char *value;
49
+};
50
+
46 51
 struct key_entry {
47 52
 /* how */
48 53
 #define MSXML_IGNORE         0x00
49 54
 #define MSXML_IGNORE_ELEM    0x01
50
-#define MSXML_SCAN_B64       0x02
51
-#define MSXML_SCAN_B64_TRIM4 0x04
55
+#define MSXML_SCAN_CB        0x02
56
+#define MSXML_SCAN_B64       0x04
52 57
 /* where */
53 58
 #define MSXML_JSON_ROOT      0x08
54 59
 #define MSXML_JSON_WRKPTR    0x10
... ...
@@ -64,8 +69,11 @@ struct key_entry {
64 64
     uint32_t type;
65 65
 };
66 66
 
67
+typedef int (*msxml_scan_cb)(int fd, cli_ctx *ctx, int num_attribs, struct attrib_entry *attribs);
68
+
67 69
 struct msxml_ctx {
68 70
     cli_ctx *ctx;
71
+    msxml_scan_cb scan_cb;
69 72
     const struct key_entry *keys;
70 73
     size_t num_keys;
71 74
 
... ...
@@ -75,7 +83,7 @@ struct msxml_ctx {
75 75
 #endif
76 76
 };
77 77
 
78
-int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode);
78
+int cli_msxml_parse_document(cli_ctx *ctx, xmlTextReaderPtr reader, const struct key_entry *keys, const size_t num_keys, int mode, msxml_scan_cb scan_cb);
79 79
 
80 80
 #endif /* HAVE_LIBXML2 */
81 81
 
... ...
@@ -126,7 +126,7 @@ static int ooxml_parse_document(int fd, cli_ctx *ctx)
126 126
         return CL_SUCCESS; // internal error from libxml2
127 127
     }
128 128
 
129
-    ret = cli_msxml_parse_document(ctx, reader, ooxml_keys, num_ooxml_keys, 1);
129
+    ret = cli_msxml_parse_document(ctx, reader, ooxml_keys, num_ooxml_keys, 1, NULL);
130 130
 
131 131
     if (ret != CL_SUCCESS && ret != CL_ETIMEOUT && ret != CL_BREAK)
132 132
         cli_warnmsg("ooxml_parse_document: encountered issue in parsing properties document\n");