Browse code

Added new pdf object stream parsing capability.

Micah Snyder (micasnyd) authored on 2018/08/15 06:00:31
Showing 11 changed files
... ...
@@ -94,7 +94,7 @@ int cli_bytecode_context_setparam_int(struct cli_bc_ctx *ctx, unsigned i, uint64
94 94
 int cli_bytecode_context_setparam_ptr(struct cli_bc_ctx *ctx, unsigned i, void *data, unsigned datalen);
95 95
 int cli_bytecode_context_setfile(struct cli_bc_ctx *ctx, fmap_t *map);
96 96
 int cli_bytecode_context_setpe(struct cli_bc_ctx *ctx, const struct cli_pe_hook_data *data, const struct cli_exe_section *sections);
97
-int cli_bytecode_context_setpdf(struct cli_bc_ctx *ctx, unsigned phase, unsigned nobjs, struct pdf_obj *objs, uint32_t *pdf_flags, uint32_t pdfsize, uint32_t pdfstartoff);
97
+int cli_bytecode_context_setpdf(struct cli_bc_ctx *ctx, unsigned phase, unsigned nobjs, struct pdf_obj **objs, uint32_t *pdf_flags, uint32_t pdfsize, uint32_t pdfstartoff);
98 98
 int cli_bytecode_context_clear(struct cli_bc_ctx *ctx);
99 99
 /* returns file descriptor, sets tempfile. Caller takes ownership, and is
100 100
  * responsible for freeing/unlinking */
... ...
@@ -1427,7 +1427,7 @@ uint32_t cli_bcapi_check_platform(struct cli_bc_ctx *ctx , uint32_t a, uint32_t
1427 1427
 
1428 1428
 int cli_bytecode_context_setpdf(struct cli_bc_ctx *ctx, unsigned phase,
1429 1429
                                 unsigned nobjs,
1430
-                                struct pdf_obj *objs, uint32_t *pdf_flags,
1430
+                                struct pdf_obj **objs, uint32_t *pdf_flags,
1431 1431
                                 uint32_t pdfsize, uint32_t pdfstartoff)
1432 1432
 {
1433 1433
     ctx->pdf_nobjs = nobjs;
... ...
@@ -1470,7 +1470,7 @@ int32_t cli_bcapi_pdf_lookupobj(struct cli_bc_ctx *ctx , uint32_t objid)
1470 1470
     if (!ctx->pdf_phase)
1471 1471
         return -1;
1472 1472
     for (i=0;i<ctx->pdf_nobjs;i++) {
1473
-        if (ctx->pdf_objs[i].id == objid)
1473
+        if (ctx->pdf_objs[i]->id == objid)
1474 1474
             return i;
1475 1475
     }
1476 1476
     return -1;
... ...
@@ -1484,8 +1484,8 @@ uint32_t cli_bcapi_pdf_getobjsize(struct cli_bc_ctx *ctx , int32_t objidx)
1484 1484
        )
1485 1485
         return 0;
1486 1486
     if ((uint32_t)(objidx + 1) == ctx->pdf_nobjs)
1487
-        return ctx->pdf_size - ctx->pdf_objs[objidx].start;
1488
-    return ctx->pdf_objs[objidx+1].start - ctx->pdf_objs[objidx].start - 4;
1487
+        return ctx->pdf_size - ctx->pdf_objs[objidx]->start;
1488
+    return ctx->pdf_objs[objidx+1]->start - ctx->pdf_objs[objidx]->start - 4;
1489 1489
 }
1490 1490
 
1491 1491
 const uint8_t* cli_bcapi_pdf_getobj(struct cli_bc_ctx *ctx , int32_t objidx, uint32_t amount)
... ...
@@ -1493,7 +1493,7 @@ const uint8_t* cli_bcapi_pdf_getobj(struct cli_bc_ctx *ctx , int32_t objidx, uin
1493 1493
     uint32_t size = cli_bcapi_pdf_getobjsize(ctx, objidx);
1494 1494
     if (amount > size)
1495 1495
         return NULL;
1496
-    return fmap_need_off(ctx->fmap, ctx->pdf_objs[objidx].start, amount);
1496
+    return fmap_need_off(ctx->fmap, ctx->pdf_objs[objidx]->start, amount);
1497 1497
 }
1498 1498
 
1499 1499
 int32_t cli_bcapi_pdf_getobjid(struct cli_bc_ctx *ctx , int32_t objidx)
... ...
@@ -1501,7 +1501,7 @@ int32_t cli_bcapi_pdf_getobjid(struct cli_bc_ctx *ctx , int32_t objidx)
1501 1501
     if (!ctx->pdf_phase ||
1502 1502
         (uint32_t)objidx >= ctx->pdf_nobjs)
1503 1503
         return -1;
1504
-    return ctx->pdf_objs[objidx].id;
1504
+    return ctx->pdf_objs[objidx]->id;
1505 1505
 }
1506 1506
 
1507 1507
 int32_t cli_bcapi_pdf_getobjflags(struct cli_bc_ctx *ctx , int32_t objidx)
... ...
@@ -1509,7 +1509,7 @@ int32_t cli_bcapi_pdf_getobjflags(struct cli_bc_ctx *ctx , int32_t objidx)
1509 1509
     if (!ctx->pdf_phase ||
1510 1510
         (uint32_t)objidx >= ctx->pdf_nobjs)
1511 1511
         return -1;
1512
-    return ctx->pdf_objs[objidx].flags;
1512
+    return ctx->pdf_objs[objidx]->flags;
1513 1513
 }
1514 1514
 
1515 1515
 int32_t cli_bcapi_pdf_setobjflags(struct cli_bc_ctx *ctx , int32_t objidx, int32_t flags)
... ...
@@ -1518,9 +1518,9 @@ int32_t cli_bcapi_pdf_setobjflags(struct cli_bc_ctx *ctx , int32_t objidx, int32
1518 1518
         (uint32_t)objidx >= ctx->pdf_nobjs)
1519 1519
         return -1;
1520 1520
     cli_dbgmsg("cli_pdf: bytecode setobjflags %08x -> %08x\n",
1521
-               ctx->pdf_objs[objidx].flags,
1521
+               ctx->pdf_objs[objidx]->flags,
1522 1522
                flags);
1523
-    ctx->pdf_objs[objidx].flags = flags;
1523
+    ctx->pdf_objs[objidx]->flags = flags;
1524 1524
     return 0;
1525 1525
 }
1526 1526
 
... ...
@@ -1529,7 +1529,7 @@ int32_t cli_bcapi_pdf_get_offset(struct cli_bc_ctx *ctx , int32_t objidx)
1529 1529
     if (!ctx->pdf_phase ||
1530 1530
         (uint32_t)objidx >= ctx->pdf_nobjs)
1531 1531
         return -1;
1532
-    return ctx->pdf_startoff + ctx->pdf_objs[objidx].start;
1532
+    return ctx->pdf_startoff + ctx->pdf_objs[objidx]->start;
1533 1533
 }
1534 1534
 
1535 1535
 int32_t cli_bcapi_pdf_get_phase(struct cli_bc_ctx *ctx)
... ...
@@ -184,7 +184,7 @@ struct cli_bc_ctx {
184 184
     uint32_t lsigcnt[64];
185 185
     uint32_t lsigoff[64];
186 186
     uint32_t pdf_nobjs;
187
-    struct pdf_obj *pdf_objs;
187
+    struct pdf_obj **pdf_objs;
188 188
     uint32_t* pdf_flags;
189 189
     uint32_t pdf_size;
190 190
     uint32_t pdf_startoff;
... ...
@@ -73,7 +73,7 @@ extern "C"
73 73
 #define CL_COUNT_PRECISION 4096
74 74
 
75 75
 /* return codes */
76
-typedef enum {
76
+typedef enum cl_error_t {
77 77
     /* libclamav specific */
78 78
     CL_CLEAN = 0,
79 79
     CL_SUCCESS = 0,
... ...
@@ -119,6 +119,11 @@ static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
119 119
 #endif
120 120
 /* End PDF statistics callbacks and related */
121 121
 
122
+static int pdf_readint(const char *q0, int len, const char *key);
123
+static const char *pdf_getdict(const char *q0, int* len, const char *key);
124
+static char *pdf_readval(const char *q, int len, const char *key);
125
+static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape);
126
+
122 127
 static int xrefCheck(const char *xref, const char *eof)
123 128
 {
124 129
     const char *q;
... ...
@@ -156,6 +161,14 @@ static int xrefCheck(const char *xref, const char *eof)
156 156
 #define noisy_warnmsg(...)
157 157
 #endif
158 158
 
159
+/**
160
+ * @brief   Searching BACKwards, find the next character that is not a whitespace.
161
+ * 
162
+ * @param q         Index to start from (at the end of the search space)
163
+ * @param start     Beginning of the search space. 
164
+ * 
165
+ * @return const char*  Address of the final non-whitespace character OR the same address as the start.
166
+ */
159 167
 static const char *findNextNonWSBack(const char *q, const char *start)
160 168
 {
161 169
     while (q > start && (*q == 0 || *q == 9 || *q == 0xa || *q == 0xc || *q == 0xd || *q == 0x20))
... ...
@@ -164,15 +177,56 @@ static const char *findNextNonWSBack(const char *q, const char *start)
164 164
     return q;
165 165
 }
166 166
 
167
-static int find_stream_bounds(const char *start, off_t bytesleft, off_t bytesleft2, off_t *stream, off_t *endstream, int newline_hack)
167
+/**
168
+ * @brief   Searching FORwards, find the next character that is not a whitespace.
169
+ * 
170
+ * @param q         Index to start from (at the beginning of the search space)
171
+ * @param end       End of the search space (exclusive); no character at or past this address is examined.
172
+ * 
173
+ * @return const char*  Address of the first non-whitespace character at or after q OR the end address if only whitespace was found.
174
+ */
175
+static const char *findNextNonWS(const char *q, const char *end)
176
+{
177
+    while (q < end && (*q == 0 || *q == 9 || *q == 0xa || *q == 0xc || *q == 0xd || *q == 0x20))
178
+        q++;
179
+
180
+    return q;
181
+}
182
+
183
+/**
184
+ * @brief   Find bounds of stream.
185
+ * 
186
+ * PDF streams are prefixed with "stream" and suffixed with "endstream".
187
+ * Return value indicates success or failure.
188
+ * 
189
+ * @param start             start address of search space.
190
+ * @param bytesleft         size of search space for "stream"
191
+ * @param bytesleft2        size of search space for "endstream"
192
+ * @param[out] stream       output param, address of start of stream data
193
+ * @param[out] endstream    output param, address of end of stream data
194
+ * @param newline_hack      hack to support newlines that are \r\n, and not just \n or just \r.
195
+ * 
196
+ * @return int  1 if stream bounds were found. 
197
+ * @return int  0 if stream bounds could not be found. 
198
+ */
199
+static int find_stream_bounds(
200
+    const char *start, 
201
+    off_t bytesleft, 
202
+    off_t bytesleft2, 
203
+    off_t *stream, 
204
+    off_t *endstream, 
205
+    int newline_hack)
168 206
 {
169 207
     const char *q2, *q;
208
+
209
+    /* Begin by finding the "stream" string that prefixes stream data. */
170 210
     if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) {
171 211
         q2 += 6;
172 212
         bytesleft -= q2 - start;
173 213
         if (bytesleft < 0)
174 214
             return 0;
175 215
 
216
+        /* Skip any new line characters. */
176 217
         if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') {
177 218
             q2 += 2;
178 219
             if (newline_hack && (bytesleft > 2) && q2[0] == '\xa')
... ...
@@ -182,16 +236,23 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t byteslef
182 182
         }
183 183
 
184 184
         *stream = q2 - start;
185
+
185 186
         bytesleft2 -= q2 - start;
186 187
         if (bytesleft2 <= 0)
187 188
             return 0;
188 189
 
190
+        /* Now find the "endstream" string that suffixes stream data */
189 191
         q = q2;
190 192
         q2 = cli_memstr(q, bytesleft2, "endstream", 9);
191
-        if (!q2)
193
+        if (!q2) {
194
+            /* Couldn't find "endstream", but that's ok --
195
+             * -- we'll just count the data we have until EOF. */
192 196
             q2 = q + bytesleft2-9; /* till EOF */
197
+        }
193 198
 
194 199
         *endstream = q2 - start;
200
+
201
+        /* Double-check that endstream >= stream */
195 202
         if (*endstream < *stream)
196 203
             *endstream = *stream;
197 204
 
... ...
@@ -202,61 +263,273 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t byteslef
202 202
 }
203 203
 
204 204
 /**
205
- * @brief  Finds the next obj and adds it to our list of objects, and increments nobj.
206
- *
207
- * @param pdf   PDF structure
208
- * @return int  -1 if error
209
- * @return int  0 if no more objects
210
- * @return int  1 if success
211
- * @return int  2 if an invalid object was discovered, may be skipped.
205
+ * @brief Find the next *indirect* object in an object stream, adds it to our list of 
206
+ *        objects, and increments nobj.
207
+ * 
208
+ * Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
209
+ * Instead, they have an obj ID and an offset from the first object to point you
210
+ * right at them.
211
+ * 
212
+ * If found, objstm->current will be set to the found object's offset in the stream buffer, and objstm->current_pair will be advanced to the next id/offset pair.
213
+ * 
214
+ * All objects in an object stream are indirect and thus do not begin or end 
215
+ * with "obj" or "endobj".  Instead, the object stream takes the following 
216
+ * format.
217
+ * 
218
+ *      <dictionary describing stream> objstm content endobjstm
219
+ * 
220
+ * where content looks something like the following:
221
+ * 
222
+ *      15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
223
+ * 
224
+ * In the above example, the literal string (ab) is indirect object # 15, and 
225
+ * begins at offset 0 of the set of objects.  The next object, # 16, begins at 
226
+ * offset 3 and is a dictionary.  The final object is also a dictionary, beginning 
227
+ * at offset 46.
228
+ * 
229
+ * @param pdf   Pdf struct that keeps track of all information found in the PDF. 
230
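+ * @param objstm  Object stream state (stream buffer, offset of the current id/offset pair, object counts) from which the next object is read.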
231
+ * 
232
+ * @return CL_SUCCESS  if success
233
+ * @return CL_EPARSE   if parsing error
234
+ * @return CL_EMEM     if error allocating memory
235
+ * @return CL_EARG     if invalid arguments
236
+ */
237
+int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, struct pdf_obj **obj_found)
238
+{
239
+    cl_error_t status = CL_EPARSE;
240
+    struct pdf_obj *obj = NULL;
241
+    unsigned long objid = 0, objsize = 0, objoff = 0;
242
+    const char *index = NULL;
243
+    size_t bytes_remaining = 0;
244
+
245
+    if (NULL == pdf || NULL == objstm) {
246
+        cli_warnmsg("pdf_findobj_in_objstm: invalid arguments\n");
247
+        return CL_EARG;
248
+    }
249
+
250
+    *obj_found = NULL;
251
+
252
+    index = objstm->streambuf + objstm->current_pair;
253
+    bytes_remaining = objstm->streambuf_len - objstm->current_pair;
254
+
255
+    obj = calloc(sizeof(struct pdf_obj), 1);
256
+    if (!obj) {
257
+        cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n");
258
+        status = CL_EMEM;
259
+        goto done;
260
+    }
261
+
262
+    /* This object is in a stream, not in the regular map buffer. */
263
+    obj->objstm = objstm;
264
+
265
+    /* objstm->current_pair points directly to the obj id */
266
+    if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &objid)) {
267
+        /* Failed to find objid */
268
+        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find objid for obj in object stream\n");
269
+        status = CL_EPARSE;
270
+        goto done;
271
+    }
272
+
273
+    /* Find the obj offset that appears just after the obj id*/
274
+    while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
275
+        index++;
276
+        bytes_remaining--;
277
+    }
278
+    index = findNextNonWS(index, objstm->streambuf + objstm->first);
279
+    bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
280
+
281
+    if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &objoff)) {
282
+        /* Failed to find obj offset */
283
+        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream\n");
284
+        status = CL_EPARSE;
285
+        goto done;
286
+    }
287
+
288
+    objstm->current = objstm->first + objoff;
289
+
290
+    obj->id = (objid << 8) | (0 & 0xff);
291
+    obj->start = objstm->current;
292
+    obj->flags = 0;
293
+
294
+    objstm->nobjs_found++;
295
+
296
+    while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
297
+        index++;
298
+        bytes_remaining--;
299
+    }
300
+    objstm->current_pair = (uint32_t)(findNextNonWS(index, objstm->streambuf + objstm->first) - objstm->streambuf);
301
+
302
+    /* Update current_pair, if there are more */
303
+    if ((objstm->nobjs_found < objstm->n) &&
304
+        (index < objstm->streambuf + objstm->streambuf_len))
305
+    {
306
+        unsigned long next_objid = 0, next_objoff = 0;
307
+
308
+        /* 
309
+         * While we're at it, 
310
+         *   lets record the size as running up to the next object offset.
311
+         * 
312
+         * To do so, we will need to parse the next obj pair.
313
+         */
314
+        /* objstm->current_pair points directly to the obj id */
315
+        index = objstm->streambuf + objstm->current_pair;
316
+        bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
317
+
318
+        if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &next_objid)) {
319
+            /* Failed to find objid for next obj */
320
+            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next objid for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
321
+            status = CL_EPARSE;
322
+            goto done;
323
+        }
324
+
325
+        /* Find the obj offset that appears just after the obj id*/
326
+        while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
327
+            index++;
328
+            bytes_remaining--;
329
+        }
330
+        index = findNextNonWS(index, objstm->streambuf + objstm->first);
331
+        bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
332
+
333
+        if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &next_objoff)) {
334
+            /* Failed to find obj offset for next obj */
335
+            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
336
+            status = CL_EPARSE;
337
+            goto done;
338
+        }
339
+
340
+        obj->size = objstm->first + next_objoff - obj->start;
341
+    } 
342
+    else 
343
+    {
344
+        /*
345
+         * Should be no more objects. We should verify.
346
+         * 
347
+         * Either way...
348
+         *   obj->size should be the rest of the buffer. 
349
+         */
350
+        if (objstm->nobjs_found < objstm->n) {
351
+            cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
352
+        }
353
+
354
+        obj->size = objstm->streambuf_len - obj->start;
355
+    }
356
+
357
+    /* Success! Add the object to the list of all objects found. */
358
+    pdf->nobjs++;
359
+    pdf->objs = cli_realloc2(pdf->objs, sizeof(struct pdf_obj*) * pdf->nobjs);
360
+    if (!pdf->objs) {
361
+        cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n");
362
+        status = CL_EMEM;
363
+        goto done;
364
+    }
365
+    pdf->objs[pdf->nobjs-1] = obj;
366
+
367
+    *obj_found = obj;
368
+
369
+    status = CL_SUCCESS;
370
+
371
+done:
372
+    if (CL_SUCCESS != status) {
373
+        if (NULL != obj) {
374
+            free(obj);
375
+        }
376
+    }
377
+    return status;
378
+}
379
+
380
+/**
381
+ * @brief Find the next *indirect* object.
382
+ * 
383
+ * Indirect objects begin with "obj" and end with "endobj".
384
+ * Identify objects that contain streams.
385
+ * Identify truncated objects. 
386
+ * 
387
+ * If found, pdf->offset will be updated to just after the "endobj".
388
+ * If truncated, pdf->offset will == pdf->size.
389
+ * If not found, pdf->offset will not be updated.
390
+ * 
391
+ * @param pdf   Pdf context struct that keeps track of all information found in the PDF. 
392
+ * 
393
+ * @return CL_SUCCESS  if success
394
+ * @return CL_BREAK    if no more objects
395
+ * @return CL_EPARSE   if parsing error
396
+ * @return CL_EMEM     if error allocating memory
212 397
  */
213
-int pdf_findobj(struct pdf_struct *pdf)
398
+cl_error_t pdf_findobj(struct pdf_struct *pdf)
214 399
 {
400
+    cl_error_t status = CL_EPARSE;
215 401
     const char *start, *q, *q2, *q3, *eof;
216
-    struct pdf_obj *obj;
402
+    struct pdf_obj *obj = NULL;
217 403
     off_t bytesleft;
218 404
     unsigned long genid, objid;
219 405
 
220 406
     pdf->nobjs++;
221
-    pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs);
407
+    pdf->objs = cli_realloc2(pdf->objs, sizeof(struct pdf_obj*) * pdf->nobjs);
222 408
     if (!pdf->objs) {
223
-        cli_warnmsg("cli_pdf: out of memory parsing objects (%u)\n", pdf->nobjs);
224
-        return -1;
409
+        status = CL_EMEM;
410
+        goto done;
411
+    }
412
+
413
+    obj = malloc(sizeof(struct pdf_obj));
414
+    if (!obj) {
415
+        status = CL_EMEM;
416
+        goto done;
225 417
     }
418
+    pdf->objs[pdf->nobjs-1] = obj;
226 419
 
227
-    obj = &pdf->objs[pdf->nobjs-1];
228 420
     memset(obj, 0, sizeof(*obj));
229
-    start = pdf->map+pdf->offset;
421
+
422
+    start = pdf->map + pdf->offset;
230 423
     bytesleft = pdf->size - pdf->offset;
231
-    while (bytesleft > 0) {
424
+
425
+    /* Indirect objects located outside of an object stream are prefaced with "obj"
426
+     * and suffixed with "endobj".  Find the "obj" preface. */
427
+    while (bytesleft > 0)
428
+    {
232 429
         q2 = cli_memstr(start, bytesleft, "obj", 3);
233
-        if (!q2)
234
-            return 0;/* no more objs */
430
+        if (!q2) {
431
+            status = CL_BREAK; /* no more objs */
432
+            goto done;
433
+        }
235 434
 
435
+        /* verify that "obj" has a whitespace before it, and is not the end of 
436
+         * a previous string like... "globj" */
236 437
         q2--;
237 438
         bytesleft -= q2 - start;
439
+
238 440
         if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) {
441
+            /* This instance of the "obj" string appears to be part of another string.
442
+             * Skip it, and keep searching for an object. */
239 443
             start = q2+4;
240 444
             bytesleft -= 4;
241 445
             continue;
242 446
         }
243 447
 
244
-        break;
448
+        break; /* Found it. q2 should point to the whitespace before the "obj" string */
245 449
     }
246 450
 
247
-    if (bytesleft <= 0)
248
-        return 0;
451
+    if (bytesleft <= 0) {
452
+        status = CL_BREAK; /* No "obj" found. */
453
+        goto done;
454
+    }
455
+
456
+    /* "obj" found! */
249 457
 
458
+    /* Find the generation id (genid) that appears before the "obj" */
250 459
     q = findNextNonWSBack(q2-1, start);
251 460
     while (q > start && isdigit(*q))
252 461
         q--;
253 462
 
254 463
     if (CL_SUCCESS != cli_strntoul_wrap(q, (size_t)(bytesleft + (q2-q)), 0, 10, &genid)) {
255
-        cli_dbgmsg("cli_pdf: Failed to parse object genid (%u)\n", pdf->nobjs);
464
+        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
256 465
         /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
257 466
         pdf->offset = q2 + 4 - pdf->map;
258
-        return 2;
467
+        status = CL_EPARSE;
468
+        goto done;
259 469
     }
470
+
471
+    /* Find the object id (objid) that appears before the genid */
260 472
     q = findNextNonWSBack(q-1,start);
261 473
     while (q > start && isdigit(*q))
262 474
         q--;
... ...
@@ -271,59 +544,82 @@ int pdf_findobj(struct pdf_struct *pdf)
271 271
             const char* lastfile = q - 4;
272 272
             if (0 != strncmp(lastfile, "\%\%EOF", 5)) {
273 273
                 /* Nope, wasn't %%EOF */
274
-                cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
274
+                cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
275 275
                 /* Skip past the "obj" thing, and continue. */
276 276
                 pdf->offset = q2 + 4 - pdf->map;
277
-                return 2;
277
+                status = CL_EPARSE;
278
+                goto done;
278 279
             }
279 280
             /* Yup, Looks, like the file continues after %%EOF.  
280 281
              * Probably another revision.  Keep parsing... */
281 282
             q++;
282
-            cli_dbgmsg("cli_pdf: \%\%EOF detected before end of file, at %zu\n", (size_t)q);
283
+            cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at %zu\n", (size_t)q);
283 284
         } else {
284 285
             /* Failed parsing at the very beginning */
285
-            cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
286
+            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
286 287
             /* Probably not a real object.  Skip past the "obj" thing, and continue. */
287 288
             pdf->offset = q2 + 4 - pdf->map;
288
-            return 2;
289
+            status = CL_EPARSE;
290
+            goto done;
289 291
         }
290 292
         /* Try again, with offset slightly adjusted */
291 293
         if (CL_SUCCESS != cli_strntoul_wrap(q, (size_t)(bytesleft + (q2-q)), 0, 10, &objid)) {
292
-            cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
294
+            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
293 295
             /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
294 296
             pdf->offset = q2 + 4 - pdf->map;
295
-            return 2;
297
+            status = CL_EPARSE;
298
+            goto done;
296 299
         }
297
-        cli_dbgmsg("cli_pdf: There appears to be an additional revision. Continuing to parse...\n");
300
+        cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
298 301
     }
299
-    obj->id = (objid << 8) | (genid&0xff);
300
-    obj->start = q2+4 - pdf->map;
302
+
303
+    /*
304
+     * Ok so we have the objid, genid, and "obj" string.
305
+     *   Time to store that information and then ...
306
+     *     ... investigate what kind of object this is.
307
+     */
308
+    obj->id = (objid << 8) | (genid & 0xff);
309
+    obj->start = q2+4 - pdf->map; /* obj start begins just after the "obj" string */
301 310
     obj->flags = 0;
311
+
302 312
     bytesleft -= 4;
303 313
     eof = pdf->map + pdf->size;
304 314
     q = pdf->map + obj->start;
305 315
 
306
-    while (q < eof && bytesleft > 0) {
316
+    while (q < eof && bytesleft > 0)
317
+    {
307 318
         off_t p_stream, p_endstream;
308 319
         q2 = pdf_nextobject(q, bytesleft);
309 320
         if (!q2)
310
-            q2 = pdf->map + pdf->size;
321
+            q2 = pdf->map + pdf->size; /* No interesting objects found, fast-forward to eof */
311 322
 
312 323
         bytesleft -= q2 - q;
313 324
         if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream, 1)) {
325
+            /*
326
+             * Found obj that contains a stream.
327
+             */
314 328
             obj->flags |= 1 << OBJ_STREAM;
315 329
             q2 = q-1 + p_endstream + 9;
316 330
             bytesleft -= q2 - q + 1;
317 331
 
318 332
             if (bytesleft < 0) {
333
+                /* ... and the stream is truncated.  Hmm... */
319 334
                 obj->flags |= 1 << OBJ_TRUNCATED;
320 335
                 pdf->offset = pdf->size;
321
-                return 1;/* truncated */
336
+
337
+                status = CL_SUCCESS;
338
+                goto done; /* Truncated file, no end to obj/stream. 
339
+                            * The next call to pdf_findobj() will return no more objects. */
322 340
             }
323 341
         } else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
342
+            /*
343
+             * obj found and offset positioned. ideal return case
344
+             */
324 345
             q2 = q3 + 6;
325
-            pdf->offset = q2 - pdf->map;
326
-            return 1; /* obj found and offset positioned */
346
+            pdf->offset = q2 - pdf->map; /* update the offset to just after the endobj */
347
+
348
+            status = CL_SUCCESS;
349
+            goto done; 
327 350
         } else {
328 351
             q2++;
329 352
             bytesleft--;
... ...
@@ -335,7 +631,32 @@ int pdf_findobj(struct pdf_struct *pdf)
335 335
     obj->flags |= 1 << OBJ_TRUNCATED;
336 336
     pdf->offset = pdf->size;
337 337
 
338
-    return 1;/* truncated */
338
+    status = CL_SUCCESS; /* truncated file, no end to obj. */
339
+
340
+done:
341
+    if (status == CL_SUCCESS) {
342
+        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + pdf->startoff));
343
+    }
344
+    else
345
+    {
346
+        if(status == CL_BREAK) {
347
+            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
348
+        } else if(status == CL_EMEM) {
349
+            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
350
+        } else {
351
+            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
352
+        }
353
+        /* Remove the unused obj reference from our list of objects found */
354
+        /* No need to realloc pdf->objs back down.  It won't leak. */
355
+        pdf->objs[pdf->nobjs-1] = NULL;
356
+        pdf->nobjs--;
357
+
358
+        /* Free up the obj struct. */
359
+        if (NULL != obj)
360
+            free(obj);
361
+    }
362
+
363
+    return status; 
339 364
 }
340 365
 
341 366
 static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum)
... ...
@@ -424,7 +745,7 @@ void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag
424 424
         break;
425 425
     }
426 426
 
427
-    cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff);
427
+    cli_dbgmsg("pdfobj_flag: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff);
428 428
 }
429 429
 
430 430
 struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid)
... ...
@@ -433,17 +754,20 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
433 433
     uint32_t i;
434 434
 
435 435
     /* search starting at previous obj (if exists) */
436
-    i = (obj != pdf->objs) ? obj - pdf->objs : 0;
436
+    for (i = 0; i < pdf->nobjs; i++) {
437
+        if (pdf->objs[i] == obj)
438
+            break;
439
+    }
437 440
 
438
-    for (j=i;j<pdf->nobjs;j++) {
439
-        obj = &pdf->objs[j];
441
+    for (j = i; j < pdf->nobjs; j++) {
442
+        obj = pdf->objs[j];
440 443
         if (obj->id == objid)
441 444
             return obj;
442 445
     }
443 446
 
444 447
     /* restart search from beginning if not found */
445
-    for (j=0;j<i;j++) {
446
-        obj = &pdf->objs[j];
448
+    for (j = 0; j < i; j++) {
449
+        obj = pdf->objs[j];
447 450
         if (obj->id == objid)
448 451
             return obj;
449 452
     }
... ...
@@ -451,72 +775,173 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
451 451
     return NULL;
452 452
 }
453 453
 
454
-static int find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *start, off_t len)
454
+/**
455
+ * @brief   Find and interpret the "/Length" dictionary key value.
456
+ * 
457
+ * The value may be:
458
+ *  - a direct object (i.e. just a number) 
459
+ *  - an indirect object, where the value is somewhere else in the document and we have to look it up.
460
+ *    Indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'.
461
+ * 
462
+ * Example dictionary whose "/Length" key relies on a direct object for the value.
463
+ * 
464
+ *      1 0 obj
465
+ *          << /Length 534
466
+ *              /Filter [ /ASCII85Decode /LZWDecode ]
467
+ *          >>
468
+ *          stream
469
+ *              J..)6T`?p&<!J9%_[umg"B7/Z7KNXbN'S+,*Q/&"OLT'FLIDK#!n`$"<Atdi`\Vn%b%)&'cA*VnK\CJY(sF>c!Jnl@
470
+ *              RM]WM;jjH6Gnc75idkL5]+cPZKEBPWdR>FF(kj1_R%W_d&/jS!;iuad7h?[L-F$+]]0A3Ck*$I0KZ?;<)CJtqi65Xb
471
+ *              Vc3\n5ua:Q/=0$W<#N3U;H,MQKqfg1?:lUpR;6oN[C2E4ZNr8Udn.'p+?#X+1>0Kuk$bCDF/(3fL5]Oq)^kJZ!C2H1
472
+ *              'TO]Rl?Q:&'<5&iP!$Rq;BXRecDN[IJB`,)o8XJOSJ9sDS]hQ;Rj@!ND)bD_q&C\g:inYC%)&u#:u,M6Bm%IY!Kb1+
473
+ *              ":aAa'S`ViJglLb8<W9k6Yl\\0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop"5KatWRT"JQ#qYuL,
474
+ *              JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
475
+ *          endstream
476
+ *      endobj
477
+ * 
478
+ * Example dictionary with a single key "/Length" that relies on an indirect object for the value.
479
+ * 
480
+ *      7 0 obj
481
+ *          << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
482
+ *          stream
483
+ *              BT
484
+ *                  /F1 12 Tf
485
+ *                   72 712 Td
486
+ *                  ( A stream with an indirect length ) Tj
487
+ *              ET
488
+ *          endstream
489
+ *      endobj
490
+ * 
491
+ *      8 0 obj
492
+ *          77 % The length of the preceding stream
493
+ *      endobj
494
+ * 
495
+ * @param pdf       Pdf context structure.
496
+ * @param obj       Pdf object context structure.
497
+ * @param dict_start Pointer to the start of the dictionary string.
498
+ * @param dict_len  Remaining length of the dictionary string in bytes.
499
+ * @return size_t   Unsigned integer value of the "/Length" key, or 0 if it cannot be found or parsed.
500
+ */
501
+static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *dict_start, size_t dict_len)
455 502
 {
456
-    unsigned long length;
457
-    const char *q;
503
+    size_t length = 0;
504
+    const char *obj_start = dict_start;
505
+    size_t bytes_remaining = dict_len;
506
+    unsigned long length_ul = 0;
507
+    const char *index;
458 508
 
459
-    q = cli_memstr(start, len, "/Length", 7);
460
-    if (!q)
509
+    if (bytes_remaining < 8) {
461 510
         return 0;
511
+    }
462 512
 
463
-    q++;
464
-    len -= q - start;
465
-    start = pdf_nextobject(q, len);
466
-    if (!start)
513
+    /*
514
+     * Find the "/Length" dictionary key
515
+     */
516
+    index = cli_memstr(obj_start, bytes_remaining, "/Length", 7);
517
+    if (!index)
467 518
         return 0;
468 519
 
469
-    len -= start - q;
470
-    q = start;
471
-    if (CL_SUCCESS != cli_strntoul_wrap(q, (size_t)len, 0, 10, &length)) {
472
-        cli_dbgmsg("cli_pdf: failed to parse object length\n");
520
+    if (bytes_remaining < 1) {
473 521
         return 0;
474 522
     }
475 523
 
476
-    while (isdigit(*q) && len > 0) {
477
-        q++;
478
-        len--;
524
+    /* Step the index into the "/Length" string. */
525
+    index++;
526
+    bytes_remaining -= index - obj_start;
527
+
528
+    /* Find the start of the next direct or indirect object.
529
+     * pdf_nextobject() assumes we started searching from within a previous object */
530
+    obj_start = pdf_nextobject(index, bytes_remaining);
531
+    if (!obj_start)
532
+        return 0;
533
+
534
+    if (bytes_remaining < obj_start - index) {
535
+        return 0;
536
+    }
537
+    bytes_remaining -= obj_start - index;
538
+    index = obj_start;
539
+    
540
+    /* Read the value.  This could either be the direct length value,
541
+       or the object id of the indirect object that has the length */
542
+    if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &length_ul)) {
543
+        cli_dbgmsg("find_length: failed to parse object length\n");
544
+        return 0;
545
+    }
546
+    length = length_ul; /* length or maybe object id */
547
+
548
+    /* 
549
+     * Keep parsing, skipping past the first integer that might have been what we wanted. 
550
+     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' 
551
+     * I.e. something like " 0 R" 
552
+     */
553
+    while ((bytes_remaining > 0) && isdigit(*index)) {
554
+        index++;
555
+        bytes_remaining--;
479 556
     }
480 557
 
481
-    if (*q == ' ' && len > 0) {
558
+    if ((bytes_remaining > 0) && (*index == ' ')) {
482 559
         unsigned long genid;
483
-        q++;
484
-        len--;
485
-        if (CL_SUCCESS != cli_strntoul_wrap(q, (size_t)len, 0, 10, &genid)) {
486
-            cli_dbgmsg("cli_pdf: failed to parse object genid\n");
560
+
561
+        index++;
562
+        bytes_remaining--;
563
+
564
+        if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &genid)) {
565
+            cli_dbgmsg("find_length: failed to parse object genid\n");
487 566
             return 0;
488 567
         }
489 568
 
490
-        while(isdigit(*q) && len > 0) {
491
-            q++;
492
-            len--;
569
+        while((bytes_remaining > 0) && isdigit(*index)) {
570
+            index++;
571
+            bytes_remaining--;
572
+        }
573
+
574
+        if (bytes_remaining < 2) {
575
+            return 0;
493 576
         }
494 577
 
495
-        if (q[0] == ' ' && q[1] == 'R') {
496
-            cli_dbgmsg("cli_pdf: length is in indirect object %lu %lu\n", length, genid);
578
+        if (index[0] == ' ' && index[1] == 'R') {
579
+            /* 
580
+             * Ok so we found a genid and that 'R'.  Which means that first value 
581
+             * was actually the objid.
582
+             * We can look up the indirect object using this information.
583
+             */
584
+            unsigned long objid = length;
585
+            const char* indirect_obj_start = NULL;
586
+            
587
+            cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid);
497 588
 
498 589
             obj = find_obj(pdf, obj, (length << 8) | (genid&0xff));
499 590
             if (!obj) {
500
-                cli_dbgmsg("cli_pdf: indirect object not found\n");
591
+                cli_dbgmsg("find_length: indirect object not found\n");
501 592
                 return 0;
502 593
             }
503 594
 
504
-            q = pdf_nextobject(pdf->map+obj->start, pdf->size - obj->start);
505
-            if (!q) {
506
-                cli_dbgmsg("cli_pdf: next object not found\n");
595
+            indirect_obj_start = pdf->map + obj->start;
596
+            bytes_remaining = pdf->size - obj->start;
597
+            
598
+            /* Ok so we found the indirect object, let's read the value. */
599
+            index = pdf_nextobject(indirect_obj_start, bytes_remaining);
600
+            if (!index) {
601
+                cli_dbgmsg("find_length: next object not found\n");
507 602
                 return 0;
508 603
             }
604
+            
605
+            if (bytes_remaining < index - indirect_obj_start) {
606
+                return 0;
607
+            }
608
+            bytes_remaining -= index - indirect_obj_start;
509 609
 
510
-            if (CL_SUCCESS != cli_strntoul_wrap(q, (size_t)len, 0, 10, &length)) {
511
-                cli_dbgmsg("cli_pdf: failed to parse object length from indirect object\n");
610
+            /* Found the value, so let's parse it as an unsigned long */
611
+            if (CL_SUCCESS != cli_strntoul_wrap(index, bytes_remaining, 0, 10, &length)) {
612
+                cli_dbgmsg("find_length: failed to parse object length from indirect object\n");
512 613
                 return 0;
513 614
             }
514 615
         }
515 616
     }
516 617
 
517 618
     /* limit length */
518
-    if (start - pdf->map + length+5 > pdf->size)
519
-        length = pdf->size - (start - pdf->map)-5;
619
+    if (obj_start - pdf->map + length + 5 > pdf->size)
620
+        length = pdf->size - (obj_start - pdf->map) - 5;
520 621
 
521 622
     return length;
522 623
 }
... ...
@@ -525,36 +950,98 @@ static int find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
525 525
 
526 526
 static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary)
527 527
 {
528
-    unsigned i = obj - pdf->objs;
528
+    if (0 == obj->size)
529
+    {
530
+        /*
531
+         * Programmatically determine size if not already known.
532
+         */
533
+        unsigned i = 0;
534
+
535
+        /* Find the index of the current object */
536
+        for (i = 0; i < pdf->nobjs; i++) {
537
+            if (pdf->objs[i] == obj)
538
+                break;
539
+        }
540
+
541
+        /* Find the next object that exists in the same buffer (pdf fmap, or object stream) */
542
+        if (i < pdf->nobjs) {
543
+            i++;
544
+        }
545
+
546
+        if (obj->objstm == NULL) {
547
+            /* Current object isn't in an object stream, we want to find
548
+             * the next object that also isn't in an object stream. */
549
+            for ( ; i < pdf->nobjs; i++) {
550
+                if (pdf->objs[i]->objstm == NULL)
551
+                    break;
552
+            }
553
+        } else {
554
+            /* Current object is in an object stream, we want to find
555
+             * the next object that is in the same object stream.
556
+             *
557
+             * This really shouldn't happen, so throw a warning and
558
+             * then see if we can solve it anyhow */
559
+            cli_warnmsg("obj_size: Encountered pdf object in an object stream that has an unknown size!!\n");
560
+
561
+            for ( ; i < pdf->nobjs; i++) {
562
+                if (pdf->objs[i]->objstm == obj->objstm)
563
+                    break;
564
+            }
565
+        }
566
+
567
+        /* Step backwards from the "next" object to find the end of the current object */
568
+        if (i < pdf->nobjs) {
569
+            int s = pdf->objs[i]->start - obj->start - 4;
570
+            if (s > 0) {
571
+                if (!binary) {
572
+                    const char *p = NULL;
573
+                    const char *q = NULL;
574
+
575
+                    if (obj->objstm == NULL) {
576
+                        p = pdf->map + obj->start;
577
+                    } else {
578
+                        p = obj->objstm->streambuf + obj->start;
579
+                    }
580
+                    q = p + s;
529 581
 
530
-    i++;
531
-    if (i < pdf->nobjs) {
532
-        int s = pdf->objs[i].start - obj->start - 4;
533
-        if (s > 0) {
534
-            if (!binary) {
535
-                const char *p = pdf->map + obj->start;
536
-                const char *q = p + s;
582
+                    while (q > p && (isspace(*q) || isdigit(*q)))
583
+                        q--;
537 584
 
538
-                while (q > p && (isspace(*q) || isdigit(*q)))
539
-                       q--;
585
+                    if (q > p+5 && !memcmp(q-5,"endobj",6))
586
+                        q -= 6;
540 587
 
541
-                if (q > p+5 && !memcmp(q-5,"endobj",6))
542
-                    q -= 6;
588
+                    q = findNextNonWSBack(q, p);
589
+                    q++;
543 590
 
544
-                q = findNextNonWSBack(q, p);
545
-                q++;
591
+                    obj->size = q - p;
592
+                    goto done;
593
+                }
546 594
 
547
-                return q - p;
595
+                obj->size = s;
596
+                goto done;
548 597
             }
598
+        }
549 599
 
550
-            return s;
600
+        /* If we've gotten this far, we didn't find a "next" object... so our 
601
+         * current object must be at the end of the pdf fmap or the end of the 
602
+         * object stream. */
603
+        if (obj->objstm == NULL) {
604
+            /* Current object isn't in an object stream, so we can determine object 
605
+             * size based on the remaining size of the file (in theory). */
606
+            if (binary)
607
+                obj->size = pdf->size - obj->start;
608
+            else
609
+                obj->size = pdf->offset - obj->start - 6; /* This hack I think assumes that we reached the end of the file when finding objects. */
610
+        } else {
611
+            /* Current object is in an object stream, so use the remaining
611
+             * length of the object stream buffer as its size. */
613
+            obj->size = obj->objstm->streambuf_len - obj->start;
551 614
         }
552 615
     }
553 616
 
554
-    if (binary)
555
-        return pdf->size - obj->start;
617
+done:
556 618
 
557
-    return pdf->offset - obj->start - 6;
619
+    return obj->size;
558 620
 }
559 621
 
560 622
 static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid)
... ...
@@ -568,7 +1055,7 @@ static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, i
568 568
 
569 569
     bc_ctx = cli_bytecode_context_alloc();
570 570
     if (!bc_ctx) {
571
-        cli_errmsg("cli_pdf: can't allocate memory for bc_ctx");
571
+        cli_errmsg("run_pdf_hooks: can't allocate memory for bc_ctx\n");
572 572
         return CL_EMEM;
573 573
     }
574 574
 
... ...
@@ -576,7 +1063,7 @@ static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, i
576 576
     if (fd != -1) {
577 577
         map = fmap(fd, 0, 0);
578 578
         if (!map) {
579
-            cli_dbgmsg("can't mmap pdf extracted obj\n");
579
+            cli_dbgmsg("run_pdf_hooks: can't mmap pdf extracted obj\n");
580 580
             map = *ctx->fmap;
581 581
             fd = -1;
582 582
         }
... ...
@@ -603,15 +1090,15 @@ static void aes_decrypt(const unsigned char *in, size_t *length, unsigned char *
603 603
     unsigned char pad, i;
604 604
     int nrounds;
605 605
 
606
-    cli_dbgmsg("cli_pdf: aes_decrypt: key length: %d, data length: %zu\n", key_n, *length);
606
+    cli_dbgmsg("aes_decrypt: key length: %d, data length: %zu\n", key_n, *length);
607 607
     if (key_n > 32) {
608
-        cli_dbgmsg("cli_pdf: aes_decrypt: key length is %d!\n", key_n*8);
608
+        cli_dbgmsg("aes_decrypt: key length is %d!\n", key_n*8);
609 609
         return;
610 610
     }
611 611
 
612 612
     if (len < 32) {
613
-        cli_dbgmsg("cli_pdf: aes_decrypt: len is <32: %zu\n", len);
614
-        noisy_warnmsg("cli_pdf: aes_decrypt: len is <32: %zu\n", len);
613
+        cli_dbgmsg("aes_decrypt: len is <32: %zu\n", len);
614
+        noisy_warnmsg("aes_decrypt: len is <32: %zu\n", len);
615 615
         return;
616 616
     }
617 617
 
... ...
@@ -626,7 +1113,7 @@ static void aes_decrypt(const unsigned char *in, size_t *length, unsigned char *
626 626
     cli_dbgmsg("aes_decrypt: Calling rijndaelSetupDecrypt\n");
627 627
     nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n*8);
628 628
     if (!nrounds) {
629
-    cli_dbgmsg("cli_pdf: aes_decrypt: nrounds = 0\n");
629
+    cli_dbgmsg("aes_decrypt: nrounds = 0\n");
630 630
     return;
631 631
     }
632 632
     cli_dbgmsg("aes_decrypt: Beginning rijndaelDecrypt\n");
... ...
@@ -649,8 +1136,8 @@ static void aes_decrypt(const unsigned char *in, size_t *length, unsigned char *
649 649
         pad = q[-1];
650 650
 
651 651
         if (pad > 0x10) {
652
-            cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %zu)\n", pad, len-16);
653
-            noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %zu)\n", pad, len-16);
652
+            cli_dbgmsg("aes_decrypt: bad pad: %x (extra len: %zu)\n", pad, len-16);
653
+            noisy_warnmsg("aes_decrypt: bad pad: %x (extra len: %zu)\n", pad, len-16);
654 654
             *length -= len;
655 655
             return;
656 656
         }
... ...
@@ -658,8 +1145,8 @@ static void aes_decrypt(const unsigned char *in, size_t *length, unsigned char *
658 658
         q -= pad;
659 659
         for (i=1;i<pad;i++) {
660 660
             if (q[i] != pad) {
661
-                cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad);
662
-                noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad);
661
+                cli_dbgmsg("aes_decrypt: bad pad: %x != %x\n",q[i],pad);
662
+                noisy_warnmsg("aes_decrypt: bad pad: %x != %x\n",q[i],pad);
663 663
                 *length -= len;
664 664
 
665 665
                 return;
... ...
@@ -671,7 +1158,7 @@ static void aes_decrypt(const unsigned char *in, size_t *length, unsigned char *
671 671
 
672 672
     *length -= len;
673 673
 
674
-    cli_dbgmsg("cli_pdf: aes_decrypt: length is %zu\n", *length);
674
+    cli_dbgmsg("aes_decrypt: length is %zu\n", *length);
675 675
 }
676 676
 
677 677
 
... ...
@@ -682,7 +1169,7 @@ char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *l
682 682
     struct arc4_state arc4;
683 683
 
684 684
     if (!length || !*length || !in) {
685
-        noisy_warnmsg("decrypt failed for obj %u %u\n", id>>8, id&0xff);
685
+        noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u\n", id>>8, id&0xff);
686 686
         return NULL;
687 687
     }
688 688
 
... ...
@@ -726,20 +1213,20 @@ char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *l
726 726
         arc4_init(&arc4, result, n);
727 727
         arc4_apply(&arc4, q, (unsigned)*length); /* TODO: may truncate for very large lengths */
728 728
 
729
-        noisy_msg(pdf, "decrypted ARC4 data\n");
729
+        noisy_msg(pdf, "decrypt_any: decrypted ARC4 data\n");
730 730
 
731 731
         break;
732 732
     case ENC_AESV2:
733 733
         cli_dbgmsg("cli_pdf: enc is aesv2\n");
734 734
         aes_decrypt((const unsigned char *)in, length, q, (char *)result, n, 1);
735 735
 
736
-        noisy_msg(pdf, "decrypted AES(v2) data\n");
736
+        noisy_msg(pdf, "decrypt_any: decrypted AES(v2) data\n");
737 737
 
738 738
         break;
739 739
     case ENC_AESV3:
740
-        cli_dbgmsg("cli_pdf: enc is aesv3\n");
740
+        cli_dbgmsg("decrypt_any: enc is aesv3\n");
741 741
         if (pdf->keylen == 0) {
742
-            cli_dbgmsg("cli_pdf: no key\n");
742
+            cli_dbgmsg("decrypt_any: no key\n");
743 743
             return NULL;
744 744
         }
745 745
 
... ...
@@ -749,21 +1236,21 @@ char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *l
749 749
 
750 750
         break;
751 751
     case ENC_IDENTITY:
752
-        cli_dbgmsg("cli_pdf: enc is identity\n");
752
+        cli_dbgmsg("decrypt_any: enc is identity\n");
753 753
         memcpy(q, in, *length);
754 754
 
755
-        noisy_msg(pdf, "identity encryption\n");
755
+        noisy_msg(pdf, "decrypt_any: identity encryption\n");
756 756
 
757 757
         break;
758 758
     case ENC_NONE:
759
-        cli_dbgmsg("cli_pdf: enc is none\n");
759
+        cli_dbgmsg("decrypt_any: enc is none\n");
760 760
 
761 761
         noisy_msg(pdf, "encryption is none\n");
762 762
 
763 763
         free(q);
764 764
         return NULL;
765 765
     case ENC_UNKNOWN:
766
-        cli_dbgmsg("cli_pdf: enc is unknown\n");
766
+        cli_dbgmsg("decrypt_any: enc is unknown\n");
767 767
         free(q);
768 768
 
769 769
         noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n",
... ...
@@ -838,7 +1325,8 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
838 838
     char fullname[1024];
839 839
     char outbuff[BUFSIZ];
840 840
     char inbuf[BUFSIZ];
841
-    int fout, n, rc;
841
+    int fout, n;
842
+    cl_error_t rc;
842 843
     enum cstate st = CSTATE_NONE;
843 844
 
844 845
     snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u_c", pdf->dir, (pdf->files-1));
... ...
@@ -846,7 +1334,7 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
846 846
     if (fout < 0) {
847 847
         char err[128];
848 848
 
849
-        cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
849
+        cli_errmsg("pdf_scan_contents: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
850 850
         return CL_ETMPFILE;
851 851
     }
852 852
 
... ...
@@ -872,20 +1360,19 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
872 872
     return rc;
873 873
 }
874 874
 
875
-static const char *pdf_getdict(const char *q0, int* len, const char *key);
876
-static char *pdf_readval(const char *q, int len, const char *key);
877
-static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape);
878
-
879 875
 int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
880 876
 {
881 877
     char fullname[NAME_MAX + 1];
882 878
     int fout;
883 879
     ptrdiff_t sum = 0;
884
-    int rc = CL_SUCCESS;
880
+    cl_error_t rc = CL_SUCCESS;
885 881
     int dump = 1;
886 882
 
887 883
     cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id>>8, obj->id&0xff);
888 884
 
885
+    if (obj->objstm)
886
+        cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n");
887
+
889 888
     /* TODO: call bytecode hook here, allow override dumpability */
890 889
     if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) {
891 890
         /* don't dump all streams */
... ...
@@ -905,13 +1392,13 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
905 905
     if (!dump)
906 906
         return CL_CLEAN;
907 907
 
908
-    cli_dbgmsg("cli_pdf: dumping obj %u %u\n", obj->id>>8, obj->id&0xff);
908
+    cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id>>8, obj->id&0xff);
909 909
 
910 910
     snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
911 911
     fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
912 912
     if (fout < 0) {
913 913
         char err[128];
914
-        cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
914
+        cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
915 915
 
916 916
         return CL_ETMPFILE;
917 917
     }
... ...
@@ -925,6 +1412,11 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
925 925
             off_t p_stream = 0, p_endstream = 0;
926 926
             off_t length;
927 927
 
928
+            if (NULL != obj->objstm) {
929
+                cli_warnmsg("pdf_extract_obj: Object found in object stream claims to be an object stream! Skipping.\n");
930
+                break;
931
+            }
932
+
928 933
             find_stream_bounds(start, pdf->size - obj->start,
929 934
                        pdf->size - obj->start,
930 935
                        &p_stream, &p_endstream,
... ...
@@ -937,6 +1429,7 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
937 937
                 int len = p_stream;
938 938
                 const char *pstr;
939 939
                 struct pdf_dict *dparams = NULL;
940
+                struct objstm_struct *objstm = NULL;
940 941
                 int xref = 0;
941 942
 
942 943
                 length = find_length(pdf, obj, start, p_stream);
... ...
@@ -970,7 +1463,7 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
970 970
                     if (length < 0)
971 971
                         length = 0;
972 972
 
973
-                    cli_dbgmsg("cli_pdf: calculated length %lld\n", (long long)length);
973
+                    cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
974 974
                 } else {
975 975
                     if (size > (size_t)length+2) {
976 976
                         cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
... ...
@@ -980,7 +1473,7 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
980 980
                 }
981 981
 
982 982
                 if (orig_length && size > (size_t)orig_length + 20) {
983
-                    cli_dbgmsg("cli_pdf: orig length: %lld, length: %lld, size: %zu\n",
983
+                    cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
984 984
                                (long long)orig_length, (long long)length, size);
985 985
                     pdfobj_flag(pdf, obj, BAD_STREAMLEN);
986 986
                 }
... ...
@@ -998,12 +1491,20 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
998 998
 
999 999
                 cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1000 1000
 
1001
-                pstr = pdf_getdict(start, &len, "/DecodeParms");
1002
-                if (!pstr)
1003
-                    pstr = pdf_getdict(start, &len, "/DP");
1001
+                /*
1002
+                 * Identify the DecodeParms, if available.
1003
+                 */
1004
+                if (NULL != (pstr = pdf_getdict(start, &len, "/DecodeParms")))
1005
+                {
1006
+                    cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1007
+                }
1008
+                else if (NULL != (pstr = pdf_getdict(start, &len, "/DP")))
1009
+                {
1010
+                    cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1011
+                }
1004 1012
 
1005 1013
                 if (pstr) {
1006
-                    unsigned int objsz = obj_size(pdf, obj, 1);
1014
+                    unsigned int objsize = obj_size(pdf, obj, 1);
1007 1015
 
1008 1016
                     /* shift pstr left to "<<" for pdf_parse_dict */
1009 1017
                     while ((*pstr == '<') && (pstr > start)) {
... ...
@@ -1018,12 +1519,102 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1018 1018
                     }
1019 1019
 
1020 1020
                     if (len > 4)
1021
-                        dparams = pdf_parse_dict(pdf, obj, objsz, (char *)pstr, NULL);
1021
+                        dparams = pdf_parse_dict(pdf, obj, objsize, (char *)pstr, NULL);
1022 1022
                     else
1023
-                        cli_dbgmsg("cli_pdf: failed to locate DecodeParms dictionary start\n");
1023
+                        cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1024
+                }
1025
+
1026
+                /*
1027
+                 * Identify if the stream is an object stream. If so, collect the relevant info. 
1028
+                 */
1029
+                len = p_stream;
1030
+                if (NULL != (pstr = pdf_getdict(start, &len, "/Type/ObjStm")))
1031
+                {
1032
+                    int32_t objstm_first = -1;
1033
+                    int32_t objstm_length = -1;
1034
+                    int32_t objstm_n = -1;
1035
+
1036
+                    cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1037
+
1038
+                    len = p_stream;
1039
+                    if ((-1 == (objstm_first = pdf_readint(start, len, "/First"))))
1040
+                    {
1041
+                        cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1042
+                    }
1043
+                    else if ((-1 == (objstm_length = pdf_readint(start, len, "/Length"))))
1044
+                    {
1045
+                        cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1046
+                    }
1047
+                    else if ((-1 == (objstm_n = pdf_readint(start, len, "/N"))))
1048
+                    {
1049
+                        cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1050
+                    }
1051
+                    else
1052
+                    {
1053
+                        /* Add objstm to pdf struct, so it can be freed eventually */
1054
+                        pdf->nobjstms++;
1055
+                        pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1056
+                        if (!pdf->objstms) {
1057
+                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1058
+                            pdf_free_dict(dparams);
1059
+                            return CL_EMEM;
1060
+                        }
1061
+
1062
+                        objstm = malloc(sizeof(struct objstm_struct));
1063
+                        if (!objstm) {
1064
+                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1065
+                            pdf_free_dict(dparams);
1066
+                            return CL_EMEM;
1067
+                        }
1068
+                        pdf->objstms[pdf->nobjstms-1] = objstm;
1069
+
1070
+                        memset(objstm, 0, sizeof(*objstm));
1071
+
1072
+                        objstm->first =         (uint32_t)objstm_first;
1073
+                        objstm->current =       (uint32_t)objstm_first;
1074
+                        objstm->current_pair =  0;
1075
+                        objstm->length =        (uint32_t)objstm_length;
1076
+                        objstm->n =             (uint32_t)objstm_n;
1077
+
1078
+                        cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1079
+                        cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1080
+                        cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1081
+                    }
1082
+                }
1083
+
1084
+                sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc, objstm);
1085
+                if (sum < 0) {
1086
+                    /*
1087
+                    * If we were expecting an objstm and there was a failure...
1088
+                    *   discard the memory for last object stream.
1089
+                    */
1090
+                    if (NULL != objstm)
1091
+                    {
1092
+                        if (NULL != pdf->objstms) {
1093
+                            if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1094
+                                pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1095
+
1096
+                                free(pdf->objstms[pdf->nobjstms - 1]);
1097
+                                pdf->objstms[pdf->nobjstms - 1] = NULL;
1098
+                            }
1099
+
1100
+                            /* Pop the objstm off the end of the pdf->objstms array. */
1101
+                            if (pdf->nobjstms > 0) {
1102
+                                pdf->nobjstms--;
1103
+                                pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1104
+
1105
+                                if (!pdf->objstms) {
1106
+                                    cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1107
+                                    return CL_EMEM;
1108
+                                }
1109
+                            } else {
1110
+                                /* hm.. this shouldn't happen */
1111
+                                cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1112
+                            }
1113
+                        }
1114
+                    }
1024 1115
                 }
1025 1116
 
1026
-                sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc);
1027 1117
                 if (dparams)
1028 1118
                     pdf_free_dict(dparams);
1029 1119
 
... ...
@@ -1034,14 +1625,17 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1034 1034
 
1035 1035
                 cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1036 1036
             } else {
1037
-                noisy_warnmsg("cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff);
1037
+                noisy_warnmsg("pdf_extract_obj: cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff);
1038 1038
             }
1039 1039
 
1040 1040
         } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1041 1041
             const char *q2;
1042
-            const char *q = pdf->map+obj->start;
1042
+            const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1043
+                                          : (const char *)(obj->start + pdf->map);
1044
+
1043 1045
             /* TODO: get obj-endobj size */
1044 1046
             off_t bytesleft = obj_size(pdf, obj, 0);
1047
+
1045 1048
             if (bytesleft < 0)
1046 1049
                 break;
1047 1050
 
... ...
@@ -1066,11 +1660,11 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1066 1066
                     const char *out = js;
1067 1067
                     js_len = strlen(js);
1068 1068
                     if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1069
-                        cli_dbgmsg("cli_pdf: encrypted string\n");
1069
+                        cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1070 1070
                         decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1071 1071
 
1072 1072
                         if (decrypted) {
1073
-                            noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1073
+                            noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1074 1074
                             out = decrypted;
1075 1075
                         }
1076 1076
                     }
... ...
@@ -1083,7 +1677,7 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1083 1083
 
1084 1084
                     free(decrypted);
1085 1085
                     free(js);
1086
-                    cli_dbgmsg("bytesleft: %d\n", (int)bytesleft);
1086
+                    cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1087 1087
 
1088 1088
                     if (bytesleft > 0) {
1089 1089
                         q2 = pdf_nextobject(q, bytesleft);
... ...
@@ -1117,8 +1711,8 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1117 1117
         }
1118 1118
     } while (0);
1119 1119
 
1120
-    cli_dbgmsg("cli_pdf: extracted %td bytes %u %u obj\n", sum, obj->id>>8, obj->id&0xff);
1121
-    cli_dbgmsg("         ... to %s\n", fullname);
1120
+    cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id>>8, obj->id&0xff);
1121
+    cli_dbgmsg("pdf_extract_obj:         ... to %s\n", fullname);
1122 1122
 
1123 1123
     if (flags & PDF_EXTRACT_OBJ_SCAN && sum) {
1124 1124
         int rc2;
... ...
@@ -1132,20 +1726,25 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1132 1132
             rc = rc2;
1133 1133
 
1134 1134
         if ((rc == CL_CLEAN) || ((rc == CL_VIRUS) && (pdf->ctx->options & CL_SCAN_ALLMATCHES))) {
1135
-            rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, obj - pdf->objs);
1135
+            unsigned int dumpid = 0;
1136
+            for (dumpid = 0; dumpid < pdf->nobjs; dumpid++) {
1137
+                if (pdf->objs[dumpid] == obj)
1138
+                    break;
1139
+            }
1140
+            rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, dumpid);
1136 1141
             if (rc2 == CL_VIRUS)
1137 1142
                 rc = rc2;
1138 1143
         }
1139 1144
 
1140 1145
         if (((rc == CL_CLEAN) || ((rc == CL_VIRUS) && (pdf->ctx->options & CL_SCAN_ALLMATCHES))) && (obj->flags & (1 << OBJ_CONTENTS))) {
1141 1146
             lseek(fout, 0, SEEK_SET);
1142
-            cli_dbgmsg("cli_pdf: dumping contents %u %u\n", obj->id>>8, obj->id&0xff);
1147
+            cli_dbgmsg("pdf_extract_obj: dumping contents %u %u\n", obj->id>>8, obj->id&0xff);
1143 1148
 
1144 1149
             rc2 = pdf_scan_contents(fout, pdf);
1145 1150
             if (rc2 == CL_VIRUS)
1146 1151
                 rc = rc2;
1147 1152
 
1148
-            noisy_msg(pdf, "extracted text from obj %u %u\n", obj->id>>8, obj->id&0xff);
1153
+            noisy_msg(pdf, "pdf_extract_obj: extracted text from obj %u %u\n", obj->id>>8, obj->id&0xff);
1149 1154
         }
1150 1155
     }
1151 1156
 
... ...
@@ -1291,7 +1890,7 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
1291 1291
         /* these are digital signature objects, filter doesn't matter,
1292 1292
          * we don't need them anyway */
1293 1293
         if (*state == STATE_FILTER && !(obj->flags & (1 << OBJ_SIGNED)) && !(obj->flags & KNOWN_FILTERS)) {
1294
-            cli_dbgmsg("cli_pdf: unknown filter %s\n", pdfname);
1294
+            cli_dbgmsg("handle_pdfname: unknown filter %s\n", pdfname);
1295 1295
             obj->flags |= 1 << OBJ_FILTER_UNKNOWN;
1296 1296
         }
1297 1297
 
... ...
@@ -1305,7 +1904,7 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
1305 1305
     if ((act->nameflags & NAMEFLAG_HEURISTIC) && escapes) {
1306 1306
         /* if a commonly used PDF name is escaped that is certainly
1307 1307
            suspicious. */
1308
-        cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname);
1308
+        cli_dbgmsg("handle_pdfname: pdfname %s is escaped\n", pdfname);
1309 1309
         pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
1310 1310
     }
1311 1311
 
... ...
@@ -1318,7 +1917,7 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
1318 1318
         *state = act->to_state;
1319 1319
 
1320 1320
         if (*state == STATE_FILTER && act->set_objflag != OBJ_DICT && (obj->flags & (1 << act->set_objflag))) {
1321
-            cli_dbgmsg("cli_pdf: duplicate stream filter %s\n", pdfname);
1321
+            cli_dbgmsg("handle_pdfname: duplicate stream filter %s\n", pdfname);
1322 1322
             pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
1323 1323
         }
1324 1324
 
... ...
@@ -1335,8 +1934,6 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
1335 1335
     }
1336 1336
 }
1337 1337
 
1338
-static int pdf_readint(const char *q0, int len, const char *key);
1339
-
1340 1338
 static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
1341 1339
 {
1342 1340
     const char *q, *q2;
... ...
@@ -1361,7 +1958,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
1361 1361
     q = q2;
1362 1362
 
1363 1363
     if (CL_SUCCESS != cli_strntoul_wrap(q2, (size_t)len, 0, 10, &objid)) {
1364
-        cli_dbgmsg("cli_pdf: Found Encrypt dictionary but failed to parse objid\n");
1364
+        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse objid\n");
1365 1365
         return;
1366 1366
     }
1367 1367
     objid = objid << 8;
... ...
@@ -1372,7 +1969,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
1372 1372
     q = q2;
1373 1373
 
1374 1374
     if (CL_SUCCESS != cli_strntoul_wrap(q2, (size_t)len, 0, 10, &genid)) {
1375
-        cli_dbgmsg("cli_pdf: Found Encrypt dictionary but failed to parse genid\n");
1375
+        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse genid\n");
1376 1376
         return;
1377 1377
     }
1378 1378
     objid |= genid & 0xff; 
... ...
@@ -1380,7 +1977,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
1380 1380
     if (!q2 || *q2 != 'R')
1381 1381
         return;
1382 1382
 
1383
-    cli_dbgmsg("cli_pdf: Encrypt dictionary in obj %lu %lu\n", objid>>8, objid&0xff);
1383
+    cli_dbgmsg("pdf_parse_encrypt: Encrypt dictionary in obj %lu %lu\n", objid>>8, objid&0xff);
1384 1384
 
1385 1385
     pdf->enc_objid = objid;
1386 1386
 }
... ...
@@ -1410,18 +2007,21 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1410 1410
     char pdfname[64];
1411 1411
     const char *q2, *q3;
1412 1412
     const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
1413
-    const char *q = obj->start + pdf->map;
1414
-    const char *dict, *enddict, *start;
1415
-    off_t dict_length, full_dict_length;
1416
-    off_t objsize = obj_size(pdf, obj, 1);
1417
-    off_t bytesleft;
1418
-    size_t i;
1419
-    unsigned filters=0, blockopens=0;
1413
+    const char *q = NULL;
1414
+    const char *dict = NULL, *enddict = NULL, *start = NULL;
1415
+    off_t dict_length = 0, full_dict_length = 0, objsize = 0, bytesleft = 0;
1416
+    size_t i = 0;
1417
+    unsigned filters = 0, blockopens = 0;
1420 1418
     enum objstate objstate = STATE_NONE;
1421 1419
 #if HAVE_JSON
1422 1420
     json_object *pdfobj=NULL, *jsonobj=NULL;
1423 1421
 #endif
1424 1422
 
1423
+    q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1424
+                      : (const char *)(obj->start + pdf->map);
1425
+
1426
+    objsize = obj_size(pdf, obj, 1);
1427
+
1425 1428
     if (objsize < 0)
1426 1429
         return;
1427 1430
 
... ...
@@ -1434,7 +2034,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1434 1434
         bytesleft -= nextobj -q;
1435 1435
 
1436 1436
         if (!nextobj || bytesleft < 0) {
1437
-            cli_dbgmsg("cli_pdf: %u %u obj: no dictionary\n", obj->id>>8, obj->id&0xff);
1437
+            cli_dbgmsg("pdf_parseobj: %u %u obj: no dictionary\n", obj->id>>8, obj->id&0xff);
1438 1438
 #if HAVE_JSON
1439 1439
             if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
1440 1440
                 pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
... ...
@@ -1465,7 +2065,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1465 1465
 
1466 1466
     /* find end of dictionary block */
1467 1467
     if (bytesleft < 0) {
1468
-        cli_dbgmsg("cli_pdf: %u %u obj: broken dictionary\n", obj->id>>8, obj->id&0xff);
1468
+        cli_dbgmsg("pdf_parseobj: %u %u obj: broken dictionary\n", obj->id>>8, obj->id&0xff);
1469 1469
 #if HAVE_JSON
1470 1470
         if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
1471 1471
             pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
... ...
@@ -1517,7 +2117,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1517 1517
     /* Was end of dictionary found? */
1518 1518
     if (blockopens) {
1519 1519
         /* probably truncated */
1520
-        cli_dbgmsg("cli_pdf: %u %u obj broken dictionary\n", obj->id>>8, obj->id&0xff);
1520
+        cli_dbgmsg("pdf_parseobj: %u %u obj broken dictionary\n", obj->id>>8, obj->id&0xff);
1521 1521
 #if HAVE_JSON
1522 1522
         if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
1523 1523
             pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
... ...
@@ -1552,7 +2152,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1552 1552
                     dictionary[i] = '*';
1553 1553
             }
1554 1554
             dictionary[dict_length] = '\0';
1555
-            cli_dbgmsg("cli_pdf: dictionary is <<%s>>\n", dictionary);
1555
+            cli_dbgmsg("pdf_parseobj: dictionary is <<%s>>\n", dictionary);
1556 1556
             free(dictionary);
1557 1557
         }
1558 1558
     }
... ...
@@ -1617,10 +2217,10 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1617 1617
                     trailer = 0;
1618 1618
 
1619 1619
                 q2 = pdf->map + trailer;
1620
-                cli_dbgmsg("cli_pdf: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end);
1620
+                cli_dbgmsg("pdf_parseobj: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end);
1621 1621
                 pdf_parse_trailer(pdf, q2, trailer_end - trailer);
1622 1622
                 if (pdf->fileID)
1623
-                    cli_dbgmsg("cli_pdf: found fileID\n");
1623
+                    cli_dbgmsg("pdf_parseobj: found fileID\n");
1624 1624
             }
1625 1625
         }
1626 1626
 
... ...
@@ -1641,7 +2241,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1641 1641
                 dict_remaining -= (off_t)(q2 - q);
1642 1642
 
1643 1643
                 if (CL_SUCCESS != cli_strntoul_wrap(q2, (size_t)dict_remaining, 0, 10, &objid)) {
1644
-                    cli_dbgmsg("cli_pdf: failed to parse object objid\n");
1644
+                    cli_dbgmsg("pdf_parseobj: failed to parse object objid\n");
1645 1645
                     return;
1646 1646
                 }
1647 1647
                 objid = objid << 8;
... ...
@@ -1654,7 +2254,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1654 1654
                 if (q2 && isdigit(*q2)) {
1655 1655
                     dict_remaining -= (off_t)(q2 - q2_old);
1656 1656
                     if (CL_SUCCESS != cli_strntoul_wrap(q2, (size_t)dict_remaining, 0, 10, &genid)) {
1657
-                        cli_dbgmsg("cli_pdf: failed to parse object genid\n");
1657
+                        cli_dbgmsg("pdf_parseobj: failed to parse object genid\n");
1658 1658
                         return;
1659 1659
                     }
1660 1660
                     objid |= genid & 0xff;
... ...
@@ -1663,7 +2263,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1663 1663
                     if (q2 && *q2 == 'R') {
1664 1664
                         struct pdf_obj *obj2;
1665 1665
 
1666
-                        cli_dbgmsg("cli_pdf: found %s stored in indirect object %lu %lu\n", pdfname, objid >> 8, objid&0xff);
1666
+                        cli_dbgmsg("pdf_parseobj: found %s stored in indirect object %lu %lu\n", pdfname, objid >> 8, objid&0xff);
1667 1667
                         obj2 = find_obj(pdf, obj, objid);
1668 1668
                         if (obj2) {
1669 1669
                             enum pdf_objflags flag =
... ...
@@ -1707,7 +2307,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
1707 1707
     if (obj->flags & (1 << OBJ_FILTER_UNKNOWN))
1708 1708
         pdfobj_flag(pdf, obj, UNKNOWN_FILTER);
1709 1709
 
1710
-    cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
1710
+    cli_dbgmsg("pdf_parseobj: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
1711 1711
 }
1712 1712
 
1713 1713
 /**
... ...
@@ -1725,7 +2325,7 @@ static const char *pdf_getdict(const char *q0, int* len, const char *key)
1725 1725
     const char *q;
1726 1726
 
1727 1727
     if (*len <= 0) {
1728
-        cli_dbgmsg("cli_pdf: bad length %d\n", *len);
1728
+        cli_dbgmsg("pdf_getdict: bad length %d\n", *len);
1729 1729
         return NULL;
1730 1730
     }
1731 1731
 
... ...
@@ -1735,7 +2335,7 @@ static const char *pdf_getdict(const char *q0, int* len, const char *key)
1735 1735
     /* find the key */
1736 1736
     q = cli_memstr(q0, *len, key, strlen(key));
1737 1737
     if (!q) {
1738
-        cli_dbgmsg("cli_pdf: %s not found in dict\n", key);
1738
+        cli_dbgmsg("pdf_getdict: %s not found in dict\n", key);
1739 1739
         return NULL;
1740 1740
     }
1741 1741
 
... ...
@@ -1745,7 +2345,7 @@ static const char *pdf_getdict(const char *q0, int* len, const char *key)
1745 1745
     /* find the start of the value object */
1746 1746
     q = pdf_nextobject(q0 + 1, *len - 1);
1747 1747
     if (!q) {
1748
-        cli_dbgmsg("cli_pdf: %s is invalid in dict\n", key);
1748
+        cli_dbgmsg("pdf_getdict: %s is invalid in dict\n", key);
1749 1749
         return NULL;
1750 1750
     }
1751 1751
 
... ...
@@ -1891,12 +2491,12 @@ static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *
1891 1891
 
1892 1892
         s = cli_malloc((q - start)/2 + 1);
1893 1893
         if (s == NULL) { /* oops, couldn't allocate memory */
1894
-          cli_dbgmsg("cli_pdf: unable to allocate memory...\n");
1894
+          cli_dbgmsg("pdf_readstring: unable to allocate memory...\n");
1895 1895
           return NULL;
1896 1896
         }
1897 1897
 
1898 1898
         if (cli_hex2str_to(start, s, q - start)) {
1899
-            cli_dbgmsg("cli_pdf: %s has bad hex value\n", key);
1899
+            cli_dbgmsg("pdf_readstring: %s has bad hex value\n", key);
1900 1900
             free(s);
1901 1901
             return NULL;
1902 1902
         }
... ...
@@ -1908,7 +2508,7 @@ static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *
1908 1908
         return s;
1909 1909
     }
1910 1910
 
1911
-    cli_dbgmsg("cli_pdf: %s is invalid string in dict\n", key);
1911
+    cli_dbgmsg("pdf_readstring: %s is invalid string in dict\n", key);
1912 1912
     return NULL;
1913 1913
 }
1914 1914
 
... ...
@@ -1982,7 +2582,7 @@ static int pdf_readbool(const char *q0, int len, const char *key, int Default)
1982 1982
     if (!strncmp(q, "false", 5))
1983 1983
         return 0;
1984 1984
 
1985
-    cli_dbgmsg("cli_pdf: invalid value for %s bool\n", key);
1985
+    cli_dbgmsg("pdf_readbool: invalid value for %s bool\n", key);
1986 1986
 
1987 1987
     return Default;
1988 1988
 }
... ...
@@ -2032,8 +2632,8 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
2032 2032
             cl_sha256(U+40, 8, result2, NULL);
2033 2033
             UE_len = UE ? strlen(UE) : 0;
2034 2034
             if (UE_len != 32) {
2035
-                cli_dbgmsg("cli_pdf: UE length is not 32: %zu\n", UE_len);
2036
-                noisy_warnmsg("cli_pdf: UE length is not 32: %zu\n", UE_len);
2035
+                cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
2036
+                noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
2037 2037
             } else {
2038 2038
                 pdf->keylen = 32;
2039 2039
                 pdf->key = cli_malloc(32);
... ...
@@ -2043,7 +2643,7 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
2043 2043
                 }
2044 2044
 
2045 2045
                 aes_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)result2, 32, 0);
2046
-                dbg_printhex("cli_pdf: Candidate encryption key", pdf->key, pdf->keylen);
2046
+                dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen);
2047 2047
             }
2048 2048
         }
2049 2049
     } else if ((R >= 2) && (R <= 4)) {
... ...
@@ -2129,27 +2729,27 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
2129 2129
                 password_empty = 1;
2130 2130
             free(d);
2131 2131
         } else {
2132
-            cli_dbgmsg("cli_pdf: invalid revision %d\n", R);
2133
-            noisy_warnmsg("cli_pdf: invalid revision %d\n", R);
2132
+            cli_dbgmsg("check_user_password: invalid revision %d\n", R);
2133
+            noisy_warnmsg("check_user_password: invalid revision %d\n", R);
2134 2134
         }
2135 2135
     } else {
2136 2136
         /* Supported R is in {2,3,4,5} */
2137
-        cli_dbgmsg("cli_pdf: R value out of range\n");
2138
-        noisy_warnmsg("cli_pdf: R value out of range\n");
2137
+        cli_dbgmsg("check_user_password: R value out of range\n");
2138
+        noisy_warnmsg("check_user_password: R value out of range\n");
2139 2139
 
2140 2140
         return;
2141 2141
     }
2142 2142
 
2143 2143
     if (password_empty) {
2144
-        cli_dbgmsg("cli_pdf: user password is empty\n");
2145
-        noisy_msg(pdf, "cli_pdf: encrypted PDF found, user password is empty, will attempt to decrypt\n");
2144
+        cli_dbgmsg("check_user_password: user password is empty\n");
2145
+        noisy_msg(pdf, "check_user_password: encrypted PDF found, user password is empty, will attempt to decrypt\n");
2146 2146
         /* The key we computed above is the key used to encrypt the streams.
2147 2147
          * We could decrypt it now if we wanted to */
2148 2148
         pdf->flags |= 1 << DECRYPTABLE_PDF;
2149 2149
     } else {
2150 2150
         /* the key is not valid, we would need the user or the owner password to decrypt */
2151
-        cli_dbgmsg("cli_pdf: user/owner password would be required for decryption\n");
2152
-        noisy_warnmsg("cli_pdf: encrypted PDF found, user password is NOT empty, cannot decrypt!\n");
2151
+        cli_dbgmsg("check_user_password: user/owner password would be required for decryption\n");
2152
+        noisy_warnmsg("check_user_password: encrypted PDF found, user password is NOT empty, cannot decrypt!\n");
2153 2153
     }
2154 2154
 }
2155 2155
 
... ...
@@ -2171,7 +2771,7 @@ enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key
2171 2171
 
2172 2172
     CFM = pdf_readval(q, len, "/CFM");
2173 2173
     if (CFM) {
2174
-        cli_dbgmsg("cli_pdf: %s CFM: %s\n", key, CFM);
2174
+        cli_dbgmsg("parse_enc_method: %s CFM: %s\n", key, CFM);
2175 2175
         if (!strncmp(CFM,"V2", 2))
2176 2176
             ret = ENC_V2;
2177 2177
         else if (!strncmp(CFM,"AESV2",5))
... ...
@@ -2197,15 +2797,15 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2197 2197
     if (pdf->enc_objid == ~0u)
2198 2198
         return;
2199 2199
     if (!pdf->fileID) {
2200
-        cli_dbgmsg("cli_pdf: pdf_handle_enc no file ID\n");
2201
-        noisy_warnmsg("cli_pdf: pdf_handle_enc no file ID\n");
2200
+        cli_dbgmsg("pdf_handle_enc: no file ID\n");
2201
+        noisy_warnmsg("pdf_handle_enc: no file ID\n");
2202 2202
         return;
2203 2203
     }
2204 2204
 
2205
-    obj = find_obj(pdf, pdf->objs, pdf->enc_objid);
2205
+    obj = find_obj(pdf, pdf->objs[0], pdf->enc_objid);
2206 2206
     if (!obj) {
2207
-        cli_dbgmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
2208
-        noisy_warnmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
2207
+        cli_dbgmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
2208
+        noisy_warnmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
2209 2209
         return;
2210 2210
     }
2211 2211
 
... ...
@@ -2220,15 +2820,15 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2220 2220
         pdf->enc_method_embeddedfile = ENC_UNKNOWN;
2221 2221
         P = pdf_readint(q, len, "/P");
2222 2222
         if (P == ~0u) {
2223
-            cli_dbgmsg("cli_pdf: invalid P\n");
2224
-            noisy_warnmsg("cli_pdf: invalid P\n");
2223
+            cli_dbgmsg("pdf_handle_enc: invalid P\n");
2224
+            noisy_warnmsg("pdf_handle_enc: invalid P\n");
2225 2225
             break;
2226 2226
         }
2227 2227
 
2228 2228
         q2 = cli_memstr(q, len, "/Standard", 9);
2229 2229
         if (!q2) {
2230
-            cli_dbgmsg("cli_pdf: /Standard not found\n");
2231
-            noisy_warnmsg("cli_pdf: /Standard not found\n");
2230
+            cli_dbgmsg("pdf_handle_enc: /Standard not found\n");
2231
+            noisy_warnmsg("pdf_handle_enc: /Standard not found\n");
2232 2232
             break;
2233 2233
         }
2234 2234
 
... ...
@@ -2241,20 +2841,20 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2241 2241
             length = pdf_readint(q, len, "/Length");
2242 2242
 
2243 2243
         if (length < 40) {
2244
-            cli_dbgmsg("cli_pdf: invalid length: %d\n", length);
2244
+            cli_dbgmsg("pdf_handle_enc: invalid length: %d\n", length);
2245 2245
             length = 40;
2246 2246
         }
2247 2247
 
2248 2248
         R = pdf_readint(q, len, "/R");
2249 2249
         if (R == ~0u) {
2250
-            cli_dbgmsg("cli_pdf: invalid R\n");
2251
-            noisy_warnmsg("cli_pdf: invalid R\n");
2250
+            cli_dbgmsg("pdf_handle_enc: invalid R\n");
2251
+            noisy_warnmsg("pdf_handle_enc: invalid R\n");
2252 2252
             break;
2253 2253
         }
2254 2254
 
2255 2255
         if ((R > 5) || (R < 2)) {
2256
-            cli_dbgmsg("cli_pdf: R value outside supported range [2..5]\n");
2257
-            noisy_warnmsg("cli_pdf: R value outside supported range [2..5]\n");
2256
+            cli_dbgmsg("pdf_handle_enc: R value outside supported range [2..5]\n");
2257
+            noisy_warnmsg("pdf_handle_enc: R value outside supported range [2..5]\n");
2258 2258
             break;
2259 2259
         }
2260 2260
 
... ...
@@ -2277,11 +2877,11 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2277 2277
             pdf->CF_n = n;
2278 2278
 
2279 2279
             if (StmF)
2280
-                cli_dbgmsg("cli_pdf: StmF: %s\n", StmF);
2280
+                cli_dbgmsg("pdf_handle_enc: StmF: %s\n", StmF);
2281 2281
             if (StrF)
2282
-                cli_dbgmsg("cli_pdf: StrF: %s\n", StrF);
2282
+                cli_dbgmsg("pdf_handle_enc: StrF: %s\n", StrF);
2283 2283
             if (EFF)
2284
-                cli_dbgmsg("cli_pdf: EFF: %s\n", EFF);
2284
+                cli_dbgmsg("pdf_handle_enc: EFF: %s\n", EFF);
2285 2285
 
2286 2286
             pdf->enc_method_stream = parse_enc_method(pdf->CF, n, StmF, ENC_IDENTITY);
2287 2287
             pdf->enc_method_string = parse_enc_method(pdf->CF, n, StrF, ENC_IDENTITY);
... ...
@@ -2291,7 +2891,7 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2291 2291
             free(StrF);
2292 2292
             free(EFF);
2293 2293
 
2294
-            cli_dbgmsg("cli_pdf: EncryptMetadata: %s\n", EM ? "true" : "false");
2294
+            cli_dbgmsg("pdf_handle_enc: EncryptMetadata: %s\n", EM ? "true" : "false");
2295 2295
 
2296 2296
             if (R == 4) {
2297 2297
                 length = 128;
... ...
@@ -2308,8 +2908,8 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2308 2308
         n = 0;
2309 2309
         O = pdf_readstring(q, len, "/O", &n, NULL, 0);
2310 2310
         if (!O || n < oulen) {
2311
-            cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
2312
-            cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
2311
+            cli_dbgmsg("pdf_handle_enc: invalid O: %d\n", n);
2312
+            cli_dbgmsg("pdf_handle_enc: invalid O: %d\n", n);
2313 2313
             if (O)
2314 2314
                 dbg_printhex("invalid O", O, n);
2315 2315
 
... ...
@@ -2321,8 +2921,8 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2321 2321
                     break;
2322 2322
 
2323 2323
             if (i != n) {
2324
-                dbg_printhex("too long O", O, n);
2325
-                noisy_warnmsg("too long O: %u", n);
2324
+                dbg_printhex("pdf_handle_enc: too long O", O, n);
2325
+                noisy_warnmsg("pdf_handle_enc: too long O: %u", n);
2326 2326
                 break;
2327 2327
             }
2328 2328
         }
... ...
@@ -2330,8 +2930,8 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2330 2330
         n = 0;
2331 2331
         U = pdf_readstring(q, len, "/U", &n, NULL, 0);
2332 2332
         if (!U || n < oulen) {
2333
-            cli_dbgmsg("cli_pdf: invalid U: %u\n", n);
2334
-            noisy_warnmsg("cli_pdf: invalid U: %u\n", n);
2333
+            cli_dbgmsg("pdf_handle_enc: invalid U: %u\n", n);
2334
+            noisy_warnmsg("pdf_handle_enc: invalid U: %u\n", n);
2335 2335
 
2336 2336
             if (U)
2337 2337
                 dbg_printhex("invalid U", U, n);
... ...
@@ -2349,10 +2949,10 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2349 2349
             }
2350 2350
         }
2351 2351
 
2352
-        cli_dbgmsg("cli_pdf: Encrypt R: %d, P %x, length: %u\n", R, P, length);
2352
+        cli_dbgmsg("pdf_handle_enc: Encrypt R: %d, P %x, length: %u\n", R, P, length);
2353 2353
         if (length % 8) {
2354
-            cli_dbgmsg("cli_pdf: wrong key length, not multiple of 8\n");
2355
-            noisy_warnmsg("cli_pdf: wrong key length, not multiple of 8\n");
2354
+            cli_dbgmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
2355
+            noisy_warnmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
2356 2356
             break;
2357 2357
         }
2358 2358
         check_user_password(pdf, R, O, U, P, EM, UE, length, oulen);
... ...
@@ -2363,8 +2963,216 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2363 2363
     free(UE);
2364 2364
 }
2365 2365
 
2366
+/**
2367
+ * @brief Search an object stream (objstm->streambuf) for objects.  Parse each.
2368
+ * 
2369
+ * Newly found objects will be extracted after completion when the extraction for loop continues.
2370
+ * 
2371
+ * @param pdf           Pdf struct that keeps track of all information found in the PDF. 
2372
+ * @param objstm        Pointer to an object stream to parse; its first, n, streambuf and streambuf_len fields are read here.
2373
+ * 
2374
+ * @return cl_error_t   CL_SUCCESS if all objstm->n objects were found and handed to pdf_parseobj(), CL_ETIMEOUT if the scan time limit was hit, CL_EFORMAT if the stream is empty, has an out-of-range first offset, or holds fewer objects than expected.
2375
+ */
2376
+cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm)
2377
+{
2378
+    cl_error_t status = CL_EFORMAT;
2379
+    cl_error_t retval = CL_EPARSE;
2380
+    int32_t foundobj = 0, alerts = 0; /* NOTE(review): neither is modified in this function, so the CL_VIRUS branch below never runs */
2381
+    uint32_t badobjects = 0;
2382
+    size_t i = 0;
2383
+
2384
+    struct pdf_obj* obj = NULL;
2385
+
2386
+    char* current_pair = objstm->streambuf; /* NOTE(review): assigned but never read below */
2387
+    char* current_obj = objstm->streambuf + objstm->first; /* NOTE(review): assigned but never read below */
2388
+
2389
+    if ((0 == objstm->first) || 
2390
+        (0 == objstm->streambuf_len) || 
2391
+        (0 == objstm->n))
2392
+    {
2393
+        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n");
2394
+        goto done;
2395
+    }
2396
+
2397
+    if (objstm->first >= objstm->streambuf_len)
2398
+    {
2399
+        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Invalid objstm values. Offset of first obj greater than stream length.\n");
2400
+        goto done;
2401
+    }
2402
+
2403
+    /* Process each object: at most objstm->n iterations, stopping early on the first lookup failure */
2404
+    for (i = 0; i < objstm->n; i++)
2405
+    {
2406
+        obj = NULL;
2407
+
2408
+        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
2409
+            cli_errmsg("Timeout reached in the PDF parser while parsing object stream.\n");
2410
+            status = CL_ETIMEOUT;
2411
+            goto done;
2412
+        }
2413
+
2414
+        /* Find object; on success obj is set to the newly found object */
2415
+        retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
2416
+        
2417
+        if (retval != CL_SUCCESS)
2418
+        {
2419
+            cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
2420
+                objstm->nobjs_found, objstm->n);
2421
+            badobjects++;
2422
+            pdf->stats.ninvalidobjs++;
2423
+            break;
2424
+        }
2425
+
2426
+        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Found object %u %u in object stream at offset: %u\n", obj->id >> 8, obj->id & 0xff, obj->start);
2427
+
2428
+        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
2429
+            cli_errmsg("Timeout reached in the PDF parser while parsing object stream.\n");
2430
+            status = CL_ETIMEOUT;
2431
+            goto done;
2432
+        }
2433
+
2434
+        /* Parse object */
2435
+        pdf_parseobj(pdf, obj);
2436
+    }
2437
+
2438
+    if (alerts) {
2439
+        status = CL_VIRUS;
2440
+        goto done;
2441
+    }
2442
+    else if (badobjects) {
2443
+        status = CL_EFORMAT;
2444
+        goto done;
2445
+    }
2446
+    
2447
+    status = CL_SUCCESS;
2448
+
2449
+done:
2450
+    return status;
2451
+}
2452
+
2453
+/**
2454
+ * @brief Search pdf buffer for objects.  Parse each and then extract each.
2455
+ * 
2456
+ * @param pdf               Pdf struct that keeps track of all information found in the PDF.
2457
+ * @param alerts[in/out]    Caller-owned alert counter, incremented once per CL_VIRUS result; relevant in ALLMATCH mode, where the status is reset and scanning continues. Must not be NULL.
2458
+ * 
2459
+ * @return cl_error_t   CL_SUCCESS when nothing was flagged, CL_ETIMEOUT on scan timeout, CL_EFORMAT if objects failed to extract and no other error occurred, otherwise the first non-zero status from cli_append_virus(), run_pdf_hooks() or pdf_extract_obj().
2460
+ */
2461
+cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts)
2462
+{
2463
+    cl_error_t status = CL_SUCCESS;
2464
+    int32_t rv = 0;
2465
+    int foundobj = 0;
2466
+    unsigned int i = 0, j = 0;
2467
+    uint32_t badobjects = 0;
2468
+
2469
+    /* parse PDF and find obj offsets */
2470
+    while (CL_BREAK != (rv = pdf_findobj(pdf))) {
2471
+        if (rv == CL_EMEM) {
2472
+            break;
2473
+        }
2474
+    }
2475
+
2476
+    if (rv == -1) /* NOTE(review): the loop above only exits with CL_BREAK or CL_EMEM; confirm this can still fire */
2477
+        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;
2478
+
2479
+    /* must parse after finding all objs, so we can flag indirect objects */
2480
+    for (i=0; i < pdf->nobjs; i++) {
2481
+        struct pdf_obj *obj = pdf->objs[i];
2482
+
2483
+        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
2484
+            cli_errmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while parsing objects.\n");
2485
+
2486
+            status = CL_ETIMEOUT;
2487
+            goto done;
2488
+        }
2489
+
2490
+        pdf_parseobj(pdf, obj);
2491
+    }
2492
+
2493
+    pdf_handle_enc(pdf);
2494
+    if (pdf->flags & (1 << ENCRYPTED_PDF))
2495
+        cli_dbgmsg("pdf_find_and_extract_objs: encrypted pdf found, %s!\n",
2496
+               (pdf->flags & (1 << DECRYPTABLE_PDF)) ?
2497
+               "decryptable" : "not decryptable, stream will probably fail to decompress");
2498
+
2499
+    if ((pdf->ctx->options & CL_SCAN_BLOCKENCRYPTED) &&
2500
+       (pdf->flags & (1 << ENCRYPTED_PDF)) &&
2501
+       !(pdf->flags & (1 << DECRYPTABLE_PDF)))
2502
+    {
2503
+        /* It is encrypted, and a password/key needs to be supplied to decrypt.
2504
+         * This doesn't trigger for PDFs that are encrypted but don't need
2505
+         * a password to decrypt */
2506
+        status = cli_append_virus(pdf->ctx, "Heuristics.Encrypted.PDF");
2507
+        if (status == CL_VIRUS) { 
2508
+            (*alerts)++; /* count in the caller's counter; alerts++ would only move the local pointer */
2509
+            if (pdf->ctx->options & CL_SCAN_ALLMATCHES)
2510
+                status = CL_CLEAN;
2511
+        }
2512
+    }
2513
+
2514
+    if (!status) {
2515
+        status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1, -1);
2516
+        cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status);
2517
+        if (status == CL_VIRUS) {
2518
+            (*alerts)++; /* count in the caller's counter; alerts++ would only move the local pointer */
2519
+            if (pdf->ctx->options & CL_SCAN_ALLMATCHES) {
2520
+                status = CL_CLEAN;
2521
+            }
2522
+        }
2523
+    }
2524
+
2525
+    /* extract PDF objs; stops as soon as status becomes non-zero */
2526
+    for (i=0; !status && i < pdf->nobjs; i++) {
2527
+        struct pdf_obj *obj = pdf->objs[i];
2528
+
2529
+        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
2530
+            cli_errmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while extracting objects.\n");
2531
+
2532
+            status = CL_ETIMEOUT;
2533
+            goto done;
2534
+        }
2535
+
2536
+        status = pdf_extract_obj(pdf, obj, PDF_EXTRACT_OBJ_SCAN);
2537
+        switch (status) {
2538
+            case CL_EFORMAT:
2539
+                /* Don't halt on one bad object */
2540
+                cli_dbgmsg("pdf_find_and_extract_objs: Format error when extracting object, skipping to the next object.\n");
2541
+                badobjects++;
2542
+                pdf->stats.ninvalidobjs++;
2543
+                status = CL_CLEAN;
2544
+                break;
2545
+            case CL_VIRUS:
2546
+                (*alerts)++; /* count in the caller's counter; alerts++ would only move the local pointer */
2547
+                if (pdf->ctx->options & CL_SCAN_ALLMATCHES) {
2548
+                    status = CL_CLEAN;
2549
+                }
2550
+                break;
2551
+            default:
2552
+                break;
2553
+        }
2554
+    }
2555
+
2556
+done:
2557
+    /* Report skipped bad objects only when nothing more serious was returned */
2558
+    if (!status && badobjects) {
2559
+        status = CL_EFORMAT;
2560
+    }
2561
+
2562
+    return status;
2563
+}
2563
+
2564
+/**
2565
+ * @brief Primary function for parsing and scanning a PDF.
2566
+ * 
2567
+ * @param dir       Filepath for temp file.
2568
+ * @param ctx       clam scan context structure. 
2569
+ * @param offset    offset of pdf in ctx->fmap
2570
+ * 
2571
+ * @return int      Returns cl_error_t status value.
2572
+ */
2366 2573
 int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2367 2574
 {
2575
+    cl_error_t rc = CL_SUCCESS;
2368 2576
     struct pdf_struct pdf;
2369 2577
     fmap_t *map = *ctx->fmap;
2370 2578
     size_t size = map->len - offset;
... ...
@@ -2372,8 +3180,8 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2372 2372
     off_t map_off, bytesleft;
2373 2373
     unsigned long xref;
2374 2374
     const char *pdfver, *tmp, *start, *eofmap, *q, *eof;
2375
-    int rc, badobjects = 0;
2376 2375
     unsigned i, alerts = 0;
2376
+    unsigned int objs_found = 0;
2377 2377
 #if HAVE_JSON
2378 2378
     json_object *pdfobj=NULL;
2379 2379
     char *begin, *end, *p1;
... ...
@@ -2390,7 +3198,8 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2390 2390
     /* Check PDF version */
2391 2391
     if (!pdfver) {
2392 2392
         cli_errmsg("cli_pdf: mmap() failed (1)\n");
2393
-        return CL_EMAP;
2393
+        rc = CL_EMAP;
2394
+        goto done;
2394 2395
     }
2395 2396
 
2396 2397
 #if HAVE_JSON
... ...
@@ -2406,14 +3215,16 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2406 2406
 #if HAVE_JSON
2407 2407
         pdf_export_json(&pdf);
2408 2408
 #endif
2409
-        return CL_SUCCESS;
2409
+        rc = CL_SUCCESS;
2410
+        goto done;
2410 2411
     }
2411 2412
 
2412 2413
     versize -= tmp - pdfver;
2413 2414
     pdfver = tmp;
2414 2415
 
2415 2416
     if (versize < 8) {
2416
-        return CL_EFORMAT;
2417
+        rc = CL_EFORMAT;
2418
+        goto done;
2417 2419
     }
2418 2420
 
2419 2421
     /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */
... ...
@@ -2463,10 +3274,9 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2463 2463
     eofmap = fmap_need_off_once(map, map_off, bytesleft);
2464 2464
     if (!eofmap) {
2465 2465
         cli_errmsg("cli_pdf: mmap() failed (2)\n");
2466
-#if HAVE_JSON
2467
-        pdf_export_json(&pdf);
2468
-#endif
2469
-        return CL_EMAP;
2466
+
2467
+        rc = CL_EMAP;
2468
+        goto done;
2470 2469
     }
2471 2470
 
2472 2471
     eof = eofmap + bytesleft;
... ...
@@ -2533,10 +3343,9 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2533 2533
     pdf.map = fmap_need_off(map, offset, size);
2534 2534
     if (!pdf.map) {
2535 2535
         cli_errmsg("cli_pdf: mmap() failed (3)\n");
2536
-#if HAVE_JSON
2537
-        pdf_export_json(&pdf);
2538
-#endif
2539
-        return CL_EMAP;
2536
+
2537
+        rc = CL_EMAP;
2538
+        goto done;
2540 2539
     }
2541 2540
 
2542 2541
     pdf.startoff = offset;
... ...
@@ -2548,127 +3357,28 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2548 2548
         rc = CL_CLEAN;
2549 2549
     } else if (rc) {
2550 2550
         cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc);
2551
-#if HAVE_JSON
2552
-        pdf_export_json(&pdf);
2553
-#endif
2554
-        return rc == CL_BREAK ? CL_CLEAN : rc;
2555
-    }
2556
-
2557
-    /* parse PDF and find obj offsets */
2558
-    while ((rc = pdf_findobj(&pdf)) > 0) {
2559
-        if (rc == 1) {
2560
-            struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
2561
-
2562
-            cli_dbgmsg("cli_pdf: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + offset));
2563
-        }
2564
-        else if (rc == 2) {
2565
-            pdf.nobjs--;
2566
-            cli_dbgmsg("cli_pdf: Failed to parse object, likely an oversight in parser design.\n");
2567
-        }
2568
-        else {
2569
-            pdf.nobjs--;
2570
-            cli_dbgmsg("cli_pdf: unexpected return code %d.\n", rc);
2571
-        }
2572
-    }
2573
-
2574
-    if (pdf.nobjs)
2575
-        pdf.nobjs--;
2576
-
2577
-    if (rc == -1)
2578
-        pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
2579
-
2580
-    /* must parse after finding all objs, so we can flag indirect objects */
2581
-    for (i=0;i<pdf.nobjs;i++) {
2582
-        struct pdf_obj *obj = &pdf.objs[i];
2583 2551
 
2584
-        if (cli_checktimelimit(ctx) != CL_SUCCESS) {
2585
-            cli_errmsg("Timeout reached in the PDF parser\n");
2586
-#if HAVE_JSON
2587
-            pdf_export_json(&pdf);
2588
-#endif
2589
-            free(pdf.objs);
2590
-            if (pdf.fileID)
2591
-                free(pdf.fileID);
2592
-            if (pdf.key)
2593
-                free(pdf.key);
2594
-            return CL_ETIMEOUT;
2595
-        }
2596
-
2597
-        pdf_parseobj(&pdf, obj);
2598
-    }
2599
-
2600
-    pdf_handle_enc(&pdf);
2601
-    if (pdf.flags & (1 << ENCRYPTED_PDF))
2602
-        cli_dbgmsg("cli_pdf: encrypted pdf found, %s!\n",
2603
-               (pdf.flags & (1 << DECRYPTABLE_PDF)) ?
2604
-               "decryptable" : "not decryptable, stream will probably fail to decompress");
2605
-
2606
-    if (DETECT_ENCRYPTED &&
2607
-       (pdf.flags & (1 << ENCRYPTED_PDF)) &&
2608
-       !(pdf.flags & (1 << DECRYPTABLE_PDF))) {
2609
-        /* It is encrypted, and a password/key needs to be supplied to decrypt.
2610
-         * This doesn't trigger for PDFs that are encrypted but don't need
2611
-         * a password to decrypt */
2612
-        rc = cli_append_virus(ctx, "Heuristics.Encrypted.PDF");
2613
-        if (rc == CL_VIRUS) { 
2614
-            alerts++;
2615
-            if (SCAN_ALL)
2616
-                rc = CL_CLEAN;
2617
-        }
2618
-    }
2619
-
2620
-    if (!rc) {
2621
-        rc = run_pdf_hooks(&pdf, PDF_PHASE_PARSED, -1, -1);
2622
-        cli_dbgmsg("cli_pdf: (parsed hooks) returned %d\n", rc);
2623
-        if (rc == CL_VIRUS) {
2624
-            alerts++;
2625
-            if (SCAN_ALL) {
2626
-                rc = CL_CLEAN;
2627
-            }
2628
-        }
2552
+        rc = rc == CL_BREAK ? CL_CLEAN : rc;
2553
+        goto done;
2629 2554
     }
2630 2555
 
2631
-    /* extract PDF objs */
2632
-    for (i=0;!rc && i<pdf.nobjs;i++) {
2633
-        struct pdf_obj *obj = &pdf.objs[i];
2634
-
2635
-        if (cli_checktimelimit(ctx) != CL_SUCCESS) {
2636
-            cli_errmsg("Timeout reached in the PDF parser\n");
2637
-#if HAVE_JSON
2638
-            pdf_export_json(&pdf);
2639
-#endif
2640
-            free(pdf.objs);
2641
-            if (pdf.fileID)
2642
-                free(pdf.fileID);
2643
-            if (pdf.key)
2644
-                free(pdf.key);
2645
-            return CL_ETIMEOUT;
2646
-        }
2556
+    /*
2557
+     * Find and extract all objects in the PDF. 
2558
+     * New experimental recursive methodology that adds objects from object streams.
2559
+     */
2560
+    objs_found = pdf.nobjs;
2561
+    rc = pdf_find_and_extract_objs(&pdf, &alerts);
2647 2562
 
2648
-        rc = pdf_extract_obj(&pdf, obj, PDF_EXTRACT_OBJ_SCAN);
2649
-        switch (rc) {
2650
-            case CL_EFORMAT:
2651
-                /* Don't halt on one bad object */
2652
-                cli_dbgmsg("cli_pdf: bad format object, skipping to next\n");
2653
-                badobjects++;
2654
-                pdf.stats.ninvalidobjs++;
2655
-                rc = CL_CLEAN;
2656
-                break;
2657
-            case CL_VIRUS:
2658
-                alerts++;
2659
-                if (SCAN_ALL) {
2660
-                    rc = CL_CLEAN;
2661
-                }
2662
-                break;
2663
-            default:
2664
-                break;
2665
-        }
2563
+    if (pdf.nobjs <= objs_found) {
2564
+        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs did not find any new objects!\n");
2565
+    } else {
2566
+        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs found %d new objects.\n", pdf.nobjs - objs_found);
2666 2567
     }
2667 2568
 
2668 2569
     if (pdf.flags & (1 << ENCRYPTED_PDF))
2669 2570
         pdf.flags &= ~ ((1 << BAD_FLATESTART) | (1 << BAD_STREAMSTART) | (1 << BAD_ASCIIDECODE));
2670 2571
 
2671
-   if (pdf.flags && !rc) {
2572
+    if (pdf.flags && !rc) {
2672 2573
         cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
2673 2574
         rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1, -1);
2674 2575
         if (rc == CL_VIRUS) {
... ...
@@ -2699,11 +3409,11 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2699 2699
 #endif
2700 2700
     }
2701 2701
 
2702
+done:
2702 2703
     if (alerts) {
2703 2704
         rc = CL_VIRUS;
2704 2705
     }
2705
-
2706
-    else if (!rc && badobjects) {
2706
+    else if (!rc && pdf.stats.ninvalidobjs > 0) {
2707 2707
         rc = CL_EFORMAT;
2708 2708
     }
2709 2709
 
... ...
@@ -2711,17 +3421,54 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2711 2711
     pdf_export_json(&pdf);
2712 2712
 #endif
2713 2713
 
2714
-    cli_dbgmsg("cli_pdf: returning %d\n", rc);
2715
-    free(pdf.objs);
2716
-    free(pdf.fileID);
2717
-    free(pdf.key);
2714
+    if (pdf.objstms) {
2715
+        for (i = 0; i < pdf.nobjstms; i++) {
2716
+            if (pdf.objstms[i]) {
2717
+                if (pdf.objstms[i]->streambuf) {
2718
+                    free(pdf.objstms[i]->streambuf);
2719
+                    pdf.objstms[i]->streambuf = NULL;
2720
+                }
2721
+                free(pdf.objstms[i]);
2722
+                pdf.objstms[i] = NULL;
2723
+            }
2724
+        }
2725
+        free(pdf.objstms);
2726
+        pdf.objstms = NULL;
2727
+    }
2728
+
2729
+    if (NULL != pdf.objs) {
2730
+        for (i = 0; i < pdf.nobjs; i++) {
2731
+            if (NULL != pdf.objs[i]) {
2732
+                free(pdf.objs[i]);
2733
+                pdf.objs[i] = NULL;
2734
+            }
2735
+        }
2736
+        free(pdf.objs);
2737
+        pdf.objs = NULL;
2738
+    }
2739
+    if (pdf.fileID) {
2740
+        free(pdf.fileID);
2741
+        pdf.fileID = NULL;
2742
+    }
2743
+    if (pdf.key) {
2744
+        free(pdf.key);
2745
+        pdf.key = NULL;
2746
+    }
2718 2747
 
2719 2748
     /* PDF hooks may abort, don't return CL_BREAK to caller! */
2720
-    return rc == CL_BREAK ? CL_CLEAN : rc;
2749
+    rc = (rc == CL_BREAK) ? CL_CLEAN : rc;
2750
+
2751
+    cli_dbgmsg("cli_pdf: returning %d\n", rc);
2752
+    return rc;
2721 2753
 }
2722 2754
 
2723
-/*
2724
- * Find the start of the next line
2755
+/**
2756
+ * @brief   Skip the rest of the current line, and find the start of the next line.
2757
+ * 
2758
+ * @param ptr   Current offset into buffer.
2759
+ * @param len   Remaining bytes in buffer. 
2760
+ * 
2761
+ * @return const char*  Address of next line, or NULL if no next line in buffer.
2725 2762
  */
2726 2763
 static const char *
2727 2764
 pdf_nextlinestart(const char *ptr, size_t len)
... ...
@@ -2743,9 +3490,15 @@ pdf_nextlinestart(const char *ptr, size_t len)
2743 2743
     return ptr;
2744 2744
 }
2745 2745
 
2746
-/*
2747
- * Return the start of the next PDF object.
2746
+/**
2747
+ * @brief   Return the start of the next PDF object.
2748
+ * 
2748 2749
  * This assumes that we're not in a stream.
2750
+ * 
2751
+ * @param ptr   Current offset into buffer.
2752
+ * @param len   Remaining bytes in buffer. 
2753
+ * 
2754
+ * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer. 
2749 2755
  */
2750 2756
 static const char *
2751 2757
 pdf_nextobject(const char *ptr, size_t len)
... ...
@@ -3078,10 +3831,13 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3078 3078
         return;
3079 3079
 
3080 3080
     if (!(pdf->stats.author)) {
3081
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3082
+                                             : (const char *)(obj->start + pdf->map);
3083
+
3081 3084
         pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry));
3082 3085
         if (!(pdf->stats.author))
3083 3086
             return;
3084
-        pdf->stats.author->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta));
3087
+        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta));
3085 3088
     }
3086 3089
 }
3087 3090
 #endif
... ...
@@ -3098,10 +3854,13 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3098 3098
         return;
3099 3099
 
3100 3100
     if (!(pdf->stats.creator)) {
3101
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3102
+                                             : (const char *)(obj->start + pdf->map);
3103
+
3101 3104
         pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry));
3102 3105
         if (!(pdf->stats.creator))
3103 3106
             return;
3104
-        pdf->stats.creator->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta));
3107
+        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta));
3105 3108
     }
3106 3109
 }
3107 3110
 #endif
... ...
@@ -3118,10 +3877,13 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
3118 3118
         return;
3119 3119
 
3120 3120
     if (!(pdf->stats.modificationdate)) {
3121
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3122
+                                             : (const char *)(obj->start + pdf->map);
3123
+
3121 3124
         pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
3122 3125
         if (!(pdf->stats.modificationdate))
3123 3126
             return;
3124
-        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
3127
+        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
3125 3128
     }
3126 3129
 }
3127 3130
 #endif
... ...
@@ -3138,10 +3900,13 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
3138 3138
         return;
3139 3139
 
3140 3140
     if (!(pdf->stats.creationdate)) {
3141
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3142
+                                             : (const char *)(obj->start + pdf->map);
3143
+
3141 3144
         pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
3142 3145
         if (!(pdf->stats.creationdate))
3143 3146
             return;
3144
-        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
3147
+        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
3145 3148
     }
3146 3149
 }
3147 3150
 #endif
... ...
@@ -3158,10 +3923,13 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3158 3158
         return;
3159 3159
 
3160 3160
     if (!(pdf->stats.producer)) {
3161
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3162
+                                             : (const char *)(obj->start + pdf->map);
3163
+
3161 3164
         pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry));
3162 3165
         if (!(pdf->stats.producer))
3163 3166
             return;
3164
-        pdf->stats.producer->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta));
3167
+        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta));
3165 3168
     }
3166 3169
 }
3167 3170
 #endif
... ...
@@ -3178,10 +3946,13 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3178 3178
         return;
3179 3179
 
3180 3180
     if (!(pdf->stats.title)) {
3181
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3182
+                                             : (const char *)(obj->start + pdf->map);
3183
+
3181 3184
         pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry));
3182 3185
         if (!(pdf->stats.title))
3183 3186
             return;
3184
-        pdf->stats.title->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta));
3187
+        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta));
3185 3188
     }
3186 3189
 }
3187 3190
 #endif
... ...
@@ -3198,10 +3969,13 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3198 3198
         return;
3199 3199
 
3200 3200
     if (!(pdf->stats.keywords)) {
3201
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3202
+                                             : (const char *)(obj->start + pdf->map);
3203
+
3201 3204
         pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry));
3202 3205
         if (!(pdf->stats.keywords))
3203 3206
             return;
3204
-        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta));
3207
+        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta));
3205 3208
     }
3206 3209
 }
3207 3210
 #endif
... ...
@@ -3218,10 +3992,13 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3218 3218
         return;
3219 3219
 
3220 3220
     if (!(pdf->stats.subject)) {
3221
+        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3222
+                                             : (const char *)(obj->start + pdf->map);
3223
+
3221 3224
         pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry));
3222 3225
         if (!(pdf->stats.subject))
3223 3226
             return;
3224
-        pdf->stats.subject->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta));
3227
+        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta));
3225 3228
     }
3226 3229
 }
3227 3230
 #endif
... ...
@@ -3269,9 +4046,10 @@ static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
3269 3269
 static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
3270 3270
 {
3271 3271
     struct pdf_array *array;
3272
-    const char *objstart = (const char *)(obj->start + pdf->map);
3272
+    const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3273
+                                         : (const char *)(obj->start + pdf->map);
3273 3274
     const char *begin;
3274
-    unsigned int objsz;
3275
+    unsigned int objsize;
3275 3276
     unsigned long npages=0, count;
3276 3277
     struct pdf_array_node *node;
3277 3278
     json_object *pdfobj;
... ...
@@ -3284,19 +4062,19 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3284 3284
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3285 3285
         return;
3286 3286
 
3287
-    objsz = obj_size(pdf, obj, 1);
3287
+    objsize = obj_size(pdf, obj, 1);
3288 3288
 
3289 3289
     pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
3290 3290
     if (!(pdfobj))
3291 3291
         return;
3292 3292
 
3293
-    begin = cli_memstr(objstart, objsz, "/Kids", 5);
3293
+    begin = cli_memstr(objstart, objsize, "/Kids", 5);
3294 3294
     if (!(begin))
3295 3295
         return;
3296 3296
 
3297 3297
     begin += 5;
3298 3298
 
3299
-    array = pdf_parse_array(pdf, obj, objsz, (char *)begin, NULL);
3299
+    array = pdf_parse_array(pdf, obj, objsize, (char *)begin, NULL);
3300 3300
     if (!(array)) {
3301 3301
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
3302 3302
         return;
... ...
@@ -3307,21 +4085,21 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3307 3307
             if (strchr((char *)(node->data), 'R'))
3308 3308
                 npages++;
3309 3309
 
3310
-    begin = cli_memstr(obj->start + pdf->map, objsz, "/Count", 6);
3310
+    begin = cli_memstr(objstart, objsize, "/Count", 6);
3311 3311
     if (!(begin)) {
3312 3312
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
3313 3313
         goto cleanup;
3314 3314
     }
3315 3315
 
3316 3316
     begin += 6;
3317
-    while (begin - objstart <  objsz && isspace(begin[0]))
3317
+    while (begin - objstart <  objsize && isspace(begin[0]))
3318 3318
         begin++;
3319 3319
 
3320
-    if (begin - objstart >= objsz) {
3320
+    if (begin - objstart >= objsize) {
3321 3321
         goto cleanup;
3322 3322
     }
3323 3323
 
3324
-    if ((CL_SUCCESS != cli_strntoul_wrap(begin, (size_t)(obj->start + pdf->map + objsz - begin), 0, 10, &count)) ||
3324
+    if ((CL_SUCCESS != cli_strntoul_wrap(begin, (size_t)(obj->start + pdf->map + objsize - begin), 0, 10, &count)) ||
3325 3325
         (count != npages)) {
3326 3326
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
3327 3327
     }
... ...
@@ -3336,8 +4114,10 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3336 3336
 {
3337 3337
     json_object *colorsobj, *pdfobj;
3338 3338
     unsigned long ncolors;
3339
-    char *start, *p1;
3340
-    size_t objsz;
3339
+    char *p1;
3340
+    const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3341
+                                         : (const char *)(obj->start + pdf->map);
3342
+    size_t objsize;
3341 3343
 
3342 3344
     UNUSEDPARAM(act);
3343 3345
 
... ...
@@ -3347,27 +4127,25 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3347 3347
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3348 3348
         return;
3349 3349
 
3350
-    objsz = obj_size(pdf, obj, 1);
3351
-
3352
-    start = (char *)(obj->start + pdf->map);
3350
+    objsize = obj_size(pdf, obj, 1);
3353 3351
 
3354
-    p1 = (char *)cli_memstr(start, objsz, "/Colors", 7);
3352
+    p1 = (char *)cli_memstr(objstart, objsize, "/Colors", 7);
3355 3353
     if (!(p1))
3356 3354
         return;
3357 3355
 
3358 3356
     p1 += 7;
3359 3357
 
3360 3358
     /* Ensure that we have at least one whitespace character plus at least one number */
3361
-    if (objsz - (p1 - start) < 2)
3359
+    if (objsize - (p1 - objstart) < 2)
3362 3360
         return;
3363 3361
 
3364
-    while (p1 - start < objsz && isspace(p1[0]))
3362
+    while (p1 - objstart < objsize && isspace(p1[0]))
3365 3363
         p1++;
3366 3364
 
3367
-    if ((size_t)(p1 - start) == objsz)
3365
+    if ((size_t)(p1 - objstart) == objsize)
3368 3366
         return;
3369 3367
 
3370
-    if (CL_SUCCESS != cli_strntoul_wrap(p1, (size_t)((p1 - start) - objsz), 0, 10, &ncolors))
3368
+    if (CL_SUCCESS != cli_strntoul_wrap(p1, (size_t)((p1 - objstart) - objsize), 0, 10, &ncolors))
3371 3369
         return;
3372 3370
 
3373 3371
     /* We only care if the number of colors > 2**24 */
... ...
@@ -3651,14 +4429,14 @@ static void pdf_export_json(struct pdf_struct *pdf)
3651 3651
     }
3652 3652
 
3653 3653
     for (i=0; i < pdf->nobjs; i++) {
3654
-        if (pdf->objs[i].flags & (1<<OBJ_TRUNCATED)) {
3654
+        if (pdf->objs[i]->flags & (1<<OBJ_TRUNCATED)) {
3655 3655
             json_object *truncobj;
3656 3656
 
3657 3657
             truncobj = cli_jsonarray(pdfobj, "TruncatedObjects");
3658 3658
             if (!(truncobj))
3659 3659
                 continue;
3660 3660
 
3661
-            cli_jsonint_array(truncobj, pdf->objs[i].id>>8);
3661
+            cli_jsonint_array(truncobj, pdf->objs[i]->id >> 8);
3662 3662
         }
3663 3663
     }
3664 3664
 
... ...
@@ -24,13 +24,26 @@
24 24
 #include "others.h"
25 25
 #define PDF_FILTERLIST_MAX  64
26 26
 
27
+struct objstm_struct {
28
+    uint32_t first;         // offset of first obj
29
+    uint32_t current;       // offset of current obj
30
+    uint32_t current_pair;  // offset of current pair describing id, location of object
31
+    uint32_t length;        // total length of all objects (starting at first)
32
+    uint32_t n;             // number of objects that should be found in the object stream
33
+    uint32_t nobjs_found;   // number of objects actually found in the object stream
34
+    char *streambuf;        // address of stream buffer, beginning with first obj pair
35
+    size_t streambuf_len;   // length of stream buffer, includes pairs followed by actual objects
36
+};
37
+
27 38
 struct pdf_obj {
28 39
     uint32_t start;
40
+    int32_t size;
29 41
     uint32_t id;
30 42
     uint32_t flags;
31 43
     uint32_t statsflags;
32 44
     uint32_t numfilters;
33 45
     uint32_t filterlist[PDF_FILTERLIST_MAX];
46
+    struct objstm_struct *objstm;  // Should be NULL unless the obj exists in an object stream (separate buffer)
34 47
     char *path;
35 48
 };
36 49
 
... ...
@@ -124,7 +137,7 @@ enum enc_method {
124 124
 };
125 125
 
126 126
 struct pdf_struct {
127
-    struct pdf_obj *objs;
127
+    struct pdf_obj **objs;
128 128
     unsigned nobjs;
129 129
     unsigned flags;
130 130
     unsigned enc_method_stream;
... ...
@@ -145,6 +158,8 @@ struct pdf_struct {
145 145
     char *key;
146 146
     unsigned keylen;
147 147
     struct pdf_stats stats;
148
+    struct objstm_struct **objstms;
149
+    uint32_t nobjstms;
148 150
 };
149 151
 
150 152
 #define OBJ_FLAG_PDFNAME_NONE 0x0
... ...
@@ -156,7 +171,7 @@ struct pdf_struct {
156 156
 int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset);
157 157
 void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj);
158 158
 int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags);
159
-int pdf_findobj(struct pdf_struct *pdf);
159
+cl_error_t pdf_findobj(struct pdf_struct *pdf);
160 160
 struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid);
161 161
 
162 162
 void pdf_handle_enc(struct pdf_struct *pdf);
... ...
@@ -166,13 +181,16 @@ enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key
166 166
 
167 167
 void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag);
168 168
 char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len);
169
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, struct pdf_stats_metadata *stats);
170
-struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
171
-struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
169
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, struct pdf_stats_metadata *meta);
170
+struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsize, char *begin, char **endchar);
171
+struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsize, char *begin, char **endchar);
172 172
 int is_object_reference(char *begin, char **endchar, uint32_t *id);
173 173
 void pdf_free_dict(struct pdf_dict *dict);
174 174
 void pdf_free_array(struct pdf_array *array);
175 175
 void pdf_print_dict(struct pdf_dict *dict, unsigned long depth);
176 176
 void pdf_print_array(struct pdf_array *array, unsigned long depth);
177 177
 
178
+cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts);
179
+cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm);
180
+
178 181
 #endif
... ...
@@ -1,5 +1,5 @@
1 1
 /*
2
- *  Copyright (C) 2016-2017 Cisco and/or its affiliates. All rights reserved.
2
+ *  Copyright (C) 2016-2018 Cisco and/or its affiliates. All rights reserved.
3 3
  *
4 4
  *  Author: Kevin Lin
5 5
  *
... ...
@@ -37,6 +37,7 @@
37 37
 #endif
38 38
 
39 39
 #include <stdio.h>
40
+#include <stddef.h> 
40 41
 #include <sys/types.h>
41 42
 #include <sys/stat.h>
42 43
 #include <ctype.h>
... ...
@@ -75,26 +76,57 @@ struct pdf_token {
75 75
     uint8_t *content;  /* content stream */
76 76
 };
77 77
 
78
-static  int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
79
-static  int pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, int lvl);
78
+static ptrdiff_t pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int fout, cl_error_t *status, struct objstm_struct *objstm);
79
+static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, int lvl);
80
+
81
+static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
82
+static cl_error_t filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
83
+static cl_error_t filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
84
+static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
85
+static cl_error_t filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int mode);
86
+static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
87
+
88
+/**
89
+ * @brief       Wrapper function for pdf_decodestream_internal.
90
+ * 
91
+ * Allocate a token object to store decoded filter data.
92
+ * Parse/decode the filter data and scan it.
93
+ * 
94
+ * @param pdf       Pdf context structure.
95
+ * @param obj       The object we found the filter content in.
96
+ * @param params    (optional) Dictionary parameters describing the filter data.
97
+ * @param stream    Filter stream buffer pointer.
98
+ * @param streamlen Length of filter stream buffer.
99
+ * @param xref      Indicates if the stream is an /XRef stream.  Do not apply forced decryption on /XRef streams.
100
+ * @param fout      File descriptor to which the data to be scanned is written.
101
+ * @param[out] status  Status code: CL_SUCCESS, CL_VIRUS, or a CL_E<error> code.
102
+ * @param objstm    (optional) Object stream context structure.
103
+ * @return ptrdiff_t   The number of bytes written to 'fout' to be scanned. -1 if failed out.
104
+ */
105
+ptrdiff_t pdf_decodestream(
106
+    struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params,
107
+    const char *stream, uint32_t streamlen, int xref, int fout, cl_error_t *status,
108
+    struct objstm_struct *objstm)
109
+{
110
+    struct pdf_token *token = NULL;
111
+    ptrdiff_t bytes_scanned = -1;
112
+    cl_error_t retval = CL_SUCCESS;
80 113
 
81
-static  int filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
82
-static  int filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
83
-static  int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
84
-static  int filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token);
85
-static  int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int mode);
86
-static  int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token);
114
+    if (!status) {
115
+        /* invalid args, and no way to pass back the status code */
116
+        return -1;
117
+    }
87 118
 
88
-ptrdiff_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, const char *stream, uint32_t streamlen, int xref, int fout, int *rc)
89
-{
90
-    struct pdf_token *token;
91
-    ptrdiff_t rv;
119
+    if (!pdf || !obj) {
120
+        /* Invalid args */
121
+        retval = CL_EARG;
122
+        goto done;
123
+    }
92 124
 
93 125
     if (!stream || !streamlen || fout < 0) {
94
-        cli_dbgmsg("cli_pdf: no filters or stream on obj %u %u\n", obj->id>>8, obj->id&0xff);
95
-        if (rc)
96
-            *rc = CL_ENULLARG;
97
-        return -1;
126
+        cli_dbgmsg("pdf_decodestream: no filters or stream on obj %u %u\n", obj->id>>8, obj->id&0xff);
127
+        retval = CL_ENULLARG;
128
+        goto done;
98 129
     }
99 130
 
100 131
 #if 0
... ...
@@ -104,9 +136,8 @@ ptrdiff_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct p
104 104
 
105 105
     token = cli_malloc(sizeof(struct pdf_token));
106 106
     if (!token) {
107
-        if (rc)
108
-            *rc = CL_EMEM;
109
-        return -1;
107
+        retval = CL_EMEM;
108
+        goto done;
110 109
     }
111 110
 
112 111
     token->flags = 0;
... ...
@@ -118,69 +149,110 @@ ptrdiff_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct p
118 118
     token->content = cli_malloc(streamlen);
119 119
     if (!token->content) {
120 120
         free(token);
121
-        if (rc)
122
-            *rc = CL_EMEM;
123
-        return -1;
121
+        retval = CL_EMEM;
122
+        goto done;
124 123
     }
125 124
     memcpy(token->content, stream, streamlen);
126 125
     token->length = streamlen;
127 126
 
128
-    cli_dbgmsg("cli_pdf: detected %lu applied filters\n", (long unsigned)(obj->numfilters));
127
+    cli_dbgmsg("pdf_decodestream: detected %lu applied filters\n", (long unsigned)(obj->numfilters));
129 128
 
130
-    rv = (ptrdiff_t)pdf_decodestream_internal(pdf, obj, params, token);
131
-    /* return is generally ignored */
132
-    if (rc) {
133
-        if (rv == CL_VIRUS)
134
-            *rc = CL_VIRUS;
135
-        else
136
-            *rc = CL_SUCCESS;
137
-    }
138
-
139
-    if (token->success) {
140
-        if (!cli_checklimits("pdf", pdf->ctx, token->length, 0, 0)) {
141
-            if (cli_writen(fout, token->content, token->length) != token->length) {
142
-                cli_errmsg("cli_pdf: failed to write output file\n");
143
-                if (rc)
144
-                    *rc = CL_EWRITE;
145
-                return -1;
146
-            }
147
-            rv = token->length;
148
-        }
149
-    } else {  /* if no non-forced filter are decoded, return the raw stream */
129
+    bytes_scanned = pdf_decodestream_internal(pdf, obj, params, token, fout, &retval, objstm);
130
+    /* 
131
+     * Pass back the return value, though we really only care
132
+     * if it is CL_VIRUS or CL_SUCCESS.
133
+     */
134
+    if (retval == CL_VIRUS)
135
+        retval = CL_VIRUS;
136
+    else
137
+        retval = CL_SUCCESS;
138
+
139
+    if (!token->success) {
140
+        /*
141
+         * If it was successful, the internal() function calls cli_writen()
142
+         * However, in this case... no non-forced filters were decoded,
143
+         *   so return the raw stream.
144
+         */
150 145
         if (!cli_checklimits("pdf", pdf->ctx, streamlen, 0, 0)) {
151
-            cli_dbgmsg("cli_pdf: no non-forced filters decoded, returning raw stream\n");
146
+            cli_dbgmsg("pdf_decodestream: no non-forced filters decoded, returning raw stream\n");
152 147
 
153 148
             if (cli_writen(fout, stream, streamlen) != streamlen) {
154
-                cli_errmsg("cli_pdf: failed to write output file\n");
155
-                if (rc)
156
-                    *rc = CL_EWRITE;
157
-                return -1;
149
+                cli_errmsg("pdf_decodestream: failed to write output file\n");
150
+                retval = CL_EWRITE;
151
+                bytes_scanned = -1;
152
+                goto done;
158 153
             }
159
-            rv = streamlen;
154
+            bytes_scanned = streamlen;
160 155
         }
161 156
     }
162 157
 
163
-    free(token->content);
164
-    free(token);
165
-    return rv;
158
+done:
159
+    *status = retval;
160
+
161
+    /*
162
+     * Free up the token, and token content, if any.
163
+     */
164
+    if (NULL != token)
165
+    {
166
+        if (NULL != token->content) {
167
+            free(token->content);
168
+            token->content = NULL;
169
+            token->length = 0;
170
+        }
171
+        free(token);
172
+        token = NULL;
173
+    }
174
+
175
+    return bytes_scanned;
166 176
 }
167 177
 
168
-static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
178
+/**
179
+ * @brief       Decode filter buffer data. 
180
+ * 
181
+ * Attempt to decompress, decrypt or otherwise parse it.
182
+ * 
183
+ * @param pdf           Pdf context structure.
184
+ * @param obj           The object we found the filter content in.
185
+ * @param params        (optional) Dictionary parameters describing the filter data.
186
+ * @param token         Pointer to and length of filter data.
187
+ * @param fout          File descriptor to which the data to be scanned is written.
188
+ * @param[out] status   CL_CLEAN/CL_SUCCESS or CL_VIRUS/CL_E<error>
189
+ * @param objstm        (optional) Object stream context structure.
190
+ * @return ptrdiff_t    The number of bytes we wrote to 'fout'. -1 if failed out.
191
+ */
192
+static ptrdiff_t pdf_decodestream_internal(
193
+    struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params,
194
+    struct pdf_token *token, int fout, cl_error_t *status, struct objstm_struct *objstm)
169 195
 {
196
+    cl_error_t vir = CL_CLEAN;
197
+    cl_error_t retval = CL_SUCCESS;
198
+    ptrdiff_t bytes_scanned = -1;
170 199
     const char *filter = NULL;
171
-    int i, vir = 0, rc = CL_SUCCESS;
200
+    int i;
172 201
 
202
+    if (!status) {
203
+        /* invalid args, and no way to pass back the status code */
204
+        return -1;
205
+    }
206
+
207
+    if (!pdf || !obj || !token) {
208
+        /* Invalid args */
209
+        retval = CL_EARG;
210
+        goto done;
211
+    }
212
+    
173 213
     /*
174 214
      * if pdf is decryptable, scan for CRYPT filter
175 215
      * if none, force a DECRYPT filter application
176 216
      */
177 217
     if ((pdf->flags & (1 << DECRYPTABLE_PDF)) && !(obj->flags & (1 << OBJ_FILTER_CRYPT))) {
178 218
         if (token->flags & PDFTOKEN_FLAG_XREF) /* TODO: is this on all crypt filters or only the assumed one? */
179
-            cli_dbgmsg("cli_pdf: skipping decoding => non-filter CRYPT (reason: xref)\n");
219
+            cli_dbgmsg("pdf_decodestream_internal: skipping decoding => non-filter CRYPT (reason: xref)\n");
180 220
         else {
181
-            cli_dbgmsg("cli_pdf: decoding => non-filter CRYPT\n");
182
-            if ((rc = filter_decrypt(pdf, obj, params, token, 1)) != CL_SUCCESS) {
183
-                return rc;
221
+            cli_dbgmsg("pdf_decodestream_internal: decoding => non-filter CRYPT\n");
222
+            retval = filter_decrypt(pdf, obj, params, token, 1);
223
+            if (retval != CL_SUCCESS) {
224
+                goto done;
184 225
             }
185 226
         }
186 227
     }
... ...
@@ -188,33 +260,33 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
188 188
     for (i = 0; i < obj->numfilters; i++) {
189 189
         switch(obj->filterlist[i]) {
190 190
         case OBJ_FILTER_A85:
191
-            cli_dbgmsg("cli_pdf: decoding [%d] => ASCII85DECODE\n", obj->filterlist[i]);
192
-            rc = filter_ascii85decode(pdf, obj, token);
191
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => ASCII85DECODE\n", obj->filterlist[i]);
192
+            retval = filter_ascii85decode(pdf, obj, token);
193 193
             break;
194 194
 
195 195
         case OBJ_FILTER_RL:
196
-            cli_dbgmsg("cli_pdf: decoding [%d] => RLDECODE\n", obj->filterlist[i]);
197
-            rc = filter_rldecode(pdf, obj, token);
196
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => RLDECODE\n", obj->filterlist[i]);
197
+            retval = filter_rldecode(pdf, obj, token);
198 198
             break;
199 199
 
200 200
         case OBJ_FILTER_FLATE:
201
-            cli_dbgmsg("cli_pdf: decoding [%d] => FLATEDECODE\n", obj->filterlist[i]);
202
-            rc = filter_flatedecode(pdf, obj, params, token);
201
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => FLATEDECODE\n", obj->filterlist[i]);
202
+            retval = filter_flatedecode(pdf, obj, params, token);
203 203
             break;
204 204
 
205 205
         case OBJ_FILTER_AH:
206
-            cli_dbgmsg("cli_pdf: decoding [%d] => ASCIIHEXDECODE\n", obj->filterlist[i]);
207
-            rc = filter_asciihexdecode(pdf, obj, token);
206
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => ASCIIHEXDECODE\n", obj->filterlist[i]);
207
+            retval = filter_asciihexdecode(pdf, obj, token);
208 208
             break;
209 209
 
210 210
         case OBJ_FILTER_CRYPT:
211
-            cli_dbgmsg("cli_pdf: decoding [%d] => CRYPT\n", obj->filterlist[i]);
212
-            rc = filter_decrypt(pdf, obj, params, token, 0);
211
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => CRYPT\n", obj->filterlist[i]);
212
+            retval = filter_decrypt(pdf, obj, params, token, 0);
213 213
             break;
214 214
 
215 215
         case OBJ_FILTER_LZW:
216
-            cli_dbgmsg("cli_pdf: decoding [%d] => LZWDECODE\n", obj->filterlist[i]);
217
-            rc = filter_lzwdecode(pdf, obj, params, token);
216
+            cli_dbgmsg("pdf_decodestream_internal: decoding [%d] => LZWDECODE\n", obj->filterlist[i]);
217
+            retval = filter_lzwdecode(pdf, obj, params, token);
218 218
             break;
219 219
 
220 220
         case OBJ_FILTER_JPX:
... ...
@@ -226,29 +298,29 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
226 226
         case OBJ_FILTER_JBIG2:
227 227
             if (!filter) filter = "JBIG2DECODE";
228 228
 
229
-            cli_dbgmsg("cli_pdf: unimplemented filter type [%d] => %s\n", obj->filterlist[i], filter);
229
+            cli_dbgmsg("pdf_decodestream_internal: unimplemented filter type [%d] => %s\n", obj->filterlist[i], filter);
230 230
             filter = NULL;
231
-            rc = CL_BREAK;
231
+            retval = CL_BREAK;
232 232
             break;
233 233
 
234 234
         default:
235
-            cli_dbgmsg("cli_pdf: unknown filter type [%d]\n", obj->filterlist[i]);
236
-            rc = CL_BREAK;
235
+            cli_dbgmsg("pdf_decodestream_internal: unknown filter type [%d]\n", obj->filterlist[i]);
236
+            retval = CL_BREAK;
237 237
             break;
238 238
         }
239 239
 
240 240
         if (!(token->content) || !(token->length)) {
241
-            cli_dbgmsg("cli_pdf: empty content, breaking after %d (of %lu) filters\n",
242
-                       i, (long unsigned)(obj->numfilters));
241
+            cli_dbgmsg("pdf_decodestream_internal: empty content, breaking after %d (of %lu) filters\n", i, (long unsigned)(obj->numfilters));
243 242
             break;
244 243
         }
245 244
 
246
-        if (rc != CL_SUCCESS) {
247
-            if (rc == CL_VIRUS && pdf->ctx->options & CL_SCAN_ALLMATCHES)
248
-                vir = 1;
249
-            else {
250
-                const char *reason;
251
-                switch (rc) {
245
+        if (retval != CL_SUCCESS) {
246
+            if (retval == CL_VIRUS && pdf->ctx->options & CL_SCAN_ALLMATCHES) {
247
+                vir = CL_VIRUS;
248
+            } else {
249
+                const char* reason;
250
+
251
+                switch (retval) {
252 252
                 case CL_VIRUS:
253 253
                     reason = "detection";
254 254
                     break;
... ...
@@ -260,29 +332,89 @@ static int pdf_decodestream_internal(struct pdf_struct *pdf, struct pdf_obj *obj
260 260
                     break;
261 261
                 }
262 262
 
263
-                cli_dbgmsg("cli_pdf: stopping after %d (of %lu) filters (reason: %s)\n",
264
-                           i, (long unsigned)(obj->numfilters), reason);
263
+                cli_dbgmsg("pdf_decodestream_internal: stopping after %d (of %lu) filters (reason: %s)\n", i, (long unsigned)(obj->numfilters), reason);
265 264
                 break;
266 265
             }
267 266
         }
268 267
         token->success++;
269 268
 
269
+        /* Dump the stream content to a text file if keeptmp is enabled. */
270 270
         if (pdf->ctx->engine->keeptmp) {
271
+            retval = pdf_decode_dump(pdf, obj, token, i+1);
272
+            if (retval != CL_SUCCESS) {
273
+                goto done;
274
+            }
275
+        }
276
+    }
271 277
 
272
-            if ((rc = pdf_decode_dump(pdf, obj, token, i+1)) != CL_SUCCESS)
273
-                return rc;
278
+    if (token->success > 0) {
279
+        /*
280
+         * Looks like we successfully decoded the stream, so let's write it out.
281
+         *   In the failure case, the caller will deal with the raw stream.
282
+         */
283
+        if (!cli_checklimits("pdf", pdf->ctx, token->length, 0, 0)) {
284
+            if (cli_writen(fout, token->content, token->length) != token->length) {
285
+                cli_errmsg("pdf_decodestream_internal: failed to write output file\n");
286
+                retval = CL_EWRITE;
287
+                bytes_scanned = -1;
288
+                goto done;
289
+            }
290
+            bytes_scanned = token->length;
274 291
         }
275 292
     }
276 293
 
277
-    if (vir)
278
-        return CL_VIRUS;
279
-    if (rc == CL_BREAK)
280
-        return CL_SUCCESS;
281
-    return rc;
294
+    if (NULL != objstm)
295
+    {
296
+        /*
297
+         * The caller indicated that the decoded data is an object stream.
298
+         * Perform experimental object stream parsing to extract objects from the stream.
299
+         */
300
+        objstm->streambuf = (char*)token->content;
301
+        objstm->streambuf_len = (size_t)token->length;
302
+
303
+        /* Take ownership of the malloc'd buffer */
304
+        token->content = NULL;
305
+        token->length = 0;
306
+
307
+        int objs_found = pdf->nobjs;
308
+        if (CL_SUCCESS != pdf_find_and_parse_objs_in_objstm(pdf, objstm))
309
+        {
310
+            cli_dbgmsg("pdf_decodestream_internal: pdf_find_and_parse_objs_in_objstm failed!\n");
311
+        }
312
+
313
+        if (pdf->nobjs <= objs_found) {
314
+            cli_dbgmsg("pdf_decodestream_internal: pdf_find_and_parse_objs_in_objstm did not find any new objects!\n");
315
+        } else {
316
+            cli_dbgmsg("pdf_decodestream_internal: pdf_find_and_parse_objs_in_objstm found %d new objects.\n", pdf->nobjs - objs_found);
317
+        }
318
+    }
319
+
320
+done:
321
+
322
+    *status = retval;
323
+
324
+    if (vir == CL_VIRUS)
325
+        *status = CL_VIRUS;
326
+
327
+    if (*status == CL_BREAK)
328
+        *status = CL_SUCCESS;
329
+
330
+    return bytes_scanned;
282 331
 }
283 332
 
284
-/* used only for intermediate dumping */
285
-static int pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, int lvl)
333
+/**
334
+ * @brief   Dump PDF filter content such as stream contents to a temp file.
335
+ * 
336
+ * Temp file is created in the pdf->dir directory.
337
+ * Filename format is "pdf<pdf->files-1>_<lvl>".
338
+ * 
339
+ * @param pdf   Pdf context structure.
340
+ * @param obj   The object we found the filter content in.
341
+ * @param token The struct for the filter contents.
342
+ * @param lvl   A unique index to distinguish the files from each other.
343
+ * @return cl_error_t 
344
+ */
345
+static cl_error_t pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token, int lvl)
286 346
 {
287 347
     char fname[1024];
288 348
     int ifd;
... ...
@@ -313,7 +445,7 @@ static int pdf_decode_dump(struct pdf_struct *pdf, struct pdf_obj *obj, struct p
313 313
  * ascii85 inflation
314 314
  * See http://www.piclist.com/techref/method/encode.htm (look for base85)
315 315
  */
316
-static int filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
316
+static cl_error_t filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
317 317
 {
318 318
     uint8_t *decoded, *dptr;
319 319
     uint32_t declen = 0;
... ...
@@ -415,7 +547,7 @@ static int filter_ascii85decode(struct pdf_struct *pdf, struct pdf_obj *obj, str
415 415
 }
416 416
 
417 417
 /* imported from razorback */
418
-static int filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
418
+static cl_error_t filter_rldecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
419 419
 {
420 420
     uint8_t *decoded, *temp;
421 421
     uint32_t declen = 0, capacity = 0;
... ...
@@ -523,7 +655,7 @@ static uint8_t *decode_nextlinestart(uint8_t *content, uint32_t length)
523 523
     return pt;
524 524
 }
525 525
 
526
-static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
526
+static cl_error_t filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
527 527
 {
528 528
     uint8_t *decoded, *temp;
529 529
     uint32_t declen = 0, capacity = 0;
... ...
@@ -671,14 +803,14 @@ static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, struc
671 671
     return rc;
672 672
 }
673 673
 
674
-static int filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
674
+static cl_error_t filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_token *token)
675 675
 {
676 676
     uint8_t *decoded;
677 677
 
678 678
     const uint8_t *content = (uint8_t *)token->content;
679 679
     uint32_t length = token->length;
680 680
     uint32_t i, j;
681
-    int rc = CL_SUCCESS;
681
+    cl_error_t rc = CL_SUCCESS;
682 682
 
683 683
     if (!(decoded = (uint8_t *)cli_calloc(length/2 + 1, sizeof(uint8_t)))) {
684 684
         cli_errmsg("cli_pdf: cannot allocate memory for decoded output\n");
... ...
@@ -724,7 +856,7 @@ static int filter_asciihexdecode(struct pdf_struct *pdf, struct pdf_obj *obj, st
724 724
 }
725 725
 
726 726
 /* modes: 0 = use default/DecodeParms, 1 = use document setting */
727
-static int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int mode)
727
+static cl_error_t filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token, int mode)
728 728
 {
729 729
     char *decrypted;
730 730
     size_t length = (size_t)token->length;
... ...
@@ -768,7 +900,7 @@ static int filter_decrypt(struct pdf_struct *pdf, struct pdf_obj *obj, struct pd
768 768
     return CL_SUCCESS;
769 769
 }
770 770
 
771
-static int filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
771
+static cl_error_t filter_lzwdecode(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, struct pdf_token *token)
772 772
 {
773 773
     uint8_t *decoded, *temp;
774 774
     uint32_t declen = 0, capacity = 0;
... ...
@@ -36,6 +36,26 @@
36 36
 
37 37
 #include "pdf.h"
38 38
 
39
-ptrdiff_t pdf_decodestream(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params, const char *stream, uint32_t streamlen, int xref, int fout, int *rc);
39
+/**
40
+ * @brief       Wrapper function for pdf_decodestream_internal.
41
+ * 
42
+ * Allocate a token object to store decoded filter data.
43
+ * Parse/decode the filter data and scan it.
44
+ * 
45
+ * @param pdf       Pdf context structure.
46
+ * @param obj       The object we found the filter content in.
47
+ * @param params    Dictionary parameters describing the filter data.
48
+ * @param stream    Filter stream buffer pointer.
49
+ * @param streamlen Length of filter stream buffer.
50
+ * @param xref      Indicates if the stream is an /XRef stream.  Do not apply forced decryption on /XRef streams.
51
+ * @param fout      File descriptor to write to a temp file.
52
+ * @param[out] status  Status code: CL_SUCCESS, CL_VIRUS, or a CL_E<error> code.
53
+ * @param objstm    Object stream context structure.
54
+ * @return ptrdiff_t 
55
+ */
56
+ptrdiff_t pdf_decodestream(
57
+    struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_dict *params,
58
+    const char *stream, uint32_t streamlen, int xref, int fout, cl_error_t *status,
59
+    struct objstm_struct *objstm);
40 60
 
41 61
 #endif /* __PDFDECODE_H__ */
... ...
@@ -377,17 +377,26 @@ char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
377 377
 
378 378
 char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, struct pdf_stats_metadata *meta)
379 379
 {
380
-    const char *q = objstart, *oobj=obj->start+pdf->map;
380
+    const char *q = objstart;
381 381
     char *p1, *p2;
382 382
     size_t len, checklen;
383 383
     char *res = NULL;
384 384
     uint32_t objid;
385 385
     size_t i;
386 386
 
387
-    if (objsize > (size_t)(pdf->size - (objstart - pdf->map))) {
388
-        /* Possible attempt to exploit bb11980 */
389
-        cli_dbgmsg("Malformed PDF: Alleged size of obj in PDF would extend further than the PDF data.\n");
390
-        return NULL;
387
+    if (obj->objstm) {
388
+        if (objsize > (size_t)(obj->objstm->streambuf_len - (objstart - obj->objstm->streambuf))) {
389
+            /* Possible attempt to exploit bb11980 */
390
+            cli_dbgmsg("Malformed PDF: Alleged size of obj in object stream in PDF would extend further than the object stream data.\n");
391
+            return NULL;
392
+        }
393
+    }
394
+    else {
395
+        if (objsize > (size_t)(pdf->size - (objstart - pdf->map))) {
396
+            /* Possible attempt to exploit bb11980 */
397
+            cli_dbgmsg("Malformed PDF: Alleged size of obj in PDF would extend further than the PDF data.\n");
398
+            return NULL;
399
+        }
391 400
     }
392 401
 
393 402
     /*
... ...
@@ -557,10 +566,10 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
557 557
         /* Hex string */
558 558
 
559 559
         p2 = p1+1;
560
-        while ((size_t)(p2 - oobj) < objsize && *p2 != '>')
560
+        while ((size_t)(p2 - objstart) < objsize && *p2 != '>')
561 561
             p2++;
562 562
 
563
-        if ((size_t)(p2 - oobj) == objsize) {
563
+        if ((size_t)(p2 - objstart) == objsize) {
564 564
             return NULL;
565 565
         }
566 566
 
... ...
@@ -647,7 +656,7 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
647 647
     return res;
648 648
 }
649 649
 
650
-struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
650
+struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsize, char *begin, char **endchar)
651 651
 {
652 652
     struct pdf_dict *res=NULL;
653 653
     struct pdf_dict_node *node=NULL;
... ...
@@ -659,9 +668,10 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
659 659
     if (!(pdf) || !(obj) || !(begin))
660 660
         return NULL;
661 661
 
662
-    objstart = (const char *)(obj->start + pdf->map);
662
+    objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
663
+                             : (const char *)(obj->start + pdf->map);
663 664
 
664
-    if (begin < objstart || (size_t)(begin - objstart) >= objsz - 2)
665
+    if (begin < objstart || (size_t)(begin - objstart) >= objsize - 2)
665 666
         return NULL;
666 667
 
667 668
     if (begin[0] != '<' || begin[1] != '<')
... ...
@@ -669,7 +679,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
669 669
 
670 670
     /* Find the end of the dictionary */
671 671
     end = begin;
672
-    while ((size_t)(end - objstart) < objsz) {
672
+    while ((size_t)(end - objstart) < objsize) {
673 673
         int increment=1;
674 674
         if (in_string) {
675 675
             if (*end == '\\') {
... ...
@@ -689,18 +699,18 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
689 689
                 in_string=1;
690 690
                 break;
691 691
             case '<':
692
-                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '<')
692
+                if ((size_t)(end - objstart) <= objsize - 2 && end[1] == '<')
693 693
                     ninner++;
694 694
                 increment=2;
695 695
                 break;
696 696
             case '>':
697
-                if ((size_t)(end - objstart) <= objsz - 2 && end[1] == '>')
697
+                if ((size_t)(end - objstart) <= objsize - 2 && end[1] == '>')
698 698
                     ninner--;
699 699
                 increment=2;
700 700
                 break;
701 701
         }
702 702
 
703
-        if ((size_t)(end - objstart) <= objsz - 2)
703
+        if ((size_t)(end - objstart) <= objsize - 2)
704 704
             if (end[0] == '>' && end[1] == '>' && ninner == 0)
705 705
                 break;
706 706
 
... ...
@@ -708,7 +718,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
708 708
     }
709 709
 
710 710
     /* More sanity checking */
711
-    if ((size_t)(end - objstart) >= objsz - 2)
711
+    if ((size_t)(end - objstart) >= objsize - 2)
712 712
         return NULL;
713 713
 
714 714
     if (end[0] != '>' || end[1] != '>')
... ...
@@ -809,7 +819,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
809 809
                 begin = p1+1;
810 810
                 break;
811 811
             case '<':
812
-                if ((size_t)(begin - objstart) < objsz - 2) {
812
+                if ((size_t)(begin - objstart) < objsize - 2) {
813 813
                     if (begin[1] == '<') {
814 814
                         dict = pdf_parse_dict(pdf, obj, end - objstart, begin, &p1);
815 815
                         begin = p1+2;
... ...
@@ -912,7 +922,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
912 912
     return res;
913 913
 }
914 914
 
915
-struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
915
+struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsize, char *begin, char **endchar)
916 916
 {
917 917
     struct pdf_array *res=NULL;
918 918
     struct pdf_array_node *node=NULL;
... ...
@@ -924,9 +934,10 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
924 924
     if (!(pdf) || !(obj) || !(begin))
925 925
         return NULL;
926 926
 
927
-    objstart = obj->start + pdf->map;
927
+    objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
928
+                             : (const char *)(obj->start + pdf->map);
928 929
 
929
-    if (begin < objstart || (size_t)(begin - objstart) >= objsz)
930
+    if (begin < objstart || (size_t)(begin - objstart) >= objsize)
930 931
         return NULL;
931 932
 
932 933
     if (begin[0] != '[')
... ...
@@ -934,7 +945,7 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
934 934
 
935 935
     /* Find the end of the array */
936 936
     end = begin;
937
-    while ((size_t)(end - objstart) < objsz) {
937
+    while ((size_t)(end - objstart) < objsize) {
938 938
         if (in_string) {
939 939
             if (*end == '\\') {
940 940
                 end += 2;
... ...
@@ -967,7 +978,7 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
967 967
     }
968 968
 
969 969
     /* More sanity checking */
970
-    if ((size_t)(end - objstart) >= objsz)
970
+    if ((size_t)(end - objstart) >= objsize)
971 971
         return NULL;
972 972
 
973 973
     if (*end != ']')
... ...
@@ -991,7 +1002,7 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
991 991
 
992 992
         switch (begin[0]) {
993 993
             case '<':
994
-                if ((size_t)(begin - objstart) < objsz - 2 && begin[1] == '<') {
994
+                if ((size_t)(begin - objstart) < objsize - 2 && begin[1] == '<') {
995 995
                     dict = pdf_parse_dict(pdf, obj, end - objstart, begin, &begin);
996 996
                     begin+=2;
997 997
                     break;
... ...
@@ -755,7 +755,7 @@ done:
755 755
  * @return CL_SUCCESS       Success
756 756
  * @return CL_EPARSE        Failure
757 757
  */
758
-int cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result)
758
+cl_error_t cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result)
759 759
 {
760 760
     char *endptr = NULL;
761 761
     long num;
... ...
@@ -798,7 +798,7 @@ int cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int
798 798
  * @return CL_SUCCESS       Success
799 799
  * @return CL_EPARSE        Failure
800 800
  */
801
-int cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result)
801
+cl_error_t cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result)
802 802
 {
803 803
     char *endptr = NULL;
804 804
     long num;
... ...
@@ -28,6 +28,7 @@
28 28
 #include <ctype.h>
29 29
 #include <sys/types.h>
30 30
 
31
+#include "clamav.h"
31 32
 #include "cltypes.h"
32 33
 
33 34
 #ifdef HAVE_STRCASESTR
... ...
@@ -68,8 +69,8 @@ const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle
68 68
 char *cli_strrcpy(char *dest, const char *source);
69 69
 size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
70 70
 size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens, int token_skip);
71
-int cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result);
72
-int cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result);
71
+cl_error_t cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result);
72
+cl_error_t cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result);
73 73
 int cli_isnumber(const char *str);
74 74
 char *cli_unescape(const char *str);
75 75
 struct text_buffer;