Browse code

Clean up the PDF object-finding logic. Changes include recording object sizes as objects are found, identifying object streams in the object parsing section instead of the PDF parsing section, and limiting stream and other object parsing to the size of the object instead of the size of the PDF. The code is also easier to read and includes more inline documentation.

Micah Snyder authored on 2019/03/06 11:15:41
Showing 4 changed files
... ...
@@ -68,7 +68,7 @@
68 68
 #include "json_api.h"
69 69
 
70 70
 #ifdef CL_DEBUG
71
-/*#define	SAVE_TMP	
71
+/*#define	SAVE_TMP
72 72
  *Save the file being worked on in tmp */
73 73
 #endif
74 74
 
... ...
@@ -134,14 +134,14 @@ static int xrefCheck(const char *xref, const char *eof)
134 134
     if (xref + 4 >= eof)
135 135
         return -1;
136 136
 
137
-    if (!memcmp(xref, "xref", 4)) {
137
+    if (!memcmp(xref, "xref", strlen("xref"))) {
138 138
         cli_dbgmsg("cli_pdf: found xref\n");
139 139
         return 0;
140 140
     }
141 141
 
142 142
     /* could be xref stream */
143 143
     for (q = xref; q + 5 < eof; q++) {
144
-        if (!memcmp(q, "/XRef", 4)) {
144
+        if (!memcmp(q, "/XRef", strlen("/XRef"))) {
145 145
             cli_dbgmsg("cli_pdf: found /XRef\n");
146 146
             return 0;
147 147
         }
... ...
@@ -163,10 +163,10 @@ static int xrefCheck(const char *xref, const char *eof)
163 163
 
164 164
 /**
165 165
  * @brief   Searching BACKwards, find the next character that is not a whitespace.
166
- * 
166
+ *
167 167
  * @param q         Index to start from (at the end of the search space)
168
- * @param start     Beginning of the search space. 
169
- * 
168
+ * @param start     Beginning of the search space.
169
+ *
170 170
  * @return const char*  Address of the final non-whitespace character OR the same address as the start.
171 171
  */
172 172
 static const char *findNextNonWSBack(const char *q, const char *start)
... ...
@@ -179,10 +179,10 @@ static const char *findNextNonWSBack(const char *q, const char *start)
179 179
 
180 180
 /**
181 181
  * @brief   Searching FORwards, find the next character that is not a whitespace.
182
- * 
182
+ *
183 183
  * @param q         Index to start from (at the beginning of the search space)
184
- * @param start     Beginning of the search space. 
185
- * 
184
+ * @param end       End of the search space.
185
+ *
186 186
  * @return const char*  Address of the final non-whitespace character OR the same address as the start.
187 187
  */
188 188
 static const char *findNextNonWS(const char *q, const char *end)
... ...
@@ -195,100 +195,116 @@ static const char *findNextNonWS(const char *q, const char *end)
195 195
 
196 196
 /**
197 197
  * @brief   Find bounds of stream.
198
- * 
198
+ *
199 199
  * PDF streams are prefixed with "stream" and suffixed with "endstream".
200 200
  * Return value indicates success or failure.
201
- * 
201
+ *
202 202
  * @param start             start address of search space.
203
- * @param bytesleft         size of search space for "stream"
204
- * @param bytesleft2        size of search space for "endstream"
203
+ * @param size              size of search space
205 204
  * @param[out] stream       output param, address of start of stream data
206
- * @param[out] endstream    output param, address of end of stream data
205
+ * @param[out] stream_size  output param, size of stream data
207 206
  * @param newline_hack      hack to support newlines that are \r\n, and not just \n or just \r.
208
- * 
209
- * @return int  1 if stream bounds were found. 
210
- * @return int  0 if stream bounds could not be found. 
207
+ *
208
+ * @return cl_error_t       CL_SUCCESS if stream bounds were found.
209
+ * @return cl_error_t       CL_BREAK if stream bounds could not be found.
210
+ * @return cl_error_t       CL_EFORMAT if stream start was found, but not end. (truncated)
211
+ * @return cl_error_t       CL_EARG if invalid args were provided.
211 212
  */
212
-static int find_stream_bounds(
213
+static cl_error_t find_stream_bounds(
213 214
     const char *start,
214
-    off_t bytesleft,
215
-    off_t bytesleft2,
216
-    off_t *stream,
217
-    off_t *endstream,
215
+    size_t size,
216
+    const char **stream,
217
+    size_t *stream_size,
218 218
     int newline_hack)
219 219
 {
220
-    const char *q2, *q;
220
+    cl_error_t status = CL_BREAK;
221
+
222
+    const char *idx;
223
+    const char *stream_begin;
224
+    const char *endstream_begin;
225
+    size_t bytesleft = size;
226
+
227
+    if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) {
228
+        status = CL_EARG;
229
+        return status;
230
+    }
231
+
232
+    *stream      = NULL;
233
+    *stream_size = 0;
221 234
 
222 235
     /* Begin by finding the "stream" string that prefixes stream data. */
223
-    if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) {
224
-        q2 += 6;
225
-        bytesleft -= q2 - start;
236
+    if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) {
237
+        idx = stream_begin + strlen("stream");
238
+        bytesleft -= idx - start;
226 239
         if (bytesleft < 0)
227
-            return 0;
240
+            goto done;
228 241
 
229 242
         /* Skip any newline characters. */
230
-        if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') {
231
-            q2 += 2;
232
-            if (newline_hack && (bytesleft > 2) && q2[0] == '\xa')
233
-                q2++;
234
-        } else if (bytesleft && q2[0] == '\xa') {
235
-            q2++;
243
+        if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') {
244
+            idx += 2;
245
+            if (newline_hack && (bytesleft > 2) && idx[0] == '\xa')
246
+                idx++;
247
+        } else if (bytesleft && idx[0] == '\xa') {
248
+            idx++;
236 249
         }
237 250
 
238
-        *stream = q2 - start;
251
+        /* Pass back start of the stream data. */
252
+        *stream = idx;
239 253
 
240
-        bytesleft2 -= q2 - start;
241
-        if (bytesleft2 <= 0)
242
-            return 0;
254
+        bytesleft = size - (idx - start);
255
+        if (bytesleft <= 0)
256
+            goto done;
243 257
 
244
-        /* Now find the "endstream" string that suffixes stream data */
245
-        q  = q2;
246
-        q2 = cli_memstr(q, bytesleft2, "endstream", 9);
247
-        if (!q2) {
248
-            /* Couldn't find "endstream" */
249
-            return 0;
258
+        /* Now find the "endstream" string that suffixes stream data. */
259
+        endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream"));
260
+        if (!endstream_begin) {
261
+            /* Couldn't find "endstream", but that's ok --
262
+             * -- we'll just count the rest of the provided buffer. */
263
+            cli_dbgmsg("find_stream_bounds: Truncated stream found!\n");
264
+            endstream_begin = start + size;
265
+            status          = CL_EFORMAT;
250 266
         }
251 267
 
252
-        *endstream = q2 - start;
268
+        /* Pass back the size of the stream data. */
269
+        *stream_size = endstream_begin - *stream;
253 270
 
254
-        /* Double-check that endstream >= stream */
255
-        if (*endstream < *stream)
256
-            *endstream = *stream;
257
-
258
-        return 1;
271
+        if (CL_EFORMAT != status)
272
+            status = CL_SUCCESS;
259 273
     }
260 274
 
261
-    return 0;
275
+done:
276
+
277
+    return status;
262 278
 }
263 279
 
264 280
 /**
265
- * @brief Find the next *indirect* object in an object stream, adds it to our list of 
281
+ * @brief Find the next *indirect* object in an object stream, adds it to our list of
266 282
  *        objects, and increments nobj.
267
- * 
283
+ *
268 284
  * Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
269 285
  * Instead, they have an obj ID and an offset from the first object to point you
270 286
  * right at them.
271
- * 
287
+ *
272 288
  * If found, objstm->current will be updated to the next obj id.
273
- * 
274
- * All objects in an object stream are indirect and thus do not begin or start 
275
- * with "obj" or "endobj".  Instead, the object stream takes the following 
289
+ *
290
+ * All objects in an object stream are indirect and thus do not begin or end
291
+ * with "obj" or "endobj".  Instead, the object stream takes the following
276 292
  * format.
277
- * 
293
+ *
278 294
  *      <dictionary describing stream> objstm content endobjstm
279
- * 
295
+ *
280 296
  * where content looks something like the following:
281
- * 
297
+ *
282 298
  *      15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
283
- * 
284
- * In the above example, the literal string (ab) is indirect object # 15, and 
285
- * begins at offset 0 of the set of objects.  The next object, # 16 begis at 
286
- * offset 3 is a dictionary.  The final object is also a dictionary, beginning 
299
+ *
300
+ * In the above example, the literal string (ab) is indirect object # 15, and
301
+ * begins at offset 0 of the set of objects.  The next object, # 16 begins at
302
+ * offset 3 is a dictionary.  The final object is also a dictionary, beginning
287 303
  * at offset 46.
288
- * 
289
- * @param pdf   Pdf struct that keeps track of all information found in the PDF. 
304
+ *
305
+ * @param pdf   Pdf struct that keeps track of all information found in the PDF.
290 306
  * @param objstm
291
- * 
307
+ *
292 308
  * @return CL_SUCCESS  if success
293 309
  * @return CL_EPARSE   if parsing error
294 310
  * @return CL_EMEM     if error allocating memory
... ...
@@ -298,7 +314,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
298 298
 {
299 299
     cl_error_t status   = CL_EPARSE;
300 300
     struct pdf_obj *obj = NULL;
301
-    unsigned long objid = 0, objsize = 0, objoff = 0;
301
+    unsigned long objid = 0, objoff = 0;
302 302
     long temp_long         = 0;
303 303
     const char *index      = NULL;
304 304
     size_t bytes_remaining = 0;
... ...
@@ -382,10 +398,10 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
382 382
         (index < objstm->streambuf + objstm->streambuf_len)) {
383 383
         unsigned long next_objid = 0, next_objoff = 0;
384 384
 
385
-        /* 
386
-         * While we're at it, 
385
+        /*
386
+         * While we're at it,
387 387
          *   lets record the size as running up to the next object offset.
388
-         * 
388
+         *
389 389
          * To do so, we will need to parse the next obj pair.
390 390
          */
391 391
         /* objstm->current_pair points directly to the obj id */
... ...
@@ -440,9 +456,9 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
440 440
     } else {
441 441
         /*
442 442
          * Should be no more objects. We should verify.
443
-         * 
443
+         *
444 444
          * Either way...
445
-         *   obj->size should be the rest of the buffer. 
445
+         *   obj->size should be the rest of the buffer.
446 446
          */
447 447
         if (objstm->nobjs_found < objstm->n) {
448 448
             cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
... ...
@@ -476,17 +492,25 @@ done:
476 476
 
477 477
 /**
478 478
  * @brief Find the next *indirect* object.
479
- * 
480
- * Indirect objects begin with "obj" and end with "endobj".
481
- * Identify objects that contain streams.
482
- * Identify truncated objects. 
483
- * 
479
+ *
480
+ * Indirect objects located outside of an object stream are prefaced with:
481
+ *      <objid> <genid> obj
482
+ *
483
+ * Each of the above are separated by whitespace of some sort.
484
+ *
485
+ * Indirect objects are postfaced with:
486
+ *      endobj
487
+ *
488
+ * The specification does not say if whitespace is required before or after "endobj".
489
+ *
490
+ * Identify truncated objects.
491
+ *
484 492
  * If found, pdf->offset will be updated to just after the "endobj".
485 493
  * If truncated, pdf->offset will == pdf->size.
486 494
  * If not found, pdf->offset will not be updated.
487
- * 
488
- * @param pdf   Pdf context struct that keeps track of all information found in the PDF. 
489
- * 
495
+ *
496
+ * @param pdf   Pdf context struct that keeps track of all information found in the PDF.
497
+ *
490 498
  * @return CL_SUCCESS  if success
491 499
  * @return CL_BREAK    if no more objects
492 500
  * @return CL_EPARSE   if parsing error
... ...
@@ -495,9 +519,13 @@ done:
495 495
 cl_error_t pdf_findobj(struct pdf_struct *pdf)
496 496
 {
497 497
     cl_error_t status = CL_EPARSE;
498
-    const char *start, *q, *q2, *q3, *eof;
498
+    const char *start, *idx, *genid_search_index, *objid_search_index;
499
+
500
+    const char *obj_begin = NULL, *obj_end = NULL;
501
+    const char *endobj_begin = NULL, *endobj_end = NULL;
502
+
499 503
     struct pdf_obj *obj = NULL;
500
-    off_t bytesleft;
504
+    size_t bytesleft;
501 505
     unsigned long genid, objid;
502 506
     long temp_long;
503 507
 
... ...
@@ -520,100 +548,111 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf)
520 520
     start     = pdf->map + pdf->offset;
521 521
     bytesleft = pdf->size - pdf->offset;
522 522
 
523
-    /* Indirect objects located outside of an object stream are prefaced with "obj"
524
-     * and suffixed with "endobj".  Find the "obj" preface. */
525
-    while (bytesleft > 0) {
526
-        q2 = cli_memstr(start, bytesleft, "obj", 3);
527
-        if (!q2) {
528
-            status = CL_BREAK; /* no more objs */
529
-            goto done;
523
+    /*
524
+     * Start by searching for "obj"
525
+     */
526
+    idx = start + 1;
527
+    while (bytesleft > 1 + strlen("obj")) {
528
+        /* `- 1` accounts for size of white space before obj */
529
+        idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj"));
530
+        if (NULL == idx) {
531
+            status = CL_BREAK;
532
+            goto done; /* No more objs. */
530 533
         }
531 534
 
532
-        /* verify that "obj" has a whitespace before it, and is not the end of 
533
-         * a previous string like... "globj" */
534
-        q2--;
535
-        bytesleft -= q2 - start;
535
+        /* verify that the word has a whitespace before it, and is not the end of
536
+         * a previous word */
537
+        idx--;
538
+        bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start);
536 539
 
537
-        if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) {
538
-            /* This instance of the "obj" string appears to be part of another string.
540
+        if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) {
541
+            /* This instance of "obj" appears to be part of a longer string.
539 542
              * Skip it, and keep searching for an object. */
540
-            start = q2 + 4;
541
-            bytesleft -= 4;
543
+            idx += 1 + strlen("obj");
544
+            bytesleft -= 1 + strlen("obj");
542 545
             continue;
543 546
         }
544 547
 
545
-        break; /* Found it. q2 should point to the whitespace before the "obj" string */
546
-    }
548
+        /* Found the beginning of the word */
549
+        obj_begin = idx;
550
+        obj_end   = idx + 1 + strlen("obj");
547 551
 
548
-    if (bytesleft <= 0) {
549
-        status = CL_BREAK; /* No "obj" found. */
550
-        goto done;
552
+        break;
551 553
     }
552 554
 
553
-    /* "obj" found! */
555
+    if ((NULL == obj_begin) || (NULL == obj_end)) {
556
+        status = CL_BREAK;
557
+        goto done; /* No more objs. */
558
+    }
554 559
 
555 560
     /* Find the generation id (genid) that appears before the "obj" */
556
-    q = findNextNonWSBack(q2 - 1, start);
557
-    while (q > start && isdigit(*q))
558
-        q--;
561
+    genid_search_index = findNextNonWSBack(obj_begin - 1, start);
562
+    while (genid_search_index > start && isdigit(*genid_search_index))
563
+        genid_search_index--;
559 564
 
560
-    if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
565
+    if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) {
561 566
         cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
562 567
         /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
563
-        pdf->offset = q2 + 4 - pdf->map;
568
+        pdf->offset = obj_end - pdf->map;
564 569
         status      = CL_EPARSE;
565 570
         goto done;
566 571
     } else if (temp_long < 0) {
567 572
         cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long);
568
-        pdf->offset = q2 + 4 - pdf->map;
573
+        pdf->offset = obj_end - pdf->map;
569 574
         status      = CL_EPARSE;
570 575
         goto done;
571 576
     }
572 577
     genid = (unsigned long)temp_long;
573 578
 
574
-    /* Find the object id (objid) that appers before the genid */
575
-    q = findNextNonWSBack(q - 1, start);
576
-    while (q > start && isdigit(*q))
577
-        q--;
579
+    /* Find the object id (objid) that appears before the genid */
580
+    objid_search_index = findNextNonWSBack(genid_search_index - 1, start);
581
+    while (objid_search_index > start && isdigit(*objid_search_index))
582
+        objid_search_index--;
578 583
 
579
-    if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
584
+    if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) {
580 585
         /*
581
-         * PDFs with multiple revisions will have %%EOF before the end of the file, 
582
-         * followed by the next revision of the PDF.  If this is the case, we can 
583
-         * detect it and continue parsing after the %%EOF.
586
+         * Edge case:
587
+         *
588
+         * PDFs with multiple revisions will have %%EOF before the end of the file,
589
+         * followed by the next revision of the PDF, which will probably be an immediate objid.
590
+         *
591
+         * Example:
592
+         *   %%EOF1 1 obj <blah> endobj
593
+         *
594
+         * If this is the case, we can detect it and continue parsing after the %%EOF.
584 595
          */
585
-        if (q - 4 > start) {
586
-            const char *lastfile = q - 4;
596
+        if (objid_search_index - strlen("\%\%EO") > start) {
597
+            const char *lastfile = objid_search_index - strlen("\%\%EO");
587 598
             if (0 != strncmp(lastfile, "\%\%EOF", 5)) {
588 599
                 /* Nope, wasn't %%EOF */
589 600
                 cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
590 601
                 /* Skip past the "obj" thing, and continue. */
591
-                pdf->offset = q2 + 4 - pdf->map;
602
+                pdf->offset = obj_end - pdf->map;
592 603
                 status      = CL_EPARSE;
593 604
                 goto done;
594 605
             }
595
-            /* Yup, Looks, like the file continues after %%EOF.  
606
+            /* Yup, Looks, like the file continues after %%EOF.
596 607
              * Probably another revision.  Keep parsing... */
597
-            q++;
598
-            cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at %zu\n", (size_t)q);
608
+            objid_search_index++;
609
+            cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map));
599 610
         } else {
600 611
             /* Failed parsing at the very beginning */
601 612
             cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
602 613
             /* Probably not a real object.  Skip past the "obj" thing, and continue. */
603
-            pdf->offset = q2 + 4 - pdf->map;
614
+            pdf->offset = obj_end - pdf->map;
604 615
             status      = CL_EPARSE;
605 616
             goto done;
606 617
         }
607 618
         /* Try again, with offset slightly adjusted */
608
-        if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
619
+        if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) {
609 620
             cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
610 621
             /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
611
-            pdf->offset = q2 + 4 - pdf->map;
622
+            pdf->offset = obj_end - pdf->map;
612 623
             status      = CL_EPARSE;
613 624
             goto done;
614 625
         } else if (temp_long < 0) {
615 626
             cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
616
-            pdf->offset = q2 + 4 - pdf->map;
627
+            pdf->offset = obj_end - pdf->map;
617 628
             status      = CL_EPARSE;
618 629
             goto done;
619 630
         }
... ...
@@ -621,82 +660,52 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf)
621 621
         cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
622 622
     } else if (temp_long < 0) {
623 623
         cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
624
-        pdf->offset = q2 + 4 - pdf->map;
624
+        pdf->offset = obj_end - pdf->map;
625 625
         status      = CL_EPARSE;
626 626
         goto done;
627 627
     }
628 628
     objid = (unsigned long)temp_long;
629 629
 
630
-    /*
631
-     * Ok so we have the objid, genid, and "obj" string.
632
-     *   Time to store that information and then ...
633
-     *     ... investigate what kind of object this is.
634
-     */
635 630
     obj->id    = (objid << 8) | (genid & 0xff);
636
-    obj->start = q2 + 4 - pdf->map; /* obj start begins just after the "obj" string */
631
+    obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */
637 632
     obj->flags = 0;
638 633
 
639
-    bytesleft -= 4;
640
-    eof = pdf->map + pdf->size;
641
-    q   = pdf->map + obj->start;
642
-
643
-    while (q < eof && bytesleft > 0) {
644
-        off_t p_stream, p_endstream;
645
-        q2 = pdf_nextobject(q, bytesleft);
646
-        if (!q2)
647
-            q2 = pdf->map + pdf->size; /* No interesting objects found, fast-forward to eof */
648
-
649
-        bytesleft -= q2 - q;
650
-        if (find_stream_bounds(q - 1, q2 - q, bytesleft + (q2 - q), &p_stream, &p_endstream, 1)) {
651
-            /*
652
-             * Found obj that contains a stream.
653
-             */
654
-            obj->flags |= 1 << OBJ_STREAM;
655
-            q2 = q - 1 + p_endstream + 9;
656
-            bytesleft -= q2 - q + 1;
657
-
658
-            if (bytesleft < 0) {
659
-                /* ... and the stream is truncated.  Hmm... */
660
-                obj->flags |= 1 << OBJ_TRUNCATED;
661
-                pdf->offset = pdf->size;
662
-
663
-                status = CL_SUCCESS;
664
-                goto done; /* Truncated file, no end to obj/stream. 
665
-                            * The next call to pdf_findobj() will return no more objects. */
666
-            }
667
-        } else if ((q3 = cli_memstr(q - 1, q2 - q + 1, "endobj", 6))) {
668
-            /*
669
-             * obj found and offset positioned. ideal return case
670
-             */
671
-            q2          = q3 + 6;
672
-            pdf->offset = q2 - pdf->map; /* update the offset to just after the endobj */
673
-
674
-            status = CL_SUCCESS;
675
-            goto done;
676
-        } else {
677
-            q2++;
678
-            bytesleft--;
679
-        }
680
-
681
-        q = q2;
634
+    /*
635
+     * We now have the objid, genid, and object start.
636
+     * Find the object end ("endobj").
637
+     */
638
+    /* Search from just past "obj" through the end of the PDF map. */
639
+    endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj"));
640
+    if (NULL == endobj_begin) {
641
+        /* No end to object.
642
+         * PDF appears to be malformed or truncated.
643
+         * Will record the object size as going to the end of the file.
644
+         * Will record that the object is truncated.
645
+         * Will position the pdf offset to the end of the PDF.
646
+         * The next iteration of this function will find no more objects. */
647
+        obj->flags |= 1 << OBJ_TRUNCATED;
648
+        obj->size   = (pdf->map + pdf->size) - obj_end;
649
+        pdf->offset = pdf->size;
650
+
651
+        /* Truncated "object" found! */
652
+        status = CL_SUCCESS;
653
+        goto done;
682 654
     }
655
+    endobj_end = endobj_begin + strlen("endobj");
683 656
 
684
-    obj->flags |= 1 << OBJ_TRUNCATED;
685
-    pdf->offset = pdf->size;
657
+    /* Size of the object goes from "obj" <-> "endobj". */
658
+    obj->size   = endobj_begin - obj_end;
659
+    pdf->offset = endobj_end - pdf->map;
686 660
 
661
+    /*
662
+     * Object found!
663
+     */
687 664
     status = CL_SUCCESS; /* truncated file, no end to obj. */
688 665
 
689 666
 done:
690 667
     if (status == CL_SUCCESS) {
691
-        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff));
668
+        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size);
692 669
     } else {
693
-        if (status == CL_BREAK) {
694
-            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
695
-        } else if (status == CL_EMEM) {
696
-            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
697
-        } else {
698
-            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
699
-        }
700 670
         /* Remove the unused obj reference from our list of objects found */
701 671
         /* No need to realloc pdf->objs back down.  It won't leak. */
702 672
         pdf->objs[pdf->nobjs - 1] = NULL;
... ...
@@ -705,6 +714,14 @@ done:
705 705
         /* Free up the obj struct. */
706 706
         if (NULL != obj)
707 707
             free(obj);
708
+
709
+        if (status == CL_BREAK) {
710
+            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
711
+        } else if (status == CL_EMEM) {
712
+            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
713
+        } else {
714
+            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
715
+        }
708 716
     }
709 717
 
710 718
     return status;
... ...
@@ -828,14 +845,14 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
828 828
 
829 829
 /**
830 830
  * @brief   Find and interpret the "/Length" dictionary key value.
831
- * 
831
+ *
832 832
  * The value may be:
833
- *  - a direct object (i.e. just a number) 
833
+ *  - a direct object (i.e. just a number)
834 834
  *  - an indirect object, where the value is somewhere else in the document and we have to look it up.
835 835
  *    indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'.
836
- * 
836
+ *
837 837
  * Example dictionary with a single key "/Length" that relies on a direct object for the value.
838
- * 
838
+ *
839 839
  *      1 0 obj
840 840
  *          << /Length 534
841 841
  *              /Filter [ /ASCII85Decode /LZWDecode ]
... ...
@@ -849,9 +866,9 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
849 849
  *              JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
850 850
  *          endstream
851 851
  *      endobj
852
- * 
852
+ *
853 853
  * Example dictionary with a single key "/Length" that relies on an indirect object for the value.
854
- * 
854
+ *
855 855
  *      7 0 obj
856 856
  *          << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
857 857
  *          stream
... ...
@@ -862,11 +879,11 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
862 862
  *              ET
863 863
  *          endstream
864 864
  *      endobj
865
- * 
865
+ *
866 866
  *      8 0 obj
867 867
  *          77 % The length of the preceding stream
868 868
  *      endobj
869
- * 
869
+ *
870 870
  * @param pdf       Pdf context structure.
871 871
  * @param obj       Pdf object context structure.
872 872
  * @param start     Pointer start of the dictionary string.
... ...
@@ -906,7 +923,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
906 906
     if (!obj_start)
907 907
         return 0;
908 908
 
909
-    if (bytes_remaining < obj_start - index) {
909
+    if (bytes_remaining < (size_t)(obj_start - index)) {
910 910
         return 0;
911 911
     }
912 912
     bytes_remaining -= obj_start - index;
... ...
@@ -923,10 +940,10 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
923 923
     }
924 924
     length = (size_t)temp_long; /* length or maybe object id */
925 925
 
926
-    /* 
927
-     * Keep parsing, skipping past the first integer that might have been what we wanted. 
928
-     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' 
929
-     * I.e. something like " 0 R" 
926
+    /*
927
+     * Keep parsing, skipping past the first integer that might have been what we wanted.
928
+     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R'
929
+     * I.e. something like " 0 R"
930 930
      */
931 931
     while ((bytes_remaining > 0) && isdigit(*index)) {
932 932
         index++;
... ...
@@ -958,8 +975,8 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
958 958
         }
959 959
 
960 960
         if (index[0] == ' ' && index[1] == 'R') {
961
-            /* 
962
-             * Ok so we found a genid and that 'R'.  Which means that first value 
961
+            /*
962
+             * Ok so we found a genid and that 'R'.  Which means that first value
963 963
              * was actually the objid.
964 964
              * We can look up the indirect object using this information.
965 965
              */
... ...
@@ -984,7 +1001,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
984 984
                 return 0;
985 985
             }
986 986
 
987
-            if (bytes_remaining < index - indirect_obj_start) {
987
+            if (bytes_remaining < (size_t)(index - indirect_obj_start)) {
988 988
                 return 0;
989 989
             }
990 990
             bytes_remaining -= index - indirect_obj_start;
... ...
@@ -1002,7 +1019,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
1002 1002
     }
1003 1003
 
1004 1004
     /* limit length */
1005
-    if (obj_start - pdf->map + length + 5 > pdf->size)
1005
+    if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size)
1006 1006
         length = pdf->size - (obj_start - pdf->map) - 5;
1007 1007
 
1008 1008
     return length;
... ...
@@ -1010,101 +1027,6 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
1010 1010
 
1011 1011
 #define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION))
1012 1012
 
1013
-static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary)
1014
-{
1015
-    if (0 == obj->size) {
1016
-        /*
1017
-         * Programmatically determine size if not already known.
1018
-         */
1019
-        unsigned i = 0;
1020
-
1021
-        /* Find the index of the current object */
1022
-        for (i = 0; i < pdf->nobjs; i++) {
1023
-            if (pdf->objs[i] == obj)
1024
-                break;
1025
-        }
1026
-
1027
-        /* Find the next object that exists in the same buffer (pdf fmap, or object stream) */
1028
-        if (i < pdf->nobjs) {
1029
-            i++;
1030
-        }
1031
-
1032
-        if (obj->objstm == NULL) {
1033
-            /* Current object isn't in an object stream, we want to find
1034
-             * the next object that also isn't in an object stream. */
1035
-            for (; i < pdf->nobjs; i++) {
1036
-                if (pdf->objs[i]->objstm == NULL)
1037
-                    break;
1038
-            }
1039
-        } else {
1040
-            /* Current object is in an object stream, we want to find
1041
-             * the next object that is in the same object stream.
1042
-             *
1043
-             * This really shouldn't happen, so throw a warning and
1044
-             * then see if we can solve it anyhow */
1045
-            cli_warnmsg("obj_size: Encountered pdf object in an object stream that has an unknown size!!\n");
1046
-
1047
-            for (; i < pdf->nobjs; i++) {
1048
-                if (pdf->objs[i]->objstm == obj->objstm)
1049
-                    break;
1050
-            }
1051
-        }
1052
-
1053
-        /* Step backwards from the "next" object to find the end of the current object */
1054
-        if (i < pdf->nobjs) {
1055
-            int s = pdf->objs[i]->start - obj->start - 4;
1056
-            if (s > 0) {
1057
-                if (!binary) {
1058
-                    const char *p = NULL;
1059
-                    const char *q = NULL;
1060
-
1061
-                    if (obj->objstm == NULL) {
1062
-                        p = pdf->map + obj->start;
1063
-                    } else {
1064
-                        p = obj->objstm->streambuf + obj->start;
1065
-                    }
1066
-                    q = p + s;
1067
-
1068
-                    while (q > p && (isspace(*q) || isdigit(*q)))
1069
-                        q--;
1070
-
1071
-                    if (q > p + 5 && !memcmp(q - 5, "endobj", 6))
1072
-                        q -= 6;
1073
-
1074
-                    q = findNextNonWSBack(q, p);
1075
-                    q++;
1076
-
1077
-                    obj->size = q - p;
1078
-                    goto done;
1079
-                }
1080
-
1081
-                obj->size = s;
1082
-                goto done;
1083
-            }
1084
-        }
1085
-
1086
-        /* If we've gotten this far, we didn't find a "next" object... so our 
1087
-         * current object must be at the end of the pdf fmap or the end of the 
1088
-         * object stream. */
1089
-        if (obj->objstm == NULL) {
1090
-            /* Current object isn't in an object stream, so we can determine object 
1091
-             * size based on the remaining size of the file (in theory). */
1092
-            if (binary)
1093
-                obj->size = pdf->size - obj->start;
1094
-            else
1095
-                obj->size = pdf->offset - obj->start - 6; /* This hack I think assumes that we reached the end of the file when finding objects. */
1096
-        } else {
1097
-            /* Current object is in an object stream, we want to find 
1098
-             * the next object that is in the same object stream. */
1099
-            obj->size = obj->objstm->streambuf_len - obj->start;
1100
-        }
1101
-    }
1102
-
1103
-done:
1104
-
1105
-    return obj->size;
1106
-}
1107
-
1108 1013
 static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid)
1109 1014
 {
1110 1015
     int ret;
... ...
@@ -1472,319 +1394,317 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1472 1472
     if (!(flags & PDF_EXTRACT_OBJ_SCAN))
1473 1473
         obj->path = strdup(fullname);
1474 1474
 
1475
-    do {
1476
-        if (obj->flags & (1 << OBJ_STREAM)) {
1477
-            const char *start = pdf->map + obj->start;
1478
-            off_t p_stream = 0, p_endstream = 0;
1479
-            off_t length;
1475
+    if ((NULL == obj->objstm) &&
1476
+        (obj->flags & (1 << OBJ_STREAM))) {
1477
+        /*
1478
+         * Object contains a stream. Parse this now.
1479
+         */
1480
+        cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1480 1481
 
1481
-            if (NULL != obj->objstm) {
1482
-                cli_warnmsg("pdf_extract_obj: Object found in object stream claims to be an object stream! Skipping.\n");
1483
-                break;
1484
-            }
1482
+        const char *start = pdf->map + obj->start;
1485 1483
 
1486
-            find_stream_bounds(start, pdf->size - obj->start,
1487
-                               pdf->size - obj->start,
1488
-                               &p_stream, &p_endstream,
1489
-                               pdf->enc_method_stream <= ENC_IDENTITY &&
1490
-                                   pdf->enc_method_embeddedfile <= ENC_IDENTITY);
1491
-
1492
-            if (p_stream && p_endstream) {
1493
-                size_t size = p_endstream - p_stream;
1494
-                off_t orig_length;
1495
-                int len = p_stream;
1496
-                const char *pstr;
1497
-                struct pdf_dict *dparams     = NULL;
1498
-                struct objstm_struct *objstm = NULL;
1499
-                int xref                     = 0;
1500
-
1501
-                length = find_length(pdf, obj, start, p_stream);
1502
-                if (length < 0)
1503
-                    length = 0;
1504
-
1505
-                orig_length = length;
1506
-                if (length > pdf->size || obj->start + p_stream + length > pdf->size) {
1507
-                    cli_dbgmsg("cli_pdf: length out of file: %lld + %lld > %lld\n",
1508
-                               (long long)p_stream, (long long)length, (long long)pdf->size);
1509
-                    noisy_warnmsg("length out of file, truncated: %lld + %lld > %lld\n",
1510
-                                  (long long)p_stream, (long long)length, (long long)pdf->size);
1511
-                    length = pdf->size - (obj->start + p_stream);
1512
-                }
1484
+        size_t length;
1485
+        size_t orig_length;
1486
+        int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */
1513 1487
 
1514
-                if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) {
1515
-                    const char *q = start + p_endstream;
1516
-                    length        = size;
1517
-                    q--;
1488
+        const char *pstr;
1489
+        struct pdf_dict *dparams     = NULL;
1490
+        struct objstm_struct *objstm = NULL;
1491
+        int xref                     = 0;
1518 1492
 
1519
-                    if (*q == '\n') {
1520
-                        q--;
1521
-                        length--;
1493
+        /* Find and interpret the length dictionary value */
1494
+        length = find_length(pdf, obj, start, dict_len);
1495
+        if (length < 0)
1496
+            length = 0;
1522 1497
 
1523
-                        if (*q == '\r')
1524
-                            length--;
1525
-                    } else if (*q == '\r') {
1526
-                        length--;
1527
-                    }
1498
+        orig_length = length;
1528 1499
 
1529
-                    if (length < 0)
1530
-                        length = 0;
1500
+        if (length > obj->stream_size) {
1501
+            cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1502
+            noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1531 1503
 
1532
-                    cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1533
-                } else {
1534
-                    if (size > (size_t)length + 2) {
1535
-                        cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1536
-                                   (size_t)length, size);
1537
-                        length = size;
1538
-                    }
1539
-                }
1504
+            length = obj->stream_size;
1505
+        }
1540 1506
 
1541
-                if (orig_length && size > (size_t)orig_length + 20) {
1542
-                    cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1543
-                               (long long)orig_length, (long long)length, size);
1544
-                    pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1545
-                }
1507
+        if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length <= 0)) {
1508
+            /*
1509
+             * If the length is unknown and this doesn't contain a FLATE encoded filter...
1510
+             * Calculate the length using the stream size, and trimming
1511
+             * off any newline/carriage returns from the end of the stream.
1512
+             */
1513
+            const char *q = start + obj->stream_size;
1514
+            length        = obj->stream_size;
1515
+            q--;
1516
+
1517
+            if (*q == '\n') {
1518
+                q--;
1519
+                length--;
1520
+
1521
+                if (*q == '\r')
1522
+                    length--;
1523
+            } else if (*q == '\r') {
1524
+                length--;
1525
+            }
1546 1526
 
1547
-                if (!length) {
1548
-                    length = size;
1549
-                    if (!length) {
1550
-                        cli_dbgmsg("pdf_extract_obj: length and size both 0\n");
1551
-                        break; /* Empty stream, nothing to scan */
1552
-                    }
1553
-                }
1527
+            if (length < 0)
1528
+                length = 0;
1529
+
1530
+            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1531
+        } else {
1532
+            if (obj->stream_size > (size_t)length + 2) {
1533
+                cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1534
+                           (size_t)length, obj->stream_size);
1535
+                length = obj->stream_size;
1536
+            }
1537
+        }
1554 1538
 
1555
-                if (cli_memstr(start, p_stream, "/XRef", 5))
1556
-                    xref = 1;
1539
+        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
1540
+            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1541
+                       (long long)orig_length, (long long)length, obj->stream_size);
1542
+            pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1543
+        }
1557 1544
 
1558
-                cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1545
+        if (0 == length) {
1546
+            length = obj->stream_size;
1547
+            if (0 == length) {
1548
+                cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n");
1549
+                goto done; /* Empty stream, nothing to scan */
1550
+            }
1551
+        }
1559 1552
 
1560
-                /*
1561
-                 * Identify the DecodeParms, if available.
1562
-                 */
1563
-                if (NULL != (pstr = pdf_getdict(start, &len, "/DecodeParms"))) {
1564
-                    cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1565
-                } else if (NULL != (pstr = pdf_getdict(start, &len, "/DP"))) {
1566
-                    cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1567
-                }
1553
+        /* Check if XRef is enabled */
1554
+        if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) {
1555
+            xref = 1;
1556
+        }
1568 1557
 
1569
-                if (pstr) {
1570
-                    unsigned int objsize = obj_size(pdf, obj, 1);
1558
+        cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1571 1559
 
1572
-                    /* shift pstr left to "<<" for pdf_parse_dict */
1573
-                    while ((*pstr == '<') && (pstr > start)) {
1574
-                        pstr--;
1575
-                        len++;
1576
-                    }
1560
+        /*
1561
+         * Identify the DecodeParms, if available.
1562
+         */
1563
+        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) {
1564
+            cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1565
+        } else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) {
1566
+            cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1567
+        }
1577 1568
 
1578
-                    /* shift pstr right to "<<" for pdf_parse_dict */
1579
-                    while ((*pstr != '<') && (len > 0)) {
1580
-                        pstr++;
1581
-                        len--;
1582
-                    }
1569
+        if (pstr) {
1570
+            /* shift pstr left to "<<" for pdf_parse_dict */
1571
+            while ((*pstr == '<') && (pstr > start)) {
1572
+                pstr--;
1573
+                dict_len++;
1574
+            }
1583 1575
 
1584
-                    if (len > 4)
1585
-                        dparams = pdf_parse_dict(pdf, obj, objsize, (char *)pstr, NULL);
1586
-                    else
1587
-                        cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1576
+            /* shift pstr right to "<<" for pdf_parse_dict */
1577
+            while ((*pstr != '<') && (dict_len > 0)) {
1578
+                pstr++;
1579
+                dict_len--;
1580
+            }
1581
+
1582
+            if (dict_len > 4)
1583
+                dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL);
1584
+            else
1585
+                cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1586
+        }
1587
+
1588
+        /*
1589
+         * Go back to the start of the dictionary and check to see if the stream
1590
+         * is an object stream. If so, collect the relevant info.
1591
+         */
1592
+        dict_len = obj->stream - start;
1593
+        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
1594
+            int32_t objstm_first  = -1;
1595
+            int32_t objstm_length = -1;
1596
+            int32_t objstm_n      = -1;
1597
+
1598
+            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1599
+
1600
+            dict_len = obj->stream - start;
1601
+            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) {
1602
+                cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1603
+            } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) {
1604
+                cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1605
+            } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) {
1606
+                cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1607
+            } else {
1608
+                /* Add objstm to pdf struct, so it can be freed eventually */
1609
+                pdf->nobjstms++;
1610
+                pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1611
+                if (!pdf->objstms) {
1612
+                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1613
+                    pdf_free_dict(dparams);
1614
+                    return CL_EMEM;
1588 1615
                 }
1589 1616
 
1590
-                /*
1591
-                 * Identify if the stream is an object stream. If so, collect the relevant info. 
1592
-                 */
1593
-                len = p_stream;
1594
-                if (NULL != (pstr = pdf_getdict(start, &len, "/Type/ObjStm"))) {
1595
-                    int32_t objstm_first  = -1;
1596
-                    int32_t objstm_length = -1;
1597
-                    int32_t objstm_n      = -1;
1598
-
1599
-                    cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1600
-
1601
-                    len = p_stream;
1602
-                    if ((-1 == (objstm_first = pdf_readint(start, len, "/First")))) {
1603
-                        cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1604
-                    } else if ((-1 == (objstm_length = pdf_readint(start, len, "/Length")))) {
1605
-                        cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1606
-                    } else if ((-1 == (objstm_n = pdf_readint(start, len, "/N")))) {
1607
-                        cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1608
-                    } else {
1609
-                        /* Add objstm to pdf struct, so it can be freed eventually */
1610
-                        pdf->nobjstms++;
1611
-                        pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1612
-                        if (!pdf->objstms) {
1613
-                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1614
-                            pdf_free_dict(dparams);
1615
-                            return CL_EMEM;
1616
-                        }
1617
+                objstm = malloc(sizeof(struct objstm_struct));
1618
+                if (!objstm) {
1619
+                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1620
+                    pdf_free_dict(dparams);
1621
+                    return CL_EMEM;
1622
+                }
1623
+                pdf->objstms[pdf->nobjstms - 1] = objstm;
1617 1624
 
1618
-                        objstm = malloc(sizeof(struct objstm_struct));
1619
-                        if (!objstm) {
1620
-                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1621
-                            pdf_free_dict(dparams);
1622
-                            return CL_EMEM;
1623
-                        }
1624
-                        pdf->objstms[pdf->nobjstms - 1] = objstm;
1625
+                memset(objstm, 0, sizeof(*objstm));
1625 1626
 
1626
-                        memset(objstm, 0, sizeof(*objstm));
1627
+                objstm->first        = (uint32_t)objstm_first;
1628
+                objstm->current      = (uint32_t)objstm_first;
1629
+                objstm->current_pair = 0;
1630
+                objstm->length       = (uint32_t)objstm_length;
1631
+                objstm->n            = (uint32_t)objstm_n;
1627 1632
 
1628
-                        objstm->first        = (uint32_t)objstm_first;
1629
-                        objstm->current      = (uint32_t)objstm_first;
1630
-                        objstm->current_pair = 0;
1631
-                        objstm->length       = (uint32_t)objstm_length;
1632
-                        objstm->n            = (uint32_t)objstm_n;
1633
+                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1634
+                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1635
+                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1636
+            }
1637
+        }
1638
+
1639
+        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
1640
+        if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1641
+            cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1642
+
1643
+            /* It's ok if we couldn't decode the stream,
1644
+             *   make a best effort to keep parsing. */
1645
+            if (CL_EPARSE == rc)
1646
+                rc = CL_SUCCESS;
1633 1647
 
1634
-                        cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1635
-                        cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1636
-                        cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1648
+            if (NULL != objstm) {
1649
+                /*
1650
+                 * If we were expecting an objstm and there was a failure...
1651
+                 *   discard the memory for last object stream.
1652
+                 */
1653
+                if (NULL != pdf->objstms) {
1654
+                    if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1655
+                        if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
1656
+                            free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
1657
+                            pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1658
+                        }
1659
+                        free(pdf->objstms[pdf->nobjstms - 1]);
1660
+                        pdf->objstms[pdf->nobjstms - 1] = NULL;
1637 1661
                     }
1638
-                }
1639 1662
 
1640
-                sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc, objstm);
1641
-                if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1642
-                    cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1643
-
1644
-                    /* It's ok if we couldn't decode the stream,
1645
-                     *   make a best effort to keep parsing. */
1646
-                    if (CL_EPARSE == rc)
1647
-                        rc = CL_SUCCESS;
1648
-
1649
-                    if (NULL != objstm) {
1650
-                        /*
1651
-                         * If we were expecting an objstm and there was a failure...
1652
-                         *   discard the memory for last object stream.
1653
-                         */
1654
-                        if (NULL != pdf->objstms) {
1655
-                            if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1656
-                                if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
1657
-                                    free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
1658
-                                    pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1659
-                                }
1660
-                                free(pdf->objstms[pdf->nobjstms - 1]);
1661
-                                pdf->objstms[pdf->nobjstms - 1] = NULL;
1662
-                            }
1663
+                    /* Pop the objstm off the end of the pdf->objstms array. */
1664
+                    if (pdf->nobjstms > 0) {
1665
+                        pdf->nobjstms--;
1666
+                        if (0 == pdf->nobjstms) {
1667
+                            free(pdf->objstms);
1668
+                            pdf->objstms = NULL;
1669
+                        } else {
1670
+                            pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1663 1671
 
1664
-                            /* Pop the objstm off the end of the pdf->objstms array. */
1665
-                            if (pdf->nobjstms > 0) {
1666
-                                pdf->nobjstms--;
1667
-                                if (0 == pdf->nobjstms) {
1668
-                                    free(pdf->objstms);
1669
-                                    pdf->objstms = NULL;
1670
-                                } else {
1671
-                                    pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1672
-
1673
-                                    if (!pdf->objstms) {
1674
-                                        cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1675
-                                        return CL_EMEM;
1676
-                                    }
1677
-                                }
1678
-                            } else {
1679
-                                /* hm.. this shouldn't happen */
1680
-                                cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1672
+                            if (!pdf->objstms) {
1673
+                                cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1674
+                                return CL_EMEM;
1681 1675
                             }
1682 1676
                         }
1677
+                    } else {
1678
+                        /* hm.. this shouldn't happen */
1679
+                        cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1683 1680
                     }
1684 1681
                 }
1682
+            }
1683
+        }
1685 1684
 
1686
-                if (dparams)
1687
-                    pdf_free_dict(dparams);
1688
-
1689
-                if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) {
1690
-                    sum = 0; /* prevents post-filter scan */
1691
-                    break;
1692
-                }
1685
+        if (dparams)
1686
+            pdf_free_dict(dparams);
1693 1687
 
1694
-                cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1695
-            } else {
1696
-                noisy_warnmsg("pdf_extract_obj: cannot find stream bounds for obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1697
-            }
1688
+        if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) {
1689
+            sum = 0; /* prevents post-filter scan */
1690
+            goto done;
1691
+        }
1698 1692
 
1699
-        } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1700
-            const char *q2;
1701
-            const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1702
-                                          : (const char *)(obj->start + pdf->map);
1693
+        cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1703 1694
 
1704
-            /* TODO: get obj-endobj size */
1705
-            off_t bytesleft = obj_size(pdf, obj, 0);
1695
+    } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1696
+        const char *q2;
1697
+        const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1698
+                                      : (const char *)(obj->start + pdf->map);
1706 1699
 
1707
-            if (bytesleft < 0)
1708
-                break;
1700
+        /* TODO: get obj-endobj size */
1701
+        off_t bytesleft = obj->size;
1709 1702
 
1710
-            do {
1711
-                char *js      = NULL;
1712
-                size_t js_len = 0;
1713
-                const char *q3;
1703
+        if (bytesleft < 0) {
1704
+            goto done;
1705
+        }
1714 1706
 
1715
-                q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1716
-                if (!q2)
1717
-                    break;
1707
+        do {
1708
+            char *js      = NULL;
1709
+            size_t js_len = 0;
1710
+            const char *q3;
1718 1711
 
1719
-                bytesleft -= q2 - q + 11;
1720
-                q = q2 + 11;
1712
+            q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1713
+            if (!q2)
1714
+                break;
1721 1715
 
1722
-                js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF)));
1723
-                bytesleft -= q2 - q;
1724
-                q = q2;
1716
+            bytesleft -= q2 - q + 11;
1717
+            q = q2 + 11;
1725 1718
 
1726
-                if (js) {
1727
-                    char *decrypted = NULL;
1728
-                    const char *out = js;
1729
-                    js_len          = strlen(js);
1730
-                    if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1731
-                        cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1732
-                        decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1719
+            js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF)));
1720
+            bytesleft -= q2 - q;
1721
+            q = q2;
1733 1722
 
1734
-                        if (decrypted) {
1735
-                            noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1736
-                            out = decrypted;
1737
-                        }
1738
-                    }
1723
+            if (js) {
1724
+                char *decrypted = NULL;
1725
+                const char *out = js;
1726
+                js_len          = strlen(js);
1727
+                if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1728
+                    cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1729
+                    decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1739 1730
 
1740
-                    if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) {
1741
-                        rc = CL_EWRITE;
1742
-                        free(js);
1743
-                        break;
1731
+                    if (decrypted) {
1732
+                        noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1733
+                        out = decrypted;
1744 1734
                     }
1735
+                }
1745 1736
 
1746
-                    free(decrypted);
1737
+                if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) {
1738
+                    rc = CL_EWRITE;
1747 1739
                     free(js);
1748
-                    cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1740
+                    break;
1741
+                }
1749 1742
 
1750
-                    if (bytesleft > 0) {
1751
-                        q2 = pdf_nextobject(q, bytesleft);
1752
-                        if (!q2)
1753
-                            q2 = q + bytesleft - 1;
1743
+                free(decrypted);
1744
+                free(js);
1745
+                cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1754 1746
 
1755
-                        /* non-conforming PDFs that don't escape ) properly */
1756
-                        q3 = memchr(q, ')', bytesleft);
1757
-                        if (q3 && q3 < q2)
1758
-                            q2 = q3;
1747
+                if (bytesleft > 0) {
1748
+                    q2 = pdf_nextobject(q, bytesleft);
1749
+                    if (!q2)
1750
+                        q2 = q + bytesleft - 1;
1759 1751
 
1760
-                        while (q2 > q && q2[-1] == ' ')
1761
-                            q2--;
1752
+                    /* non-conforming PDFs that don't escape ) properly */
1753
+                    q3 = memchr(q, ')', bytesleft);
1754
+                    if (q3 && q3 < q2)
1755
+                        q2 = q3;
1762 1756
 
1763
-                        if (q2 > q) {
1764
-                            q--;
1765
-                            filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum);
1766
-                            q++;
1767
-                        }
1757
+                    while (q2 > q && q2[-1] == ' ')
1758
+                        q2--;
1759
+
1760
+                    if (q2 > q) {
1761
+                        q--;
1762
+                        filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum);
1763
+                        q++;
1768 1764
                     }
1769 1765
                 }
1766
+            }
1770 1767
 
1771
-            } while (bytesleft > 0);
1772
-        } else {
1773
-            off_t bytesleft = obj_size(pdf, obj, 0);
1774
-
1775
-            if (bytesleft < 0)
1776
-                rc = CL_EFORMAT;
1777
-            else {
1778
-                if (obj->objstm) {
1779
-                    if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1780
-                        rc = CL_EWRITE;
1781
-                } else {
1782
-                    if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1783
-                        rc = CL_EWRITE;
1784
-                }
1768
+        } while (bytesleft > 0);
1769
+    } else {
1770
+        off_t bytesleft = obj->size;
1771
+
1772
+        if (bytesleft < 0)
1773
+            rc = CL_EFORMAT;
1774
+        else {
1775
+            if (obj->objstm) {
1776
+                if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1777
+                    rc = CL_EWRITE;
1778
+            } else {
1779
+                if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1780
+                    rc = CL_EWRITE;
1785 1781
             }
1786 1782
         }
1787
-    } while (0);
1783
+    }
1784
+
1785
+done:
1788 1786
 
1789 1787
     cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff);
1790 1788
     cli_dbgmsg("pdf_extract_obj:         ... to %s\n", fullname);
... ...
@@ -2093,7 +2013,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2093 2093
     const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
2094 2094
     const char *q    = NULL;
2095 2095
     const char *dict = NULL, *enddict = NULL, *start = NULL;
2096
-    off_t dict_length = 0, full_dict_length = 0, objsize = 0, bytesleft = 0;
2096
+    off_t dict_length = 0, full_dict_length = 0, bytesleft = 0;
2097 2097
     size_t i         = 0;
2098 2098
     unsigned filters = 0, blockopens = 0;
2099 2099
     enum objstate objstate = STATE_NONE;
... ...
@@ -2106,6 +2026,8 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2106 2106
         return;
2107 2107
     }
2108 2108
 
2109
+    cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff);
2110
+
2109 2111
     if (obj->objstm) {
2110 2112
         if ((size_t)obj->start > obj->objstm->streambuf_len) {
2111 2113
             cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n",
... ...
@@ -2123,14 +2045,38 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2123 2123
     }
2124 2124
     start = q;
2125 2125
 
2126
-    objsize = obj_size(pdf, obj, 1);
2127
-    if (objsize < 0)
2126
+    if (obj->size <= 0)
2128 2127
         return;
2129 2128
 
2130 2129
     if (obj->objstm) {
2131
-        bytesleft = MIN(objsize, obj->objstm->streambuf_len - obj->start);
2130
+        bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
2132 2131
     } else {
2133
-        bytesleft = MIN(objsize, pdf->size - obj->start);
2132
+        bytesleft = MIN(obj->size, pdf->size - obj->start);
2133
+    }
2134
+
2135
+    /* Check whether the object contains a stream, but only for objects that are not themselves
2136
+     * inside an object stream: those aren't supposed to contain streams, so we don't check them. */
2137
+    if (NULL == obj->objstm) {
2138
+        /* Check if object contains stream */
2139
+        cl_error_t has_stream;
2140
+        const char *stream = NULL;
2141
+        size_t stream_size = 0;
2142
+
2143
+        has_stream = find_stream_bounds(
2144
+            start,
2145
+            obj->size,
2146
+            &stream,
2147
+            &stream_size,
2148
+            (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY));
2149
+
2150
+        if ((CL_SUCCESS == has_stream) ||
2151
+            (CL_EFORMAT == has_stream)) {
2152
+            /* Stream found. Store this fact and the stream bounds. */
2153
+            cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
2154
+            obj->flags |= (1 << OBJ_STREAM);
2155
+            obj->stream      = stream;
2156
+            obj->stream_size = stream_size;
2157
+        }
2134 2158
     }
2135 2159
 
2136 2160
     /* find start of dictionary */
... ...
@@ -2181,7 +2127,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2181 2181
     dict = q3 + 2;
2182 2182
     q    = dict;
2183 2183
     blockopens++;
2184
-    bytesleft = objsize - (q - start);
2184
+    bytesleft = obj->size - (q - start);
2185 2185
     enddict   = q + bytesleft - 1;
2186 2186
 
2187 2187
     /* find end of dictionary block */
... ...
@@ -2329,7 +2275,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2329 2329
             pdfobj_flag(pdf, obj, LINEARIZED_PDF);
2330 2330
             objstate    = STATE_NONE;
2331 2331
             trailer_end = pdf_readint(dict, full_dict_length, "/H");
2332
-            if (trailer_end > 0 && trailer_end < pdf->size) {
2332
+            if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) {
2333 2333
                 trailer = trailer_end - 1024;
2334 2334
                 if (trailer < 0)
2335 2335
                     trailer = 0;
... ...
@@ -2939,7 +2885,7 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2939 2939
         return;
2940 2940
     }
2941 2941
 
2942
-    len = obj_size(pdf, obj, 1);
2942
+    len = obj->size;
2943 2943
     q   = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
2944 2944
                       : (const char *)(obj->start + pdf->map);
2945 2945
 
... ...
@@ -3095,20 +3041,20 @@ void pdf_handle_enc(struct pdf_struct *pdf)
3095 3095
 }
3096 3096
 
3097 3097
 /**
3098
- * @brief Search pdf buffer for objects.  Parse each.  
3099
- * 
3098
+ * @brief Search pdf buffer for objects.  Parse each.
3099
+ *
3100 3100
  * Newly found objects will be extracted after completion when the extraction for loop continues.
3101
- * 
3102
- * @param pdf           Pdf struct that keeps track of all information found in the PDF. 
3101
+ *
3102
+ * @param pdf           Pdf struct that keeps track of all information found in the PDF.
3103 3103
  * @param objstm        Pointer to an object stream to parse.
3104
- * 
3104
+ *
3105 3105
  * @return cl_error_t   Error code.
3106 3106
  */
3107 3107
 cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm)
3108 3108
 {
3109
-    cl_error_t status = CL_EFORMAT;
3110
-    cl_error_t retval = CL_EPARSE;
3111
-    int32_t foundobj = 0, alerts = 0;
3109
+    cl_error_t status   = CL_EFORMAT;
3110
+    cl_error_t retval   = CL_EPARSE;
3111
+    int32_t alerts      = 0;
3112 3112
     uint32_t badobjects = 0;
3113 3113
     size_t i            = 0;
3114 3114
 
... ...
@@ -3119,9 +3065,6 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
3119 3119
         goto done;
3120 3120
     }
3121 3121
 
3122
-    char *current_pair = objstm->streambuf;
3123
-    char *current_obj  = objstm->streambuf + objstm->first;
3124
-
3125 3122
     if ((0 == objstm->first) ||
3126 3123
         (0 == objstm->streambuf_len) ||
3127 3124
         (0 == objstm->n)) {
... ...
@@ -3183,18 +3126,17 @@ done:
3183 3183
 
3184 3184
 /**
3185 3185
  * @brief Search pdf buffer for objects.  Parse each and then extract each.
3186
- * 
3186
+ *
3187 3187
  * @param pdf               Pdf struct that keeps track of all information found in the PDF.
3188 3188
 * @param[in,out] alerts    The number of alerts, relevant in ALLMATCH mode.
3189
- * 
3189
+ *
3190 3190
  * @return cl_error_t   Error code.
3191 3191
  */
3192 3192
 cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts)
3193 3193
 {
3194
-    cl_error_t status = CL_SUCCESS;
3195
-    int32_t rv        = 0;
3196
-    int foundobj      = 0;
3197
-    unsigned int i = 0, j = 0;
3194
+    cl_error_t status   = CL_SUCCESS;
3195
+    int32_t rv          = 0;
3196
+    unsigned int i      = 0;
3198 3197
     uint32_t badobjects = 0;
3199 3198
     cli_ctx *ctx        = pdf->ctx;
3200 3199
 
... ...
@@ -3299,11 +3241,11 @@ done:
3299 3299
 
3300 3300
 /**
3301 3301
  * @brief Primary function for parsing and scanning a PDF.
3302
- * 
3302
+ *
3303 3303
  * @param dir       Filepath for temp file.
3304
- * @param ctx       clam scan context structure. 
3304
+ * @param ctx       clam scan context structure.
3305 3305
  * @param offset    offset of pdf in ctx->fmap
3306
- * 
3306
+ *
3307 3307
  * @return int      Returns cl_error_t status value.
3308 3308
  */
3309 3309
 int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
... ...
@@ -3505,7 +3447,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
3505 3505
     }
3506 3506
 
3507 3507
     /*
3508
-     * Find and extract all objects in the PDF. 
3508
+     * Find and extract all objects in the PDF.
3509 3509
      * New experimental recursive methodology that adds objects from object streams.
3510 3510
      */
3511 3511
     objs_found = pdf.nobjs;
... ...
@@ -3605,10 +3547,10 @@ done:
3605 3605
 
3606 3606
 /**
3607 3607
  * @brief   Skip the rest of the current line, and find the start of the next line.
3608
- * 
3608
+ *
3609 3609
  * @param ptr   Current offset into buffer.
3610
- * @param len   Remaining bytes in buffer. 
3611
- * 
3610
+ * @param len   Remaining bytes in buffer.
3611
+ *
3612 3612
  * @return const char*  Address of next line, or NULL if no next line in buffer.
3613 3613
  */
3614 3614
 static const char *
... ...
@@ -3638,13 +3580,13 @@ pdf_nextlinestart(const char *ptr, size_t len)
3638 3638
 
3639 3639
 /**
3640 3640
  * @brief   Return the start of the next PDF object.
3641
- * 
3641
+ *
3642 3642
  * This assumes that we're not in a stream.
3643
- * 
3643
+ *
3644 3644
  * @param ptr   Current offset into buffer.
3645
- * @param len   Remaining bytes in buffer. 
3646
- * 
3647
- * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer. 
3645
+ * @param len   Remaining bytes in buffer.
3646
+ *
3647
+ * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
3648 3648
  */
3649 3649
 static const char *
3650 3650
 pdf_nextobject(const char *ptr, size_t len)
... ...
@@ -3987,7 +3929,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3987 3987
         pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry));
3988 3988
         if (!(pdf->stats.author))
3989 3989
             return;
3990
-        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta));
3990
+        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta));
3991 3991
     }
3992 3992
 }
3993 3993
 #endif
... ...
@@ -4012,7 +3954,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
4012 4012
         pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry));
4013 4013
         if (!(pdf->stats.creator))
4014 4014
             return;
4015
-        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta));
4015
+        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta));
4016 4016
     }
4017 4017
 }
4018 4018
 #endif
... ...
@@ -4037,7 +3979,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
4037 4037
         pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
4038 4038
         if (!(pdf->stats.modificationdate))
4039 4039
             return;
4040
-        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
4040
+        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
4041 4041
     }
4042 4042
 }
4043 4043
 #endif
... ...
@@ -4062,7 +4004,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
4062 4062
         pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
4063 4063
         if (!(pdf->stats.creationdate))
4064 4064
             return;
4065
-        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
4065
+        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
4066 4066
     }
4067 4067
 }
4068 4068
 #endif
... ...
@@ -4087,7 +4029,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
4087 4087
         pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry));
4088 4088
         if (!(pdf->stats.producer))
4089 4089
             return;
4090
-        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta));
4090
+        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta));
4091 4091
     }
4092 4092
 }
4093 4093
 #endif
... ...
@@ -4112,7 +4054,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4112 4112
         pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry));
4113 4113
         if (!(pdf->stats.title))
4114 4114
             return;
4115
-        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta));
4115
+        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta));
4116 4116
     }
4117 4117
 }
4118 4118
 #endif
... ...
@@ -4137,7 +4079,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
4137 4137
         pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry));
4138 4138
         if (!(pdf->stats.keywords))
4139 4139
             return;
4140
-        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta));
4140
+        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta));
4141 4141
     }
4142 4142
 }
4143 4143
 #endif
... ...
@@ -4162,7 +4104,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
4162 4162
         pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry));
4163 4163
         if (!(pdf->stats.subject))
4164 4164
             return;
4165
-        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta));
4165
+        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta));
4166 4166
     }
4167 4167
 }
4168 4168
 #endif
... ...
@@ -4214,7 +4156,6 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4214 4214
     const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4215 4215
                                          : (const char *)(obj->start + pdf->map);
4216 4216
     const char *begin;
4217
-    unsigned int objsize;
4218 4217
     unsigned long npages = 0, count;
4219 4218
     long temp_long;
4220 4219
     struct pdf_array_node *node;
... ...
@@ -4229,19 +4170,17 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4229 4229
     if (!(SCAN_COLLECT_METADATA))
4230 4230
         return;
4231 4231
 
4232
-    objsize = obj_size(pdf, obj, 1);
4233
-
4234 4232
     pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4235 4233
     if (!(pdfobj))
4236 4234
         return;
4237 4235
 
4238
-    begin = cli_memstr(objstart, objsize, "/Kids", 5);
4236
+    begin = cli_memstr(objstart, obj->size, "/Kids", 5);
4239 4237
     if (!(begin))
4240 4238
         return;
4241 4239
 
4242 4240
     begin += 5;
4243 4241
 
4244
-    array = pdf_parse_array(pdf, obj, objsize, (char *)begin, NULL);
4242
+    array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL);
4245 4243
     if (!(array)) {
4246 4244
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4247 4245
         return;
... ...
@@ -4252,22 +4191,22 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4252 4252
             if (strchr((char *)(node->data), 'R'))
4253 4253
                 npages++;
4254 4254
 
4255
-    begin = cli_memstr(objstart, objsize, "/Count", 6);
4255
+    begin = cli_memstr(objstart, obj->size, "/Count", 6);
4256 4256
     if (!(begin)) {
4257 4257
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4258 4258
         goto cleanup;
4259 4259
     }
4260 4260
 
4261 4261
     begin += 6;
4262
-    while (begin - objstart < objsize && isspace(begin[0]))
4262
+    while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0]))
4263 4263
         begin++;
4264 4264
 
4265
-    if (begin - objstart >= objsize) {
4265
+    if ((size_t)(begin - objstart) >= obj->size) {
4266 4266
         goto cleanup;
4267 4267
     }
4268 4268
 
4269
-    countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + objsize - begin)
4270
-                              : (size_t)(obj->start + pdf->map + objsize - begin);
4269
+    countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin)
4270
+                              : (size_t)(obj->start + pdf->map + obj->size - begin);
4271 4271
 
4272 4272
     if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) {
4273 4273
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
... ...
@@ -4295,7 +4234,6 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
4295 4295
     char *p1;
4296 4296
     const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4297 4297
                                          : (const char *)(obj->start + pdf->map);
4298
-    size_t objsize;
4299 4298
 
4300 4299
     UNUSEDPARAM(act);
4301 4300
 
... ...
@@ -4305,25 +4243,23 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
4305 4305
     if (!(SCAN_COLLECT_METADATA))
4306 4306
         return;
4307 4307
 
4308
-    objsize = obj_size(pdf, obj, 1);
4309
-
4310
-    p1 = (char *)cli_memstr(objstart, objsize, "/Colors", 7);
4308
+    p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7);
4311 4309
     if (!(p1))
4312 4310
         return;
4313 4311
 
4314 4312
     p1 += 7;
4315 4313
 
4316 4314
     /* Ensure that we have at least one whitespace character plus at least one number */
4317
-    if (objsize - (p1 - objstart) < 2)
4315
+    if (obj->size - (size_t)(p1 - objstart) < 2)
4318 4316
         return;
4319 4317
 
4320
-    while (p1 - objstart < objsize && isspace(p1[0]))
4318
+    while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0]))
4321 4319
         p1++;
4322 4320
 
4323
-    if ((size_t)(p1 - objstart) == objsize)
4321
+    if ((size_t)(p1 - objstart) == obj->size)
4324 4322
         return;
4325 4323
 
4326
-    if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - objsize), 0, 10, &temp_long)) {
4324
+    if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) {
4327 4325
         return;
4328 4326
     } else if (temp_long < 0) {
4329 4327
         return;
... ...
@@ -37,12 +37,14 @@ struct objstm_struct {
37 37
 
38 38
 struct pdf_obj {
39 39
     uint32_t start;
40
-    int32_t size;
40
+    size_t size;
41 41
     uint32_t id;
42 42
     uint32_t flags;
43 43
     uint32_t statsflags;
44 44
     uint32_t numfilters;
45 45
     uint32_t filterlist[PDF_FILTERLIST_MAX];
46
+    const char *stream;     // pointer to stream contained in object.
47
+    size_t stream_size;      // size of stream contained in object.
46 48
     struct objstm_struct *objstm; // Should be NULL unless the obj exists in an object stream (separate buffer)
47 49
     char *path;
48 50
 };
... ...
@@ -151,7 +153,7 @@ struct pdf_struct {
151 151
     const char *CF;
152 152
     long CF_n;
153 153
     const char *map;
154
-    off_t size;
154
+    size_t size;
155 155
     off_t offset;
156 156
     off_t startoff;
157 157
     cli_ctx *ctx;
... ...
@@ -403,10 +403,9 @@ char *cli_strtokbuf(const char *input, int fieldno, const char *delim,
403 403
     return output;
404 404
 }
405 405
 
406
-const char *cli_memstr(const char *haystack, unsigned int hs,
407
-                       const char *needle, unsigned int ns)
406
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns)
408 407
 {
409
-    unsigned int i, s1, s2;
408
+    size_t i, s1, s2;
410 409
 
411 410
     if (!hs || !ns || hs < ns)
412 411
         return NULL;
... ...
@@ -32,7 +32,7 @@
32 32
 #include "clamav.h"
33 33
 #include "clamav.h"
34 34
 
35
-#define SIZE_T_CHARLEN ((sizeof(size_t) * CHAR_BIT + 2) / 3 + 1)
35
+#define SIZE_T_CHARLEN ( (sizeof(size_t) * CHAR_BIT + 2) / 3 + 1 )
36 36
 
37 37
 #ifdef HAVE_STRCASESTR
38 38
 #define cli_strcasestr strcasestr
... ...
@@ -61,25 +61,25 @@ int cli_chomp(char *string);
61 61
 char *cli_strtok(const char *line, int field, const char *delim);
62 62
 int cli_realhex2ui(const char *hex, uint16_t *ptr, unsigned int len);
63 63
 uint16_t *cli_hex2ui(const char *hex);
64
-int cli_hex2str_to(const char *hex, char *ptr, size_t len);
64
+int  cli_hex2str_to(const char *hex, char *ptr, size_t len);
65 65
 char *cli_hex2str(const char *hex);
66 66
 int cli_hex2num(const char *hex);
67 67
 int cli_xtoi(const char *hex);
68 68
 char *cli_str2hex(const char *string, unsigned int len);
69 69
 char *cli_utf16toascii(const char *str, unsigned int length);
70 70
 char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output);
71
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns);
71
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns);
72 72
 char *cli_strrcpy(char *dest, const char *source);
73 73
 size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
74 74
 size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens, int token_skip);
75
-long cli_strntol(const char *nptr, size_t n, char **endptr, register int base);
76
-unsigned long cli_strntoul(const char *nptr, size_t n, char **endptr, register int base);
75
+long cli_strntol(const char* nptr, size_t n, char** endptr, register int base);
76
+unsigned long cli_strntoul(const char* nptr, size_t n, char** endptr, register int base);
77 77
 cl_error_t cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result);
78 78
 cl_error_t cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result);
79 79
 int cli_isnumber(const char *str);
80 80
 char *cli_unescape(const char *str);
81 81
 struct text_buffer;
82
-int cli_textbuffer_append_normalize(struct text_buffer *buf, const char *str, size_t len);
82
+int  cli_textbuffer_append_normalize(struct text_buffer *buf, const char *str, size_t len);
83 83
 int cli_hexnibbles(char *str, int len);
84 84
 
85 85
 typedef enum {