Browse code

Clean up the PDF object-finding logic. Changes include recording object sizes as objects are found, identifying object streams in the object parsing section instead of the PDF parsing section, and limiting stream and other object parsing to the size of the object instead of the size of the PDF. The code is also easier to read and includes more inline documentation.

Micah Snyder authored on 2019/03/06 11:15:41
Showing 4 changed files
... ...
@@ -134,14 +134,14 @@ static int xrefCheck(const char *xref, const char *eof)
134 134
     if (xref + 4 >= eof)
135 135
         return -1;
136 136
 
137
-    if (!memcmp(xref, "xref", 4)) {
137
+    if (!memcmp(xref, "xref", strlen("xref"))) {
138 138
         cli_dbgmsg("cli_pdf: found xref\n");
139 139
         return 0;
140 140
     }
141 141
 
142 142
     /* could be xref stream */
143 143
     for (q=xref; q+5 < eof; q++) {
144
-        if (!memcmp(q,"/XRef",4)) {
144
+        if (!memcmp(q,"/XRef", strlen("/XRef"))) {
145 145
             cli_dbgmsg("cli_pdf: found /XRef\n");
146 146
             return 0;
147 147
         }
... ...
@@ -163,10 +163,10 @@ static int xrefCheck(const char *xref, const char *eof)
163 163
 
164 164
 /**
165 165
  * @brief   Searching BACKwards, find the next character that is not a whitespace.
166
- * 
166
+ *
167 167
  * @param q         Index to start from (at the end of the search space)
168
- * @param start     Beginning of the search space. 
169
- * 
168
+ * @param start     Beginning of the search space.
169
+ *
170 170
  * @return const char*  Address of the final non-whitespace character OR the same address as the start.
171 171
  */
172 172
 static const char *findNextNonWSBack(const char *q, const char *start)
... ...
@@ -179,10 +179,10 @@ static const char *findNextNonWSBack(const char *q, const char *start)
179 179
 
180 180
 /**
181 181
  * @brief   Searching FORwards, find the next character that is not a whitespace.
182
- * 
182
+ *
183 183
  * @param q         Index to start from (at the beginning of the search space)
184
- * @param start     Beginning of the search space. 
185
- * 
184
+ * @param end       End of the search space.
185
+ *
186 186
  * @return const char*  Address of the final non-whitespace character OR the same address as the start.
187 187
  */
188 188
 static const char *findNextNonWS(const char *q, const char *end)
... ...
@@ -195,100 +195,116 @@ static const char *findNextNonWS(const char *q, const char *end)
195 195
 
196 196
 /**
197 197
  * @brief   Find bounds of stream.
198
- * 
198
+ *
199 199
  * PDF streams are prefixed with "stream" and suffixed with "endstream".
200 200
  * Return value indicates success or failure.
201
- * 
201
+ *
202 202
  * @param start             start address of search space.
203
- * @param bytesleft         size of search space for "stream"
204
- * @param bytesleft2        size of search space for "endstream"
203
+ * @param size              size of search space
205 204
  * @param[out] stream       output param, address of start of stream data
206
- * @param[out] endstream    output param, address of end of stream data
205
+ * @param[out] stream_size  output param, size of stream data
207 206
  * @param newline_hack      hack to support newlines that are \r\n, and not just \n or just \r.
208
- * 
209
- * @return int  1 if stream bounds were found. 
210
- * @return int  0 if stream bounds could not be found. 
207
+ *
208
+ * @return cl_error_t       CL_SUCCESS if stream bounds were found.
209
+ * @return cl_error_t       CL_BREAK if stream bounds could not be found.
210
+ * @return cl_error_t       CL_EFORMAT if stream start was found, but not end. (truncated)
211
+ * @return cl_error_t       CL_EARG if invalid args were provided.
211 212
  */
212
-static int find_stream_bounds(
213
-    const char *start, 
214
-    off_t bytesleft, 
215
-    off_t bytesleft2, 
216
-    off_t *stream, 
217
-    off_t *endstream, 
213
+static cl_error_t find_stream_bounds(
214
+    const char *start,
215
+    size_t size,
216
+    const char **stream,
217
+    size_t *stream_size,
218 218
     int newline_hack)
219 219
 {
220
-    const char *q2, *q;
220
+    cl_error_t status = CL_BREAK;
221
+
222
+    const char *idx;
223
+    const char *stream_begin;
224
+    const char *endstream_begin;
225
+    size_t bytesleft = size;
226
+
227
+    if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) {
228
+        status = CL_EARG;
229
+        return status;
230
+    }
231
+
232
+    *stream = NULL;
233
+    *stream_size = 0;
221 234
 
222 235
     /* Begin by finding the "stream" string that prefixes stream data. */
223
-    if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) {
224
-        q2 += 6;
225
-        bytesleft -= q2 - start;
236
+    if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) {
237
+        idx = stream_begin + strlen("stream");
238
+        bytesleft -= idx - start;
226 239
         if (bytesleft < 0)
227
-            return 0;
240
+            goto done;
228 241
 
229 242
         /* Skip any newline characters. */
230
-        if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') {
231
-            q2 += 2;
232
-            if (newline_hack && (bytesleft > 2) && q2[0] == '\xa')
233
-                q2++;
234
-        } else if (bytesleft && q2[0] == '\xa') {
235
-            q2++;
243
+        if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') {
244
+            idx += 2;
245
+            if (newline_hack && (bytesleft > 2) && idx[0] == '\xa')
246
+                idx++;
247
+        } else if (bytesleft && idx[0] == '\xa') {
248
+            idx++;
236 249
         }
237 250
 
238
-        *stream = q2 - start;
251
+        /* Pass back start of the stream data. */
252
+        *stream = idx;
239 253
 
240
-        bytesleft2 -= q2 - start;
241
-        if (bytesleft2 <= 0)
242
-            return 0;
254
+        bytesleft = size - (idx - start);
255
+        if (bytesleft <= 0)
256
+            goto done;
243 257
 
244
-        /* Now find the "endstream" string that suffixes stream data */
245
-        q = q2;
246
-        q2 = cli_memstr(q, bytesleft2, "endstream", 9);
247
-        if (!q2) {
248
-            /* Couldn't find "endstream" */
249
-            return 0;
258
+        /* Now find the "endstream" string that suffixes stream data. */
259
+        endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream"));
260
+        if (!endstream_begin) {
261
+            /* Couldn't find "endstream", but that's ok --
262
+             * -- we'll just count the rest of the provided buffer. */
263
+            cli_dbgmsg("find_stream_bounds: Truncated stream found!\n");
264
+            endstream_begin = start + size;
265
+            status = CL_EFORMAT;
250 266
         }
251 267
 
252
-        *endstream = q2 - start;
253
-
254
-        /* Double-check that endstream >= stream */
255
-        if (*endstream < *stream)
256
-            *endstream = *stream;
268
+        /* Pass back the size of the stream data (up to "endstream", or to the end of the buffer if truncated). */
269
+        *stream_size = endstream_begin - *stream;
257 270
 
258
-        return 1;
271
+        if (CL_EFORMAT != status)
272
+            status = CL_SUCCESS;
259 273
     }
260 274
 
261
-    return 0;
275
+done:
276
+
277
+    return status;
262 278
 }
263 279
 
264 280
 /**
265
- * @brief Find the next *indirect* object in an object stream, adds it to our list of 
281
+ * @brief Find the next *indirect* object in an object stream, adds it to our list of
266 282
  *        objects, and increments nobj.
267
- * 
283
+ *
268 284
  * Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
269 285
  * Instead, they have an obj ID and an offset from the first object to point you
270 286
  * right at them.
271
- * 
287
+ *
272 288
  * If found, objstm->current will be updated to the next obj id.
273
- * 
274
- * All objects in an object stream are indirect and thus do not begin or start 
275
- * with "obj" or "endobj".  Instead, the object stream takes the following 
289
+ *
290
+ * All objects in an object stream are indirect and thus do not begin or end
291
+ * with "obj" or "endobj".  Instead, the object stream takes the following
276 292
  * format.
277
- * 
293
+ *
278 294
  *      <dictionary describing stream> objstm content endobjstm
279
- * 
295
+ *
280 296
  * where content looks something like the following:
281
- * 
297
+ *
282 298
  *      15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
283
- * 
284
- * In the above example, the literal string (ab) is indirect object # 15, and 
285
- * begins at offset 0 of the set of objects.  The next object, # 16 begis at 
286
- * offset 3 is a dictionary.  The final object is also a dictionary, beginning 
299
+ *
300
+ * In the above example, the literal string (ab) is indirect object # 15, and
301
+ * begins at offset 0 of the set of objects.  The next object, # 16 begins at
302
+ * offset 3 is a dictionary.  The final object is also a dictionary, beginning
287 303
  * at offset 46.
288
- * 
289
- * @param pdf   Pdf struct that keeps track of all information found in the PDF. 
304
+ *
305
+ * @param pdf   Pdf struct that keeps track of all information found in the PDF.
290 306
  * @param objstm
291
- * 
307
+ *
292 308
  * @return CL_SUCCESS  if success
293 309
  * @return CL_EPARSE   if parsing error
294 310
  * @return CL_EMEM     if error allocating memory
... ...
@@ -298,7 +314,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
298 298
 {
299 299
     cl_error_t status = CL_EPARSE;
300 300
     struct pdf_obj *obj = NULL;
301
-    unsigned long objid = 0, objsize = 0, objoff = 0;
301
+    unsigned long objid = 0, objoff = 0;
302 302
     long temp_long         = 0;
303 303
     const char *index = NULL;
304 304
     size_t bytes_remaining = 0;
... ...
@@ -383,10 +399,10 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
383 383
     {
384 384
         unsigned long next_objid = 0, next_objoff = 0;
385 385
 
386
-        /* 
387
-         * While we're at it, 
386
+        /*
387
+         * While we're at it,
388 388
          *   lets record the size as running up to the next object offset.
389
-         * 
389
+         *
390 390
          * To do so, we will need to parse the next obj pair.
391 391
          */
392 392
         /* objstm->current_pair points directly to the obj id */
... ...
@@ -439,14 +455,14 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm,
439 439
         }
440 440
 
441 441
         obj->size = next_objoff - objoff;
442
-    } 
443
-    else 
442
+    }
443
+    else
444 444
     {
445 445
         /*
446 446
          * Should be no more objects. We should verify.
447
-         * 
447
+         *
448 448
          * Either way...
449
-         *   obj->size should be the rest of the buffer. 
449
+         *   obj->size should be the rest of the buffer.
450 450
          */
451 451
         if (objstm->nobjs_found < objstm->n) {
452 452
             cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
... ...
@@ -480,17 +496,25 @@ done:
480 480
 
481 481
 /**
482 482
  * @brief Find the next *indirect* object.
483
- * 
484
- * Indirect objects begin with "obj" and end with "endobj".
485
- * Identify objects that contain streams.
486
- * Identify truncated objects. 
487
- * 
483
+ *
484
+ * Indirect objects located outside of an object stream are prefaced with:
485
+ *      <objid> <genid> obj
486
+ *
487
+ * Each of the above are separated by whitespace of some sort.
488
+ *
489
+ * Indirect objects are postfaced with:
490
+ *      endobj
491
+ *
492
+ * The specification does not say if whitespace is required before or after "endobj".
493
+ *
494
+ * Identify truncated objects.
495
+ *
488 496
  * If found, pdf->offset will be updated to just after the "endobj".
489 497
  * If truncated, pdf->offset will == pdf->size.
490 498
  * If not found, pdf->offset will not be updated.
491
- * 
492
- * @param pdf   Pdf context struct that keeps track of all information found in the PDF. 
493
- * 
499
+ *
500
+ * @param pdf   Pdf context struct that keeps track of all information found in the PDF.
501
+ *
494 502
  * @return CL_SUCCESS  if success
495 503
  * @return CL_BREAK    if no more objects
496 504
  * @return CL_EPARSE   if parsing error
... ...
@@ -499,9 +523,13 @@ done:
499 499
 cl_error_t pdf_findobj(struct pdf_struct *pdf)
500 500
 {
501 501
     cl_error_t status = CL_EPARSE;
502
-    const char *start, *q, *q2, *q3, *eof;
502
+    const char *start, *idx, *genid_search_index, *objid_search_index;
503
+
504
+    const char *obj_begin = NULL, *obj_end = NULL;
505
+    const char *endobj_begin = NULL, *endobj_end = NULL;
506
+
503 507
     struct pdf_obj *obj = NULL;
504
-    off_t bytesleft;
508
+    size_t bytesleft;
505 509
     unsigned long genid, objid;
506 510
     long temp_long;
507 511
 
... ...
@@ -524,101 +552,111 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf)
524 524
     start = pdf->map + pdf->offset;
525 525
     bytesleft = pdf->size - pdf->offset;
526 526
 
527
-    /* Indirect objects located outside of an object stream are prefaced with "obj"
528
-     * and suffixed with "endobj".  Find the "obj" preface. */
529
-    while (bytesleft > 0)
530
-    {
531
-        q2 = cli_memstr(start, bytesleft, "obj", 3);
532
-        if (!q2) {
533
-            status = CL_BREAK; /* no more objs */
534
-            goto done;
527
+    /*
528
+     * Start by searching for "obj"
529
+     */
530
+    idx = start + 1;
531
+    while (bytesleft > 1 + strlen("obj")) {
532
+        /* `- 1` accounts for size of white space before obj */
533
+        idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj"));
534
+        if (NULL == idx) {
535
+            status = CL_BREAK;
536
+            goto done; /* No more objs. */
535 537
         }
536 538
 
537
-        /* verify that "obj" has a whitespace before it, and is not the end of 
538
-         * a previous string like... "globj" */
539
-        q2--;
540
-        bytesleft -= q2 - start;
539
+        /* verify that the word has a whitespace before it, and is not the end of
540
+         * a previous word */
541
+        idx--;
542
+        bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start);
541 543
 
542
-        if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) {
543
-            /* This instance of the "obj" string appears to be part of another string.
544
+        if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) {
545
+            /* This instance of "obj" appears to be part of a longer string.
544 546
              * Skip it, and keep searching for an object. */
545
-            start = q2+4;
546
-            bytesleft -= 4;
547
+            idx += 1 + strlen("obj");
548
+            bytesleft -= 1 + strlen("obj");
547 549
             continue;
548 550
         }
549 551
 
550
-        break; /* Found it. q2 should point to the whitespace before the "obj" string */
551
-    }
552
+        /* Found the beginning of the word */
553
+        obj_begin = idx;
554
+        obj_end = idx + 1 + strlen("obj");
552 555
 
553
-    if (bytesleft <= 0) {
554
-        status = CL_BREAK; /* No "obj" found. */
555
-        goto done;
556
+        break;
556 557
     }
557 558
 
558
-    /* "obj" found! */
559
+    if ((NULL == obj_begin) || (NULL == obj_end)) {
560
+        status = CL_BREAK;
561
+        goto done; /* No more objs. */
562
+    }
559 563
 
560 564
     /* Find the generation id (genid) that appears before the "obj" */
561
-    q = findNextNonWSBack(q2-1, start);
562
-    while (q > start && isdigit(*q))
563
-        q--;
565
+    genid_search_index = findNextNonWSBack(obj_begin - 1, start);
566
+    while (genid_search_index > start && isdigit(*genid_search_index))
567
+        genid_search_index--;
564 568
 
565
-    if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
569
+    if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin) - genid_search_index), 0, 10, &temp_long)) {
566 570
         cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
567 571
         /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
568
-        pdf->offset = q2 + 4 - pdf->map;
572
+        pdf->offset = obj_end - pdf->map;
569 573
         status = CL_EPARSE;
570 574
         goto done;
571 575
     } else if (temp_long < 0) {
572 576
         cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long);
573
-        pdf->offset = q2 + 4 - pdf->map;
577
+        pdf->offset = obj_end - pdf->map;
574 578
         status      = CL_EPARSE;
575 579
         goto done;
576 580
     }
577 581
     genid = (unsigned long)temp_long;
578 582
 
579
-    /* Find the object id (objid) that appers before the genid */
580
-    q = findNextNonWSBack(q-1,start);
581
-    while (q > start && isdigit(*q))
582
-        q--;
583
+    /* Find the object id (objid) that appears before the genid */
584
+    objid_search_index = findNextNonWSBack(genid_search_index - 1, start);
585
+    while (objid_search_index > start && isdigit(*objid_search_index))
586
+        objid_search_index--;
583 587
 
584
-    if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
588
+    if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index) - objid_search_index), 0, 10, &temp_long)) {
585 589
         /*
586
-         * PDFs with multiple revisions will have %%EOF before the end of the file, 
587
-         * followed by the next revision of the PDF.  If this is the case, we can 
588
-         * detect it and continue parsing after the %%EOF.
590
+         * Edge case:
591
+         *
592
+         * PDFs with multiple revisions will have %%EOF before the end of the file,
593
+         * followed by the next revision of the PDF, which will probably be an immediate objid.
594
+         *
595
+         * Example:
596
+         *   %%EOF1 1 obj <blah> endobj
597
+         *
598
+         * If this is the case, we can detect it and continue parsing after the %%EOF.
589 599
          */
590
-        if (q - 4 > start) {
591
-            const char* lastfile = q - 4;
600
+        if (objid_search_index - strlen("\%\%EO") > start) {
601
+            const char* lastfile = objid_search_index - strlen("\%\%EO");
592 602
             if (0 != strncmp(lastfile, "\%\%EOF", 5)) {
593 603
                 /* Nope, wasn't %%EOF */
594 604
                 cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
595 605
                 /* Skip past the "obj" thing, and continue. */
596
-                pdf->offset = q2 + 4 - pdf->map;
597
-                status = CL_EPARSE;
606
+                pdf->offset = obj_end - pdf->map;
607
+                status      = CL_EPARSE;
598 608
                 goto done;
599 609
             }
600
-            /* Yup, Looks, like the file continues after %%EOF.  
610
+            /* Yup, looks like the file continues after %%EOF.
601 611
              * Probably another revision.  Keep parsing... */
602
-            q++;
603
-            cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at %zu\n", (size_t)q);
612
+            objid_search_index++;
613
+            cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map));
604 614
         } else {
605 615
             /* Failed parsing at the very beginning */
606 616
             cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
607 617
             /* Probably not a real object.  Skip past the "obj" thing, and continue. */
608
-            pdf->offset = q2 + 4 - pdf->map;
609
-            status = CL_EPARSE;
618
+            pdf->offset = obj_end - pdf->map;
619
+            status      = CL_EPARSE;
610 620
             goto done;
611 621
         }
612 622
         /* Try again, with offset slightly adjusted */
613
-        if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) {
623
+        if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) {
614 624
             cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
615 625
             /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
616
-            pdf->offset = q2 + 4 - pdf->map;
617
-            status = CL_EPARSE;
626
+            pdf->offset = obj_end - pdf->map;
627
+            status      = CL_EPARSE;
618 628
             goto done;
619 629
         } else if (temp_long < 0) {
620 630
             cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
621
-            pdf->offset = q2 + 4 - pdf->map;
631
+            pdf->offset = obj_end - pdf->map;
622 632
             status      = CL_EPARSE;
623 633
             goto done;
624 634
         }
... ...
@@ -626,85 +664,54 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf)
626 626
         cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
627 627
     } else if (temp_long < 0) {
628 628
         cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
629
-        pdf->offset = q2 + 4 - pdf->map;
629
+        pdf->offset = obj_end - pdf->map;
630 630
         status      = CL_EPARSE;
631 631
         goto done;
632 632
     }
633 633
     objid = (unsigned long)temp_long;
634 634
 
635
-    /*
636
-     * Ok so we have the objid, genid, and "obj" string.
637
-     *   Time to store that information and then ...
638
-     *     ... investigate what kind of object this is.
639
-     */
640 635
     obj->id = (objid << 8) | (genid & 0xff);
641
-    obj->start = q2+4 - pdf->map; /* obj start begins just after the "obj" string */
636
+    obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */
642 637
     obj->flags = 0;
643 638
 
644
-    bytesleft -= 4;
645
-    eof = pdf->map + pdf->size;
646
-    q = pdf->map + obj->start;
647
-
648
-    while (q < eof && bytesleft > 0)
649
-    {
650
-        off_t p_stream, p_endstream;
651
-        q2 = pdf_nextobject(q, bytesleft);
652
-        if (!q2)
653
-            q2 = pdf->map + pdf->size; /* No interesting objects found, fast-forward to eof */
654
-
655
-        bytesleft -= q2 - q;
656
-        if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream, 1)) {
657
-            /*
658
-             * Found obj that contains a stream.
659
-             */
660
-            obj->flags |= 1 << OBJ_STREAM;
661
-            q2 = q-1 + p_endstream + 9;
662
-            bytesleft -= q2 - q + 1;
663
-
664
-            if (bytesleft < 0) {
665
-                /* ... and the stream is truncated.  Hmm... */
666
-                obj->flags |= 1 << OBJ_TRUNCATED;
667
-                pdf->offset = pdf->size;
668
-
669
-                status = CL_SUCCESS;
670
-                goto done; /* Truncated file, no end to obj/stream. 
671
-                            * The next call to pdf_findobj() will return no more objects. */
672
-            }
673
-        } else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
674
-            /*
675
-             * obj found and offset positioned. ideal return case
676
-             */
677
-            q2 = q3 + 6;
678
-            pdf->offset = q2 - pdf->map; /* update the offset to just after the endobj */
679
-
680
-            status = CL_SUCCESS;
681
-            goto done; 
682
-        } else {
683
-            q2++;
684
-            bytesleft--;
685
-        }
686
-
687
-        q = q2;
639
+    /*
640
+     * We now have the objid, genid, and object start.
641
+     * Find the object end ("endobj").
642
+     */
643
+    /* Search for "endobj" in the bytes between the end of "obj" and the end of the PDF. */
644
+    endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj"));
645
+    if (NULL == endobj_begin) {
646
+        /* No end to object.
647
+         * PDF appears to be malformed or truncated.
648
+         * Will record the object size as going to the end of the file.
649
+         * Will record that the object is truncated.
650
+         * Will position the pdf offset to the end of the PDF.
651
+         * The next iteration of this function will find no more objects. */
652
+        obj->flags |= 1 << OBJ_TRUNCATED;
653
+        obj->size   = (pdf->map + pdf->size) - obj_end;
654
+        pdf->offset = pdf->size;
655
+
656
+        /* Truncated "object" found! */
657
+        status = CL_SUCCESS;
658
+        goto done;
688 659
     }
660
+    endobj_end = endobj_begin + strlen("endobj");
689 661
 
690
-    obj->flags |= 1 << OBJ_TRUNCATED;
691
-    pdf->offset = pdf->size;
662
+    /* Size of the object goes from "obj" <-> "endobj". */
663
+    obj->size = endobj_begin - obj_end;
664
+    pdf->offset = endobj_end - pdf->map;
692 665
 
666
+    /*
667
+     * Object found!
668
+     */
693 669
     status = CL_SUCCESS; /* complete object found, ending at "endobj". */
694 670
 
695 671
 done:
696 672
     if (status == CL_SUCCESS) {
697
-        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + pdf->startoff));
673
+        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + pdf->startoff), obj->size);
698 674
     }
699 675
     else
700 676
     {
701
-        if(status == CL_BREAK) {
702
-            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
703
-        } else if(status == CL_EMEM) {
704
-            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
705
-        } else {
706
-            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
707
-        }
708 677
         /* Remove the unused obj reference from our list of objects found */
709 678
         /* No need to realloc pdf->objs back down.  It won't leak. */
710 679
         pdf->objs[pdf->nobjs-1] = NULL;
... ...
@@ -713,9 +720,17 @@ done:
713 713
         /* Free up the obj struct. */
714 714
         if (NULL != obj)
715 715
             free(obj);
716
+
717
+        if(status == CL_BREAK) {
718
+            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
719
+        } else if(status == CL_EMEM) {
720
+            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
721
+        } else {
722
+            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
723
+        }
716 724
     }
717 725
 
718
-    return status; 
726
+    return status;
719 727
 }
720 728
 
721 729
 static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum)
... ...
@@ -836,14 +851,14 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
836 836
 
837 837
 /**
838 838
  * @brief   Find and interpret the "/Length" dictionary key value.
839
- * 
839
+ *
840 840
  * The value may be:
841
- *  - a direct object (i.e. just a number) 
841
+ *  - a direct object (i.e. just a number)
842 842
  *  - an indirect object, where the value is somewhere else in the document and we have to look it up.
843 843
  *    indirect objects are referenced using an object id (objid), generation id (genid), and the letter 'R'.
844
- * 
844
+ *
845 845
  * Example dictionary with a single key "/Length" that relies on a direct object for the value.
846
- * 
846
+ *
847 847
  *      1 0 obj
848 848
  *          << /Length 534
849 849
  *              /Filter [ /ASCII85Decode /LZWDecode ]
... ...
@@ -857,9 +872,9 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
857 857
  *              JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
858 858
  *          endstream
859 859
  *      endobj
860
- * 
860
+ *
861 861
  * Example dictionary with a single key "/Length" that relies on an indirect object for the value.
862
- * 
862
+ *
863 863
  *      7 0 obj
864 864
  *          << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
865 865
  *          stream
... ...
@@ -870,11 +885,11 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o
870 870
  *              ET
871 871
  *          endstream
872 872
  *      endobj
873
- * 
873
+ *
874 874
  *      8 0 obj
875 875
  *          77 % The length of the preceding stream
876 876
  *      endobj
877
- * 
877
+ *
878 878
  * @param pdf       Pdf context structure.
879 879
  * @param obj       Pdf object context structure.
880 880
  * @param start     Pointer start of the dictionary string.
... ...
@@ -914,12 +929,12 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
914 914
     if (!obj_start)
915 915
         return 0;
916 916
 
917
-    if (bytes_remaining < obj_start - index) {
917
+    if (bytes_remaining < (size_t)(obj_start - index)) {
918 918
         return 0;
919 919
     }
920 920
     bytes_remaining -= obj_start - index;
921 921
     index = obj_start;
922
-    
922
+
923 923
     /* Read the value.  This could either be the direct length value,
924 924
        or the object id of the indirect object that has the length */
925 925
     if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
... ...
@@ -931,10 +946,10 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
931 931
     }
932 932
     length = (size_t)temp_long; /* length or maybe object id */
933 933
 
934
-    /* 
935
-     * Keep parsing, skipping past the first integer that might have been what we wanted. 
936
-     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' 
937
-     * I.e. something like " 0 R" 
934
+    /*
935
+     * Keep parsing, skipping past the first integer that might have been what we wanted.
936
+     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R'
937
+     * I.e. something like " 0 R"
938 938
      */
939 939
     while ((bytes_remaining > 0) && isdigit(*index)) {
940 940
         index++;
... ...
@@ -966,14 +981,14 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
966 966
         }
967 967
 
968 968
         if (index[0] == ' ' && index[1] == 'R') {
969
-            /* 
970
-             * Ok so we found a genid and that 'R'.  Which means that first value 
969
+            /*
970
+             * Ok so we found a genid and that 'R'.  Which means that first value
971 971
              * was actually the objid.
972 972
              * We can look up the indirect object using this information.
973 973
              */
974 974
             unsigned long objid = length;
975 975
             const char* indirect_obj_start = NULL;
976
-            
976
+
977 977
             cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid);
978 978
 
979 979
             obj = find_obj(pdf, obj, (length << 8) | (genid&0xff));
... ...
@@ -984,15 +999,15 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
984 984
 
985 985
             indirect_obj_start = pdf->map + obj->start;
986 986
             bytes_remaining = pdf->size - obj->start;
987
-            
987
+
988 988
             /* Ok so we found the indirect object, lets read the value. */
989 989
             index = pdf_nextobject(indirect_obj_start, bytes_remaining);
990 990
             if (!index) {
991 991
                 cli_dbgmsg("find_length: next object not found\n");
992 992
                 return 0;
993 993
             }
994
-            
995
-            if (bytes_remaining < index - indirect_obj_start) {
994
+
995
+            if (bytes_remaining < (size_t)(index - indirect_obj_start)) {
996 996
                 return 0;
997 997
             }
998 998
             bytes_remaining -= index - indirect_obj_start;
... ...
@@ -1010,7 +1025,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
1010 1010
     }
1011 1011
 
1012 1012
     /* limit length */
1013
-    if (obj_start - pdf->map + length + 5 > pdf->size)
1013
+    if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size)
1014 1014
         length = pdf->size - (obj_start - pdf->map) - 5;
1015 1015
 
1016 1016
     return length;
... ...
@@ -1018,102 +1033,6 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha
1018 1018
 
1019 1019
 #define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION))
1020 1020
 
1021
-static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary)
1022
-{
1023
-    if (0 == obj->size)
1024
-    {
1025
-        /*
1026
-         * Programmatically determine size if not already known.
1027
-         */
1028
-        unsigned i = 0;
1029
-
1030
-        /* Find the index of the current object */
1031
-        for (i = 0; i < pdf->nobjs; i++) {
1032
-            if (pdf->objs[i] == obj)
1033
-                break;
1034
-        }
1035
-
1036
-        /* Find the next object that exists in the same buffer (pdf fmap, or object stream) */
1037
-        if (i < pdf->nobjs) {
1038
-            i++;
1039
-        }
1040
-
1041
-        if (obj->objstm == NULL) {
1042
-            /* Current object isn't in an object stream, we want to find
1043
-             * the next object that also isn't in an object stream. */
1044
-            for ( ; i < pdf->nobjs; i++) {
1045
-                if (pdf->objs[i]->objstm == NULL)
1046
-                    break;
1047
-            }
1048
-        } else {
1049
-            /* Current object is in an object stream, we want to find
1050
-             * the next object that is in the same object stream.
1051
-             *
1052
-             * This really shouldn't happen, so throw a warning and
1053
-             * then see if we can solve it anyhow */
1054
-            cli_warnmsg("obj_size: Encountered pdf object in an object stream that has an unknown size!!\n");
1055
-
1056
-            for ( ; i < pdf->nobjs; i++) {
1057
-                if (pdf->objs[i]->objstm == obj->objstm)
1058
-                    break;
1059
-            }
1060
-        }
1061
-
1062
-        /* Step backwards from the "next" object to find the end of the current object */
1063
-        if (i < pdf->nobjs) {
1064
-            int s = pdf->objs[i]->start - obj->start - 4;
1065
-            if (s > 0) {
1066
-                if (!binary) {
1067
-                    const char *p = NULL;
1068
-                    const char *q = NULL;
1069
-
1070
-                    if (obj->objstm == NULL) {
1071
-                        p = pdf->map + obj->start;
1072
-                    } else {
1073
-                        p = obj->objstm->streambuf + obj->start;
1074
-                    }
1075
-                    q = p + s;
1076
-
1077
-                    while (q > p && (isspace(*q) || isdigit(*q)))
1078
-                        q--;
1079
-
1080
-                    if (q > p+5 && !memcmp(q-5,"endobj",6))
1081
-                        q -= 6;
1082
-
1083
-                    q = findNextNonWSBack(q, p);
1084
-                    q++;
1085
-
1086
-                    obj->size = q - p;
1087
-                    goto done;
1088
-                }
1089
-
1090
-                obj->size = s;
1091
-                goto done;
1092
-            }
1093
-        }
1094
-
1095
-        /* If we've gotten this far, we didn't find a "next" object... so our 
1096
-         * current object must be at the end of the pdf fmap or the end of the 
1097
-         * object stream. */
1098
-        if (obj->objstm == NULL) {
1099
-            /* Current object isn't in an object stream, so we can determine object 
1100
-             * size based on the remaining size of the file (in theory). */
1101
-            if (binary)
1102
-                obj->size = pdf->size - obj->start;
1103
-            else
1104
-                obj->size = pdf->offset - obj->start - 6; /* This hack I think assumes that we reached the end of the file when finding objects. */
1105
-        } else {
1106
-            /* Current object is in an object stream, we want to find 
1107
-             * the next object that is in the same object stream. */
1108
-            obj->size = obj->objstm->streambuf_len - obj->start;
1109
-        }
1110
-    }
1111
-
1112
-done:
1113
-
1114
-    return obj->size;
1115
-}
1116
-
1117 1021
 static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid)
1118 1022
 {
1119 1023
     int ret;
... ...
@@ -1482,330 +1401,328 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1482 1482
     if (!(flags & PDF_EXTRACT_OBJ_SCAN))
1483 1483
         obj->path = strdup(fullname);
1484 1484
 
1485
-    do {
1486
-        if (obj->flags & (1 << OBJ_STREAM)) {
1487
-            const char *start = pdf->map + obj->start;
1488
-            off_t p_stream = 0, p_endstream = 0;
1489
-            off_t length;
1485
+    if ((NULL == obj->objstm) &&
1486
+        (obj->flags & (1 << OBJ_STREAM))) {
1487
+        /*
1488
+         * Object contains a stream. Parse this now.
1489
+         */
1490
+        cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id>>8, obj->id&0xff);
1490 1491
 
1491
-            if (NULL != obj->objstm) {
1492
-                cli_warnmsg("pdf_extract_obj: Object found in object stream claims to be an object stream! Skipping.\n");
1493
-                break;
1492
+        const char *start = pdf->map + obj->start;
1493
+
1494
+        size_t length;
1495
+        size_t orig_length;
1496
+        int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */
1497
+
1498
+        const char *pstr;
1499
+        struct pdf_dict *dparams = NULL;
1500
+        struct objstm_struct *objstm = NULL;
1501
+        int xref = 0;
1502
+
1503
+        /* Find and interpret the length dictionary value */
1504
+        length = find_length(pdf, obj, start, dict_len);
1505
+        if (length < 0)
1506
+            length = 0;
1507
+
1508
+        orig_length = length;
1509
+
1510
+        if (length > obj->stream_size) {
1511
+            cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1512
+            noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1513
+
1514
+            length = obj->stream_size;
1515
+        }
1516
+
1517
+        if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length <= 0)) {
1518
+            /*
1519
+             * If the length is unknown and this doesn't contain a FLATE encoded filter...
1520
+             * Calculate the length using the stream size, and trimming
1521
+             * off any newline/carriage returns from the end of the stream.
1522
+             */
1523
+            const char *q = start + obj->stream_size;
1524
+            length = obj->stream_size;
1525
+            q--;
1526
+
1527
+            if (*q == '\n') {
1528
+                q--;
1529
+                length--;
1530
+
1531
+                if (*q == '\r')
1532
+                    length--;
1533
+            } else if (*q == '\r') {
1534
+                length--;
1494 1535
             }
1495 1536
 
1496
-            find_stream_bounds(start, pdf->size - obj->start,
1497
-                       pdf->size - obj->start,
1498
-                       &p_stream, &p_endstream,
1499
-                       pdf->enc_method_stream <= ENC_IDENTITY &&
1500
-                       pdf->enc_method_embeddedfile <= ENC_IDENTITY);
1501
-
1502
-            if (p_stream && p_endstream) {
1503
-                size_t size = p_endstream - p_stream;
1504
-                off_t orig_length;
1505
-                int len = p_stream;
1506
-                const char *pstr;
1507
-                struct pdf_dict *dparams = NULL;
1508
-                struct objstm_struct *objstm = NULL;
1509
-                int xref = 0;
1510
-
1511
-                length = find_length(pdf, obj, start, p_stream);
1512
-                if (length < 0)
1513
-                    length = 0;
1514
-
1515
-                orig_length = length;
1516
-                if (length > pdf->size || obj->start + p_stream + length > pdf->size) {
1517
-                    cli_dbgmsg("cli_pdf: length out of file: %lld + %lld > %lld\n",
1518
-                           (long long)p_stream, (long long)length, (long long)pdf->size);
1519
-                    noisy_warnmsg("length out of file, truncated: %lld + %lld > %lld\n",
1520
-                           (long long)p_stream, (long long)length, (long long)pdf->size);
1521
-                    length = pdf->size - (obj->start + p_stream);
1522
-                }
1537
+            if (length < 0)
1538
+                length = 0;
1523 1539
 
1524
-                if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) {
1525
-                    const char *q = start + p_endstream;
1526
-                    length = size;
1527
-                    q--;
1540
+            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1541
+        } else {
1542
+            if (obj->stream_size > (size_t)length + 2) {
1543
+                cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1544
+                            (size_t)length, obj->stream_size);
1545
+                length = obj->stream_size;
1546
+            }
1547
+        }
1528 1548
 
1529
-                    if (*q == '\n') {
1530
-                        q--;
1531
-                        length--;
1549
+        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
1550
+            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1551
+                        (long long)orig_length, (long long)length, obj->stream_size);
1552
+            pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1553
+        }
1532 1554
 
1533
-                        if (*q == '\r')
1534
-                            length--;
1535
-                    } else if (*q == '\r') {
1536
-                        length--;
1537
-                    }
1555
+        if (0 == length) {
1556
+            length = obj->stream_size;
1557
+            if (0 == length) {
1558
+                cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n");
1559
+                goto done; /* Empty stream, nothing to scan */
1560
+            }
1561
+        }
1538 1562
 
1539
-                    if (length < 0)
1540
-                        length = 0;
1563
+        /* Check if XRef is enabled */
1564
+        if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) {
1565
+            xref = 1;
1566
+        }
1541 1567
 
1542
-                    cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1543
-                } else {
1544
-                    if (size > (size_t)length+2) {
1545
-                        cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1546
-                                   (size_t)length, size);
1547
-                        length = size;
1548
-                    }
1549
-                }
1568
+        cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1550 1569
 
1551
-                if (orig_length && size > (size_t)orig_length + 20) {
1552
-                    cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1553
-                               (long long)orig_length, (long long)length, size);
1554
-                    pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1555
-                }
1570
+        /*
1571
+         * Identify the DecodeParms, if available.
1572
+         */
1573
+        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms")))
1574
+        {
1575
+            cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1576
+        }
1577
+        else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP")))
1578
+        {
1579
+            cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1580
+        }
1556 1581
 
1557
-                if (!length) {
1558
-                    length = size;
1559
-                    if (!length) {
1560
-                        cli_dbgmsg("pdf_extract_obj: length and size both 0\n");
1561
-                        break; /* Empty stream, nothing to scan */
1562
-                    }
1563
-                }
1582
+        if (pstr) {
1583
+            /* shift pstr left to "<<" for pdf_parse_dict */
1584
+            while ((*pstr == '<') && (pstr > start)) {
1585
+                pstr--;
1586
+                dict_len++;
1587
+            }
1564 1588
 
1565
-                if (cli_memstr(start, p_stream, "/XRef", 5))
1566
-                    xref = 1;
1589
+            /* shift pstr right to "<<" for pdf_parse_dict */
1590
+            while ((*pstr != '<') && (dict_len > 0)) {
1591
+                pstr++;
1592
+                dict_len--;
1593
+            }
1567 1594
 
1568
-                cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1595
+            if (dict_len > 4)
1596
+                dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL);
1597
+            else
1598
+                cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1599
+        }
1569 1600
 
1570
-                /*
1571
-                 * Identify the DecodeParms, if available.
1572
-                 */
1573
-                if (NULL != (pstr = pdf_getdict(start, &len, "/DecodeParms")))
1574
-                {
1575
-                    cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1601
+        /*
1602
+         * Go back to the start of the dictionary and check to see if the stream
1603
+         * is an object stream. If so, collect the relevant info.
1604
+         */
1605
+        dict_len = obj->stream - start;
1606
+        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm")))
1607
+        {
1608
+            int32_t objstm_first = -1;
1609
+            int32_t objstm_length = -1;
1610
+            int32_t objstm_n = -1;
1611
+
1612
+            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1613
+
1614
+            dict_len = obj->stream - start;
1615
+            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))))
1616
+            {
1617
+                cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1618
+            }
1619
+            else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length"))))
1620
+            {
1621
+                cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1622
+            }
1623
+            else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N"))))
1624
+            {
1625
+                cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1626
+            }
1627
+            else
1628
+            {
1629
+                /* Add objstm to pdf struct, so it can be freed eventually */
1630
+                pdf->nobjstms++;
1631
+                pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1632
+                if (!pdf->objstms) {
1633
+                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1634
+                    pdf_free_dict(dparams);
1635
+                    return CL_EMEM;
1576 1636
                 }
1577
-                else if (NULL != (pstr = pdf_getdict(start, &len, "/DP")))
1578
-                {
1579
-                    cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1637
+
1638
+                objstm = malloc(sizeof(struct objstm_struct));
1639
+                if (!objstm) {
1640
+                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1641
+                    pdf_free_dict(dparams);
1642
+                    return CL_EMEM;
1580 1643
                 }
1644
+                pdf->objstms[pdf->nobjstms-1] = objstm;
1581 1645
 
1582
-                if (pstr) {
1583
-                    unsigned int objsize = obj_size(pdf, obj, 1);
1646
+                memset(objstm, 0, sizeof(*objstm));
1584 1647
 
1585
-                    /* shift pstr left to "<<" for pdf_parse_dict */
1586
-                    while ((*pstr == '<') && (pstr > start)) {
1587
-                        pstr--;
1588
-                        len++;
1589
-                    }
1648
+                objstm->first =         (uint32_t)objstm_first;
1649
+                objstm->current =       (uint32_t)objstm_first;
1650
+                objstm->current_pair =  0;
1651
+                objstm->length =        (uint32_t)objstm_length;
1652
+                objstm->n =             (uint32_t)objstm_n;
1590 1653
 
1591
-                    /* shift pstr right to "<<" for pdf_parse_dict */
1592
-                    while ((*pstr != '<') && (len > 0)) {
1593
-                        pstr++;
1594
-                        len--;
1595
-                    }
1654
+                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1655
+                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1656
+                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1657
+            }
1658
+        }
1596 1659
 
1597
-                    if (len > 4)
1598
-                        dparams = pdf_parse_dict(pdf, obj, objsize, (char *)pstr, NULL);
1599
-                    else
1600
-                        cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1601
-                }
1660
+        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
1661
+        if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1662
+            cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1663
+
1664
+            /* It's ok if we couldn't decode the stream,
1665
+             *   make a best effort to keep parsing. */
1666
+            if (CL_EPARSE == rc)
1667
+                rc = CL_SUCCESS;
1602 1668
 
1669
+            if (NULL != objstm) {
1603 1670
                 /*
1604
-                 * Identify if the stream is an object stream. If so, collect the relevant info. 
1671
+                 * If we were expecting an objstm and there was a failure...
1672
+                 *   discard the memory for last object stream.
1605 1673
                  */
1606
-                len = p_stream;
1607
-                if (NULL != (pstr = pdf_getdict(start, &len, "/Type/ObjStm")))
1608
-                {
1609
-                    int32_t objstm_first = -1;
1610
-                    int32_t objstm_length = -1;
1611
-                    int32_t objstm_n = -1;
1612
-
1613
-                    cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1614
-
1615
-                    len = p_stream;
1616
-                    if ((-1 == (objstm_first = pdf_readint(start, len, "/First"))))
1617
-                    {
1618
-                        cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1619
-                    }
1620
-                    else if ((-1 == (objstm_length = pdf_readint(start, len, "/Length"))))
1621
-                    {
1622
-                        cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1623
-                    }
1624
-                    else if ((-1 == (objstm_n = pdf_readint(start, len, "/N"))))
1625
-                    {
1626
-                        cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1627
-                    }
1628
-                    else
1629
-                    {
1630
-                        /* Add objstm to pdf struct, so it can be freed eventually */
1631
-                        pdf->nobjstms++;
1632
-                        pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1633
-                        if (!pdf->objstms) {
1634
-                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1635
-                            pdf_free_dict(dparams);
1636
-                            return CL_EMEM;
1637
-                        }
1638
-
1639
-                        objstm = malloc(sizeof(struct objstm_struct));
1640
-                        if (!objstm) {
1641
-                            cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1642
-                            pdf_free_dict(dparams);
1643
-                            return CL_EMEM;
1674
+                if (NULL != pdf->objstms) {
1675
+                    if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1676
+                        if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
1677
+                            free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
1678
+                            pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1644 1679
                         }
1645
-                        pdf->objstms[pdf->nobjstms-1] = objstm;
1646
-
1647
-                        memset(objstm, 0, sizeof(*objstm));
1648
-
1649
-                        objstm->first =         (uint32_t)objstm_first;
1650
-                        objstm->current =       (uint32_t)objstm_first;
1651
-                        objstm->current_pair =  0;
1652
-                        objstm->length =        (uint32_t)objstm_length;
1653
-                        objstm->n =             (uint32_t)objstm_n;
1654
-
1655
-                        cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1656
-                        cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1657
-                        cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1680
+                        free(pdf->objstms[pdf->nobjstms - 1]);
1681
+                        pdf->objstms[pdf->nobjstms - 1] = NULL;
1658 1682
                     }
1659
-                }
1660 1683
 
1661
-                sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc, objstm);
1662
-                if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1663
-                    cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1664
-
1665
-                    /* It's ok if we couldn't decode the stream,
1666
-                     *   make a best effort to keep parsing. */
1667
-                    if (CL_EPARSE == rc)
1668
-                        rc = CL_SUCCESS;
1669
-
1670
-                    if (NULL != objstm) {
1671
-                        /*
1672
-                         * If we were expecting an objstm and there was a failure...
1673
-                         *   discard the memory for last object stream.
1674
-                         */
1675
-                        if (NULL != pdf->objstms) {
1676
-                            if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1677
-                                if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
1678
-                                    free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
1679
-                                    pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1680
-                                }
1681
-                                free(pdf->objstms[pdf->nobjstms - 1]);
1682
-                                pdf->objstms[pdf->nobjstms - 1] = NULL;
1683
-                            }
1684
+                    /* Pop the objstm off the end of the pdf->objstms array. */
1685
+                    if (pdf->nobjstms > 0) {
1686
+                        pdf->nobjstms--;
1687
+                        if (0 == pdf->nobjstms) {
1688
+                            free(pdf->objstms);
1689
+                            pdf->objstms = NULL;
1690
+                        } else {
1691
+                            pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1684 1692
 
1685
-                            /* Pop the objstm off the end of the pdf->objstms array. */
1686
-                            if (pdf->nobjstms > 0) {
1687
-                                pdf->nobjstms--;
1688
-                                if (0 == pdf->nobjstms) {
1689
-                                    free(pdf->objstms);
1690
-                                    pdf->objstms = NULL;
1691
-                                } else {
1692
-                                    pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms);
1693
-
1694
-                                    if (!pdf->objstms) {
1695
-                                        cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1696
-                                        return CL_EMEM;
1697
-                                    }
1698
-                                }
1699
-                            } else {
1700
-                                /* hm.. this shouldn't happen */
1701
-                                cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1693
+                            if (!pdf->objstms) {
1694
+                                cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1695
+                                return CL_EMEM;
1702 1696
                             }
1703 1697
                         }
1698
+                    } else {
1699
+                        /* hm.. this shouldn't happen */
1700
+                        cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1704 1701
                     }
1705 1702
                 }
1703
+            }
1704
+        }
1706 1705
 
1707
-                if (dparams)
1708
-                    pdf_free_dict(dparams);
1706
+        if (dparams)
1707
+            pdf_free_dict(dparams);
1709 1708
 
1710
-                if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) {
1711
-                    sum = 0; /* prevents post-filter scan */
1712
-                    break;
1713
-                }
1709
+        if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) {
1710
+            sum = 0; /* prevents post-filter scan */
1711
+            goto done;
1712
+        }
1714 1713
 
1715
-                cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1716
-            } else {
1717
-                noisy_warnmsg("pdf_extract_obj: cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff);
1718
-            }
1714
+        cli_dbgmsg("-------------EXPERIMENTAL-------------\n");
1719 1715
 
1720
-        } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1721
-            const char *q2;
1722
-            const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1723
-                                          : (const char *)(obj->start + pdf->map);
1716
+    } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1717
+        const char *q2;
1718
+        const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1719
+                                        : (const char *)(obj->start + pdf->map);
1724 1720
 
1725
-            /* TODO: get obj-endobj size */
1726
-            off_t bytesleft = obj_size(pdf, obj, 0);
1721
+        /* TODO: get obj-endobj size */
1722
+        off_t bytesleft = obj->size;
1727 1723
 
1728
-            if (bytesleft < 0)
1729
-                break;
1724
+        if (bytesleft < 0) {
1725
+            goto done;
1726
+        }
1730 1727
 
1731
-            do {
1732
-                char *js = NULL;
1733
-                size_t js_len = 0;
1734
-                const char *q3;
1728
+        do {
1729
+            char *js = NULL;
1730
+            size_t js_len = 0;
1731
+            const char *q3;
1735 1732
 
1736
-                q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1737
-                if (!q2)
1738
-                    break;
1733
+            q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1734
+            if (!q2)
1735
+                break;
1739 1736
 
1740
-                bytesleft -= q2 - q + 11;
1741
-                q = q2 + 11;
1737
+            bytesleft -= q2 - q + 11;
1738
+            q = q2 + 11;
1742 1739
 
1743
-                js = pdf_readstring(q, bytesleft,  "/JS", NULL, &q2, !(pdf->flags & (1<<DECRYPTABLE_PDF)));
1744
-                bytesleft -= q2 - q;
1745
-                q = q2;
1740
+            js = pdf_readstring(q, bytesleft,  "/JS", NULL, &q2, !(pdf->flags & (1<<DECRYPTABLE_PDF)));
1741
+            bytesleft -= q2 - q;
1742
+            q = q2;
1746 1743
 
1747
-                if (js) {
1748
-                    char *decrypted = NULL;
1749
-                    const char *out = js;
1750
-                    js_len = strlen(js);
1751
-                    if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1752
-                        cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1753
-                        decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1744
+            if (js) {
1745
+                char *decrypted = NULL;
1746
+                const char *out = js;
1747
+                js_len = strlen(js);
1748
+                if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1749
+                    cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1750
+                    decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1754 1751
 
1755
-                        if (decrypted) {
1756
-                            noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1757
-                            out = decrypted;
1758
-                        }
1752
+                    if (decrypted) {
1753
+                        noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1754
+                        out = decrypted;
1759 1755
                     }
1756
+                }
1760 1757
 
1761
-                    if (filter_writen(pdf, obj, fout, out, js_len, (size_t*)&sum) != js_len) {
1762
-                        rc = CL_EWRITE;
1763
-                                free(js);
1764
-                        break;
1765
-                    }
1758
+                if (filter_writen(pdf, obj, fout, out, js_len, (size_t*)&sum) != js_len) {
1759
+                    rc = CL_EWRITE;
1760
+                            free(js);
1761
+                    break;
1762
+                }
1766 1763
 
1767
-                    free(decrypted);
1768
-                    free(js);
1769
-                    cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1764
+                free(decrypted);
1765
+                free(js);
1766
+                cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1770 1767
 
1771
-                    if (bytesleft > 0) {
1772
-                        q2 = pdf_nextobject(q, bytesleft);
1773
-                        if (!q2)
1774
-                            q2 = q + bytesleft - 1;
1768
+                if (bytesleft > 0) {
1769
+                    q2 = pdf_nextobject(q, bytesleft);
1770
+                    if (!q2)
1771
+                        q2 = q + bytesleft - 1;
1775 1772
 
1776
-                        /* non-conforming PDFs that don't escape ) properly */
1777
-                        q3 = memchr(q, ')', bytesleft);
1778
-                        if (q3 && q3 < q2)
1779
-                            q2 = q3;
1773
+                    /* non-conforming PDFs that don't escape ) properly */
1774
+                    q3 = memchr(q, ')', bytesleft);
1775
+                    if (q3 && q3 < q2)
1776
+                        q2 = q3;
1780 1777
 
1781
-                        while (q2 > q && q2[-1] == ' ')
1782
-                            q2--;
1778
+                    while (q2 > q && q2[-1] == ' ')
1779
+                        q2--;
1783 1780
 
1784
-                        if (q2 > q) {
1785
-                            q--;
1786
-                            filter_writen(pdf, obj, fout, q, q2 - q, (size_t*)&sum);
1787
-                            q++;
1788
-                        }
1781
+                    if (q2 > q) {
1782
+                        q--;
1783
+                        filter_writen(pdf, obj, fout, q, q2 - q, (size_t*)&sum);
1784
+                        q++;
1789 1785
                     }
1790 1786
                 }
1787
+            }
1791 1788
 
1792
-            } while (bytesleft > 0);
1793
-        } else {
1794
-            off_t bytesleft = obj_size(pdf, obj, 0);
1795
-
1796
-            if (bytesleft < 0)
1797
-                rc = CL_EFORMAT;
1798
-            else {
1799
-                if (obj->objstm) {
1800
-                    if (filter_writen(pdf, obj, fout , obj->objstm->streambuf + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft)
1801
-                        rc = CL_EWRITE;
1802
-                } else {
1803
-                    if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft)
1804
-                        rc = CL_EWRITE;
1805
-                }
1789
+        } while (bytesleft > 0);
1790
+    } else {
1791
+        off_t bytesleft = obj->size;
1792
+
1793
+        if (bytesleft < 0)
1794
+            rc = CL_EFORMAT;
1795
+        else {
1796
+            if (obj->objstm) {
1797
+                if (filter_writen(pdf, obj, fout , obj->objstm->streambuf + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft)
1798
+                    rc = CL_EWRITE;
1799
+            } else {
1800
+                if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft)
1801
+                    rc = CL_EWRITE;
1806 1802
             }
1807 1803
         }
1808
-    } while (0);
1804
+    }
1805
+
1806
+done:
1809 1807
 
1810 1808
     cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id>>8, obj->id&0xff);
1811 1809
     cli_dbgmsg("pdf_extract_obj:         ... to %s\n", fullname);
... ...
@@ -2079,7 +1996,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
2079 2079
     }
2080 2080
     genid = (unsigned long)temp_long;
2081 2081
 
2082
-    objid |= genid & 0xff; 
2082
+    objid |= genid & 0xff;
2083 2083
     q2 = pdf_nextobject(q, len);
2084 2084
     if (!q2 || *q2 != 'R')
2085 2085
         return;
... ...
@@ -2116,7 +2033,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2116 2116
     const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
2117 2117
     const char *q = NULL;
2118 2118
     const char *dict = NULL, *enddict = NULL, *start = NULL;
2119
-    off_t dict_length = 0, full_dict_length = 0, objsize = 0, bytesleft = 0;
2119
+    off_t dict_length = 0, full_dict_length = 0, bytesleft = 0;
2120 2120
     size_t i = 0;
2121 2121
     unsigned filters = 0, blockopens = 0;
2122 2122
     enum objstate objstate = STATE_NONE;
... ...
@@ -2129,6 +2046,8 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2129 2129
         return;
2130 2130
     }
2131 2131
 
2132
+    cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff);
2133
+
2132 2134
     if (obj->objstm) {
2133 2135
         if ((size_t)obj->start > obj->objstm->streambuf_len) {
2134 2136
             cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n",
... ...
@@ -2146,14 +2065,38 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2146 2146
     }
2147 2147
     start = q;
2148 2148
 
2149
-    objsize = obj_size(pdf, obj, 1);
2150
-    if (objsize < 0)
2149
+    if (obj->size <= 0)
2151 2150
         return;
2152 2151
 
2153 2152
     if (obj->objstm) {
2154
-        bytesleft = MIN(objsize, obj->objstm->streambuf_len - obj->start);
2153
+        bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
2155 2154
     } else {
2156
-        bytesleft = MIN(objsize, pdf->size - obj->start);
2155
+        bytesleft = MIN(obj->size, pdf->size - obj->start);
2156
+    }
2157
+
2158
+    /* For objects that aren't already in an object stream^, check if they contain a stream.
2159
+     * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */
2160
+    if (NULL == obj->objstm) {
2161
+        /* Check if object contains stream */
2162
+        cl_error_t has_stream;
2163
+        const char* stream = NULL;
2164
+        size_t stream_size = 0;
2165
+
2166
+        has_stream = find_stream_bounds(
2167
+            start,
2168
+            obj->size,
2169
+            &stream,
2170
+            &stream_size,
2171
+            (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY));
2172
+
2173
+        if ((CL_SUCCESS == has_stream) ||
2174
+            (CL_EFORMAT == has_stream)) {
2175
+            /* Stream found. Store this fact and the stream bounds. */
2176
+            cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id>>8, obj->id&0xff, stream_size);
2177
+            obj->flags |= (1 << OBJ_STREAM);
2178
+            obj->stream = stream;
2179
+            obj->stream_size = stream_size;
2180
+        }
2157 2181
     }
2158 2182
 
2159 2183
     /* find start of dictionary */
... ...
@@ -2204,7 +2147,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2204 2204
     dict = q3+2;
2205 2205
     q = dict;
2206 2206
     blockopens++;
2207
-    bytesleft = objsize - (q - start);
2207
+    bytesleft = obj->size - (q - start);
2208 2208
     enddict = q + bytesleft - 1;
2209 2209
 
2210 2210
     /* find end of dictionary block */
... ...
@@ -2355,7 +2298,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2355 2355
             pdfobj_flag(pdf, obj, LINEARIZED_PDF);
2356 2356
             objstate = STATE_NONE;
2357 2357
             trailer_end = pdf_readint(dict, full_dict_length, "/H");
2358
-            if (trailer_end > 0 && trailer_end < pdf->size) {
2358
+            if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) {
2359 2359
                 trailer = trailer_end - 1024;
2360 2360
                 if (trailer < 0)
2361 2361
                     trailer = 0;
... ...
@@ -2967,7 +2910,7 @@ void pdf_handle_enc(struct pdf_struct *pdf)
2967 2967
         return;
2968 2968
     }
2969 2969
 
2970
-    len = obj_size(pdf, obj, 1);
2970
+    len = obj->size;
2971 2971
     q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
2972 2972
                       : (const char *)(obj->start + pdf->map);
2973 2973
 
... ...
@@ -3123,20 +3066,20 @@ void pdf_handle_enc(struct pdf_struct *pdf)
3123 3123
 }
3124 3124
 
3125 3125
 /**
3126
- * @brief Search pdf buffer for objects.  Parse each.  
3127
- * 
3126
+ * @brief Search pdf buffer for objects.  Parse each.
3127
+ *
3128 3128
  * Newly found objects will be extracted after completion, when the extraction `for` loop continues.
3129
- * 
3130
- * @param pdf           Pdf struct that keeps track of all information found in the PDF. 
3129
+ *
3130
+ * @param pdf           Pdf struct that keeps track of all information found in the PDF.
3131 3131
  * @param objstm        Pointer to an object stream to parse.
3132
- * 
3132
+ *
3133 3133
  * @return cl_error_t   Error code.
3134 3134
  */
3135 3135
 cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm)
3136 3136
 {
3137 3137
     cl_error_t status = CL_EFORMAT;
3138 3138
     cl_error_t retval = CL_EPARSE;
3139
-    int32_t foundobj = 0, alerts = 0;
3139
+    int32_t alerts = 0;
3140 3140
     uint32_t badobjects = 0;
3141 3141
     size_t i = 0;
3142 3142
 
... ...
@@ -3147,11 +3090,8 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
3147 3147
         goto done;
3148 3148
     }
3149 3149
 
3150
-    char* current_pair = objstm->streambuf;
3151
-    char* current_obj = objstm->streambuf + objstm->first;
3152
-
3153
-    if ((0 == objstm->first) || 
3154
-        (0 == objstm->streambuf_len) || 
3150
+    if ((0 == objstm->first) ||
3151
+        (0 == objstm->streambuf_len) ||
3155 3152
         (0 == objstm->n))
3156 3153
     {
3157 3154
         cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n");
... ...
@@ -3177,7 +3117,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
3177 3177
 
3178 3178
         /* Find object */
3179 3179
         retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
3180
-        
3180
+
3181 3181
         if (retval != CL_SUCCESS)
3182 3182
         {
3183 3183
             cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
... ...
@@ -3207,7 +3147,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs
3207 3207
         status = CL_EFORMAT;
3208 3208
         goto done;
3209 3209
     }
3210
-    
3210
+
3211 3211
     status = CL_SUCCESS;
3212 3212
 
3213 3213
 done:
... ...
@@ -3216,18 +3156,17 @@ done:
3216 3216
 
3217 3217
 /**
3218 3218
  * @brief Search pdf buffer for objects.  Parse each and then extract each.
3219
- * 
3219
+ *
3220 3220
  * @param pdf               Pdf struct that keeps track of all information found in the PDF.
3221 3221
  * @param alerts[in/out]    The number of alerts, relevant in ALLMATCH mode.
3222
- * 
3222
+ *
3223 3223
  * @return cl_error_t   Error code.
3224 3224
  */
3225 3225
 cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts)
3226 3226
 {
3227 3227
     cl_error_t status = CL_SUCCESS;
3228 3228
     int32_t rv = 0;
3229
-    int foundobj = 0;
3230
-    unsigned int i = 0, j = 0;
3229
+    unsigned int i = 0;
3231 3230
     uint32_t badobjects = 0;
3232 3231
     cli_ctx *ctx = pdf->ctx;
3233 3232
 
... ...
@@ -3269,7 +3208,7 @@ cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts)
3269 3269
          * This doesn't trigger for PDFs that are encrypted but don't need
3270 3270
          * a password to decrypt */
3271 3271
         status = cli_append_virus(pdf->ctx, "Heuristics.Encrypted.PDF");
3272
-        if (status == CL_VIRUS) { 
3272
+        if (status == CL_VIRUS) {
3273 3273
             alerts++;
3274 3274
             if (SCAN_ALLMATCHES)
3275 3275
                 status = CL_CLEAN;
... ...
@@ -3328,11 +3267,11 @@ done:
3328 3328
 
3329 3329
 /**
3330 3330
  * @brief Primary function for parsing and scanning a PDF.
3331
- * 
3331
+ *
3332 3332
  * @param dir       Filepath for temp file.
3333
- * @param ctx       clam scan context structure. 
3333
+ * @param ctx       clam scan context structure.
3334 3334
  * @param offset    offset of pdf in ctx->fmap
3335
- * 
3335
+ *
3336 3336
  * @return int      Returns cl_error_t status value.
3337 3337
  */
3338 3338
 int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
... ...
@@ -3532,7 +3471,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
3532 3532
     }
3533 3533
 
3534 3534
     /*
3535
-     * Find and extract all objects in the PDF. 
3535
+     * Find and extract all objects in the PDF.
3536 3536
      * New experimental recursive methodology that adds objects from object streams.
3537 3537
      */
3538 3538
     objs_found = pdf.nobjs;
... ...
@@ -3633,10 +3572,10 @@ done:
3633 3633
 
3634 3634
 /**
3635 3635
  * @brief   Skip the rest of the current line, and find the start of the next line.
3636
- * 
3636
+ *
3637 3637
  * @param ptr   Current offset into buffer.
3638
- * @param len   Remaining bytes in buffer. 
3639
- * 
3638
+ * @param len   Remaining bytes in buffer.
3639
+ *
3640 3640
  * @return const char*  Address of next line, or NULL if no next line in buffer.
3641 3641
  */
3642 3642
 static const char *
... ...
@@ -3666,13 +3605,13 @@ pdf_nextlinestart(const char *ptr, size_t len)
3666 3666
 
3667 3667
 /**
3668 3668
  * @brief   Return the start of the next PDF object.
3669
- * 
3669
+ *
3670 3670
  * This assumes that we're not in a stream.
3671
- * 
3671
+ *
3672 3672
  * @param ptr   Current offset into buffer.
3673
- * @param len   Remaining bytes in buffer. 
3674
- * 
3675
- * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer. 
3673
+ * @param len   Remaining bytes in buffer.
3674
+ *
3675
+ * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
3676 3676
  */
3677 3677
 static const char *
3678 3678
 pdf_nextobject(const char *ptr, size_t len)
... ...
@@ -4015,7 +3954,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
4015 4015
         pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry));
4016 4016
         if (!(pdf->stats.author))
4017 4017
             return;
4018
-        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta));
4018
+        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta));
4019 4019
     }
4020 4020
 }
4021 4021
 #endif
... ...
@@ -4040,7 +3979,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
4040 4040
         pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry));
4041 4041
         if (!(pdf->stats.creator))
4042 4042
             return;
4043
-        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta));
4043
+        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta));
4044 4044
     }
4045 4045
 }
4046 4046
 #endif
... ...
@@ -4065,7 +4004,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
4065 4065
         pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
4066 4066
         if (!(pdf->stats.modificationdate))
4067 4067
             return;
4068
-        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
4068
+        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
4069 4069
     }
4070 4070
 }
4071 4071
 #endif
... ...
@@ -4090,7 +4029,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
4090 4090
         pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
4091 4091
         if (!(pdf->stats.creationdate))
4092 4092
             return;
4093
-        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
4093
+        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
4094 4094
     }
4095 4095
 }
4096 4096
 #endif
... ...
@@ -4115,7 +4054,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
4115 4115
         pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry));
4116 4116
         if (!(pdf->stats.producer))
4117 4117
             return;
4118
-        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta));
4118
+        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta));
4119 4119
     }
4120 4120
 }
4121 4121
 #endif
... ...
@@ -4140,7 +4079,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4140 4140
         pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry));
4141 4141
         if (!(pdf->stats.title))
4142 4142
             return;
4143
-        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta));
4143
+        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta));
4144 4144
     }
4145 4145
 }
4146 4146
 #endif
... ...
@@ -4165,7 +4104,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
4165 4165
         pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry));
4166 4166
         if (!(pdf->stats.keywords))
4167 4167
             return;
4168
-        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta));
4168
+        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta));
4169 4169
     }
4170 4170
 }
4171 4171
 #endif
... ...
@@ -4190,7 +4129,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
4190 4190
         pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry));
4191 4191
         if (!(pdf->stats.subject))
4192 4192
             return;
4193
-        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta));
4193
+        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta));
4194 4194
     }
4195 4195
 }
4196 4196
 #endif
... ...
@@ -4242,7 +4181,6 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4242 4242
     const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4243 4243
                                          : (const char *)(obj->start + pdf->map);
4244 4244
     const char *begin;
4245
-    unsigned int objsize;
4246 4245
     unsigned long npages=0, count;
4247 4246
     long temp_long;
4248 4247
     struct pdf_array_node *node;
... ...
@@ -4257,19 +4195,17 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4257 4257
     if (!(SCAN_COLLECT_METADATA))
4258 4258
         return;
4259 4259
 
4260
-    objsize = obj_size(pdf, obj, 1);
4261
-
4262 4260
     pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4263 4261
     if (!(pdfobj))
4264 4262
         return;
4265 4263
 
4266
-    begin = cli_memstr(objstart, objsize, "/Kids", 5);
4264
+    begin = cli_memstr(objstart, obj->size, "/Kids", 5);
4267 4265
     if (!(begin))
4268 4266
         return;
4269 4267
 
4270 4268
     begin += 5;
4271 4269
 
4272
-    array = pdf_parse_array(pdf, obj, objsize, (char *)begin, NULL);
4270
+    array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL);
4273 4271
     if (!(array)) {
4274 4272
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4275 4273
         return;
... ...
@@ -4280,22 +4216,22 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
4280 4280
             if (strchr((char *)(node->data), 'R'))
4281 4281
                 npages++;
4282 4282
 
4283
-    begin = cli_memstr(objstart, objsize, "/Count", 6);
4283
+    begin = cli_memstr(objstart, obj->size, "/Count", 6);
4284 4284
     if (!(begin)) {
4285 4285
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4286 4286
         goto cleanup;
4287 4287
     }
4288 4288
 
4289 4289
     begin += 6;
4290
-    while (begin - objstart <  objsize && isspace(begin[0]))
4290
+    while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0]))
4291 4291
         begin++;
4292 4292
 
4293
-    if (begin - objstart >= objsize) {
4293
+    if ((size_t)(begin - objstart) >= obj->size) {
4294 4294
         goto cleanup;
4295 4295
     }
4296 4296
 
4297
-    countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + objsize - begin)
4298
-                              : (size_t)(obj->start + pdf->map + objsize - begin);
4297
+    countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin)
4298
+                              : (size_t)(obj->start + pdf->map + obj->size - begin);
4299 4299
 
4300 4300
     if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) {
4301 4301
         cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
... ...
@@ -4323,7 +4259,6 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
4323 4323
     char *p1;
4324 4324
     const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4325 4325
                                          : (const char *)(obj->start + pdf->map);
4326
-    size_t objsize;
4327 4326
 
4328 4327
     UNUSEDPARAM(act);
4329 4328
 
... ...
@@ -4333,25 +4268,23 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
4333 4333
     if (!(SCAN_COLLECT_METADATA))
4334 4334
         return;
4335 4335
 
4336
-    objsize = obj_size(pdf, obj, 1);
4337
-
4338
-    p1 = (char *)cli_memstr(objstart, objsize, "/Colors", 7);
4336
+    p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7);
4339 4337
     if (!(p1))
4340 4338
         return;
4341 4339
 
4342 4340
     p1 += 7;
4343 4341
 
4344 4342
     /* Ensure that we have at least one whitespace character plus at least one number */
4345
-    if (objsize - (p1 - objstart) < 2)
4343
+    if (obj->size - (size_t)(p1 - objstart) < 2)
4346 4344
         return;
4347 4345
 
4348
-    while (p1 - objstart < objsize && isspace(p1[0]))
4346
+    while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0]))
4349 4347
         p1++;
4350 4348
 
4351
-    if ((size_t)(p1 - objstart) == objsize)
4349
+    if ((size_t)(p1 - objstart) == obj->size)
4352 4350
         return;
4353 4351
 
4354
-    if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - objsize), 0, 10, &temp_long)) {
4352
+    if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) {
4355 4353
         return;
4356 4354
     } else if (temp_long < 0) {
4357 4355
         return;
... ...
@@ -37,12 +37,14 @@ struct objstm_struct {
37 37
 
38 38
 struct pdf_obj {
39 39
     uint32_t start;
40
-    int32_t size;
40
+    size_t size;
41 41
     uint32_t id;
42 42
     uint32_t flags;
43 43
     uint32_t statsflags;
44 44
     uint32_t numfilters;
45 45
     uint32_t filterlist[PDF_FILTERLIST_MAX];
46
+    const char *stream;     // pointer to stream contained in object.
47
+    size_t stream_size;      // size of stream contained in object.
46 48
     struct objstm_struct *objstm;  // Should be NULL unless the obj exists in an object stream (separate buffer)
47 49
     char *path;
48 50
 };
... ...
@@ -146,7 +148,7 @@ struct pdf_struct {
146 146
     const char *CF;
147 147
     long CF_n;
148 148
     const char *map;
149
-    off_t size;
149
+    size_t size;
150 150
     off_t offset;
151 151
     off_t startoff;
152 152
     cli_ctx *ctx;
... ...
@@ -400,9 +400,9 @@ char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *out
400 400
     return output;
401 401
 }
402 402
 
403
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns)
403
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns)
404 404
 {
405
-	unsigned int i, s1, s2;
405
+	size_t i, s1, s2;
406 406
 
407 407
     if(!hs || !ns || hs < ns)
408 408
 	return NULL;
... ...
@@ -68,7 +68,7 @@ int cli_xtoi(const char *hex);
68 68
 char *cli_str2hex(const char *string, unsigned int len);
69 69
 char *cli_utf16toascii(const char *str, unsigned int length);
70 70
 char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output);
71
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns);
71
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns);
72 72
 char *cli_strrcpy(char *dest, const char *source);
73 73
 size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
74 74
 size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens, int token_skip);