Browse code

Adjust the pdf_findobj logic so that the parser can skip ahead and continue when it finds objects that cannot be parsed and may not, in fact, be objects at all.

Micah Snyder authored on 2018/06/09 22:42:57
Showing 1 changed file
... ...
@@ -201,7 +201,15 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t byteslef
201 201
     return 0;
202 202
 }
203 203
 
204
-/* Expected returns: 1 if success, 0 if no more objects, -1 if error */
204
+/**
205
+ * @brief  Finds the next obj and adds it to our list of objects, and increments nobjs.
206
+ *
207
+ * @param pdf   PDF structure
208
+ * @return int  -1 if error
209
+ * @return int  0 if no more objects
210
+ * @return int  1 if success
211
+ * @return int  2 if an invalid object was discovered, may be skipped.
212
+ */
205 213
 int pdf_findobj(struct pdf_struct *pdf)
206 214
 {
207 215
     const char *start, *q, *q2, *q3, *eof;
... ...
@@ -245,7 +253,9 @@ int pdf_findobj(struct pdf_struct *pdf)
245 245
 
246 246
     if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2-q)), 0, 10, (long*)&genid)) {
247 247
         cli_dbgmsg("cli_pdf: Failed to parse object genid (%u)\n", pdf->nobjs);
248
-        return -1;
248
+        /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
249
+        pdf->offset = q2 + 4 - pdf->map;
250
+        return 2;
249 251
     }
250 252
     q = findNextNonWSBack(q-1,start);
251 253
     while (q > start && isdigit(*q))
... ...
@@ -260,9 +270,11 @@ int pdf_findobj(struct pdf_struct *pdf)
260 260
         if (q - 4 > start) {
261 261
             const char* lastfile = q - 4;
262 262
             if (0 != strncmp(lastfile, "\%\%EOF", 5)) {
263
-                /* Nope, wasn't %%EOF, I guess just fail out. */
263
+                /* Nope, wasn't %%EOF */
264 264
                 cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
265
-                return -1;
265
+                /* Skip past the "obj" thing, and continue. */
266
+                pdf->offset = q2 + 4 - pdf->map;
267
+                return 2;
266 268
             }
267 269
             /* Yup, looks like the file continues after %%EOF.
268 270
              * Probably another revision.  Keep parsing... */
... ...
@@ -271,12 +283,16 @@ int pdf_findobj(struct pdf_struct *pdf)
271 271
         } else {
272 272
             /* Failed parsing at the very beginning */
273 273
             cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
274
-            return -1;
274
+            /* Probably not a real object.  Skip past the "obj" thing, and continue. */
275
+            pdf->offset = q2 + 4 - pdf->map;
276
+            return 2;
275 277
         }
276 278
         /* Try again, with offset slightly adjusted */
277 279
         if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2-q)), 0, 10, (long*)&objid)) {
278 280
             cli_dbgmsg("cli_pdf: Failed to parse object objid (%u)\n", pdf->nobjs);
279
-            return -1;
281
+            /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
282
+            pdf->offset = q2 + 4 - pdf->map;
283
+            return 2;
280 284
         }
281 285
         cli_dbgmsg("cli_pdf: There appears to be an additional revision. Continuing to parse...\n");
282 286
     }
... ...
@@ -2540,9 +2556,19 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2540 2540
 
2541 2541
     /* parse PDF and find obj offsets */
2542 2542
     while ((rc = pdf_findobj(&pdf)) > 0) {
2543
-        struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
2543
+        if (rc == 1) {
2544
+            struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
2544 2545
 
2545
-        cli_dbgmsg("cli_pdf: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + offset));
2546
+            cli_dbgmsg("cli_pdf: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + offset));
2547
+        }
2548
+        else if (rc == 2) {
2549
+            pdf.nobjs--;
2550
+            cli_dbgmsg("cli_pdf: Failed to parse object, likely an oversight in parser design.\n");
2551
+        }
2552
+        else {
2553
+            pdf.nobjs--;
2554
+            cli_dbgmsg("cli_pdf: unexpected return code %d.\n", rc);
2555
+        }
2546 2556
     }
2547 2557
 
2548 2558
     if (pdf.nobjs)