Browse code

Improve handling of pdf streams.

Also dump undecompressable streams, since we are not decrypting.

Török Edvin authored on 2010/07/30 22:54:53
Showing 1 changed file
... ...
@@ -69,6 +69,7 @@ enum pdf_flag {
69 69
     BAD_PDF_TOOMANYOBJS,
70 70
     BAD_STREAM_FILTERS,
71 71
     BAD_FLATE,
72
+    BAD_FLATESTART,
72 73
     BAD_STREAMSTART,
73 74
     BAD_ASCIIDECODE,
74 75
     BAD_INDOBJ,
... ...
@@ -77,7 +78,8 @@ enum pdf_flag {
77 77
     HEX_JAVASCRIPT,
78 78
     UNKNOWN_FILTER,
79 79
     HAS_OPENACTION,
80
-    BAD_STREAMLEN
80
+    BAD_STREAMLEN,
81
+    ENCRYPTED_PDF
81 82
 };
82 83
 
83 84
 static int xrefCheck(const char *xref, const char *eof)
... ...
@@ -117,7 +119,8 @@ enum objflags {
117 117
     OBJ_FILTER_CRYPT,
118 118
     OBJ_JAVASCRIPT,
119 119
     OBJ_OPENACTION,
120
-    OBJ_HASFILTERS
120
+    OBJ_HASFILTERS,
121
+    OBJ_SIGNED
121 122
 };
122 123
 
123 124
 struct pdf_obj {
... ...
@@ -264,6 +267,9 @@ static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_fl
264 264
 	case BAD_FLATE:
265 265
 	    s = "bad deflate stream";
266 266
 	    break;
267
+	case BAD_FLATESTART:
268
+	    s = "bad deflate stream start";
269
+	    break;
267 270
 	case BAD_STREAMSTART:
268 271
 	    s = "bad stream start";
269 272
 	    break;
... ...
@@ -285,8 +291,11 @@ static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_fl
285 285
 	case BAD_STREAMLEN:
286 286
 	    s = "bad /Length, too small";
287 287
 	    break;
288
+	case ENCRYPTED_PDF:
289
+	    s = "PDF is encrypted";
290
+	    break;
288 291
     }
289
-    cli_dbgmsg("cli_pdf: %s in object %u %u\n", s, obj->id>>8, obj->id&0xff);
292
+    cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff);
290 293
 }
291 294
 
292 295
 static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj,
... ...
@@ -351,8 +360,18 @@ static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj,
351 351
 		else
352 352
 		    cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
353 353
 			       (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff);
354
-		pdfobj_flag(pdf, obj, BAD_FLATE);
354
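+		/* if nothing was inflated, dump the raw stream and flag BAD_FLATESTART (that flag is cleared later for encrypted PDFs); otherwise flag BAD_FLATE */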
355 355
 		inflateEnd(&stream);
356
+		if (!nbytes) {
357
+		    cli_dbgmsg("cli_pdf: dumping raw stream (probably encrypted)\n");
358
+		    if (filter_writen(pdf, obj, fout, buf, len, sum) != len) {
359
+			cli_errmsg("cli_pdf: failed to write output file\n");
360
+			return CL_EWRITE;
361
+		    }
362
+		    pdfobj_flag(pdf, obj, BAD_FLATESTART);
363
+		} else {
364
+		    pdfobj_flag(pdf, obj, BAD_FLATE);
365
+		}
356 366
 		return CL_CLEAN;
357 367
 	}
358 368
 	break;
... ...
@@ -662,6 +681,8 @@ static struct pdfname_action pdfname_actions[] = {
662 662
     {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
663 663
     {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
664 664
     {"Crypt",  OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
665
+    {"Standard", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_FILTER},
666
+    {"Sig",    OBJ_SIGNED, STATE_NONE, STATE_NONE},
665 667
     {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER},
666 668
     {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT},
667 669
     {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
... ...
@@ -686,6 +707,9 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
686 686
     }
687 687
     if (!act) {
688 688
 	if (*state == STATE_FILTER &&
689
+	    !(obj->flags & (1 << OBJ_SIGNED)) &&
690
+	    /* these are digital signature objects, filter doesn't matter,
691
+	     * we don't need them anyway */
689 692
 	    !(obj->flags & KNOWN_FILTERS)) {
690 693
 	    cli_dbgmsg("cli_pdf: unknown filter %s\n", pdfname);
691 694
 	    pdfobj_flag(pdf, obj, UNKNOWN_FILTER);
... ...
@@ -784,7 +808,7 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
784 784
 		escapes = 1;
785 785
 		continue;
786 786
 	    }
787
-	    if (*q == ' ' || *q == '\r' || *q == '\n' || *q == '/')
787
+	    if (*q == ' ' || *q == '\r' || *q == '\n' || *q == '/' || *q == '>' || *q == ']')
788 788
 		break;
789 789
 	    pdfname[i] = *q;
790 790
 	}
... ...
@@ -888,6 +912,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
888 888
 	pdf.flags |= 1 << BAD_PDF_TRAILER;
889 889
 	cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
890 890
     } else {
891
+	const char *t;
891 892
 	size = q - eofmap + map_off;
892 893
 	for (;q > eofmap;q--) {
893 894
 	    if (memcmp(q, "startxref", 9) == 0)
... ...
@@ -896,17 +921,28 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
896 896
 	if (q <= eofmap) {
897 897
 	    pdf.flags |= 1 << BAD_PDF_TRAILER;
898 898
 	    cli_dbgmsg("cli_pdf: startxref not found\n");
899
-	}
900
-	q += 9;
901
-	while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
902
-	xref = atol(q);
903
-	bytesleft = map->len - offset - xref;
904
-	if (bytesleft > 4096)
905
-	    bytesleft = 4096;
906
-	q = fmap_need_off_once(map, offset + xref, bytesleft);
907
-	if (!q || xrefCheck(q, q+bytesleft) == -1) {
908
-	    cli_dbgmsg("cli_pdf: did not find valid xref\n");
909
-	    pdf.flags |= 1 << BAD_PDF_TRAILER;
899
+	} else {
900
+	    for (t=q;t > eofmap; t--) {
901
+		if (memcmp(t,"trailer",7) == 0)
902
+		    break;
903
+	    }
904
+	    if (t > eofmap) {
905
+		if (cli_memstr(t, q-t, "/Encrypt", 8)) {
906
+		    pdf.flags |= 1 << ENCRYPTED_PDF;
907
+		    cli_dbgmsg("cli_pdf: encrypted pdf found, stream will probably fail to decompress!\n");
908
+		}
909
+	    }
910
+	    q += 9;
911
+	    while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
912
+	    xref = atol(q);
913
+	    bytesleft = map->len - offset - xref;
914
+	    if (bytesleft > 4096)
915
+		bytesleft = 4096;
916
+	    q = fmap_need_off_once(map, offset + xref, bytesleft);
917
+	    if (!q || xrefCheck(q, q+bytesleft) == -1) {
918
+		cli_dbgmsg("cli_pdf: did not find valid xref\n");
919
+		pdf.flags |= 1 << BAD_PDF_TRAILER;
920
+	    }
910 921
 	}
911 922
     }
912 923
     size -= offset;
... ...
@@ -941,6 +977,9 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
941 941
 	    break;
942 942
     }
943 943
 
944
+    if (pdf.flags & (1 << ENCRYPTED_PDF))
945
+	pdf.flags &= ~ (1 << BAD_FLATESTART);
946
+
944 947
     if (pdf.flags) {
945 948
 	cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
946 949
 	if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {