Browse code

pdf: improve handling of truncated files, and fix some filter handling bugs.

Also don't dump images by default; this will be overridable from bytecode.

Török Edvin authored on 2010/08/02 04:14:44
Showing 1 changed file
... ...
@@ -77,6 +77,7 @@ enum pdf_flag {
77 77
     ESCAPED_COMMON_PDFNAME,
78 78
     HEX_JAVASCRIPT,
79 79
     UNKNOWN_FILTER,
80
+    MANY_FILTERS,
80 81
     HAS_OPENACTION,
81 82
     BAD_STREAMLEN,
82 83
     ENCRYPTED_PDF,
... ...
@@ -122,7 +123,9 @@ enum objflags {
122 122
     OBJ_JAVASCRIPT,
123 123
     OBJ_OPENACTION,
124 124
     OBJ_HASFILTERS,
125
-    OBJ_SIGNED
125
+    OBJ_SIGNED,
126
+    OBJ_IMAGE,
127
+    OBJ_TRUNCATED
126 128
 };
127 129
 
128 130
 struct pdf_obj {
... ...
@@ -166,7 +169,7 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t byteslef
166 166
 	q = q2;
167 167
 	q2 = cli_memstr(q, bytesleft2, "endstream", 9);
168 168
 	if (!q2)
169
-	    return 0;/* no more objs */
169
+	    q2 = q + bytesleft2-9; /* till EOF */
170 170
 	*endstream = q2 - start;
171 171
 	return 1;
172 172
     }
... ...
@@ -210,12 +213,17 @@ static int pdf_findobj(struct pdf_struct *pdf)
210 210
 	off_t p_stream, p_endstream;
211 211
 	q2 = pdf_nextobject(q, bytesleft);
212 212
 	if (!q2)
213
-	    return 0;/* no more objs */
213
+	    q2 = pdf->map + pdf->size;
214 214
 	bytesleft -= q2 - q;
215 215
 	if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream)) {
216 216
 	    obj->flags |= 1 << OBJ_STREAM;
217
-	    q2 = q-1 + p_endstream + 6;
217
+	    q2 = q-1 + p_endstream + 9;
218 218
 	    bytesleft -= q2 - q + 1;
219
+	    if (bytesleft < 0) {
220
+		obj->flags |= 1 << OBJ_TRUNCATED;
221
+		pdf->offset = pdf->size;
222
+		return 1;/* truncated */
223
+	    }
219 224
 	} else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
220 225
 	    q2 = q3 + 6;
221 226
 	    pdf->offset = q2 - pdf->map;
... ...
@@ -226,7 +234,9 @@ static int pdf_findobj(struct pdf_struct *pdf)
226 226
 	}
227 227
 	q = q2;
228 228
     }
229
-    return 0;/* no more objs */
229
+    obj->flags |= 1 << OBJ_TRUNCATED;
230
+    pdf->offset = pdf->size;
231
+    return 1;/* truncated */
230 232
 }
231 233
 
232 234
 static int filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj,
... ...
@@ -300,6 +310,9 @@ static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_fl
300 300
 	case LINEARIZED_PDF:
301 301
 	    s = "linearized PDF";
302 302
 	    break;
303
+	case MANY_FILTERS:
304
+	    s = "more than 2 filters per obj";
305
+	    break;
303 306
     }
304 307
     cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff);
305 308
 }
... ...
@@ -497,12 +510,19 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
497 497
     int rc = CL_SUCCESS;
498 498
     char *ascii_decoded = NULL;
499 499
 
500
+    /* TODO: call bytecode hook here, allow override dumpability */
500 501
     if ((!(obj->flags & (1 << OBJ_STREAM)) ||
501 502
 	(obj->flags & (1 << OBJ_HASFILTERS)))
502 503
 	&& !(obj->flags & DUMP_MASK)) {
503 504
 	/* don't dump all streams */
504 505
 	return CL_CLEAN;
505 506
     }
507
+#if 1
508
+    if (obj->flags & (1 << OBJ_IMAGE)) {
509
+	/* don't dump / scan images */
510
+	return CL_CLEAN;
511
+    }
512
+#endif
506 513
     snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
507 514
     fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
508 515
     if (fout < 0) {
... ...
@@ -531,6 +551,11 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
531 531
 	    if (length < 0)
532 532
 		length = 0;
533 533
 	    orig_length = length;
534
+	    if (length > pdf->size || obj->start + p_stream + length > pdf->size) {
535
+		cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n",
536
+			   p_stream, length, pdf->size);
537
+		length = pdf->size - (obj->start + p_stream);
538
+	    }
534 539
 	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) {
535 540
 		const char *q = start + p_endstream;
536 541
 		length = size;
... ...
@@ -553,8 +578,11 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
553 553
 		    length = size;
554 554
 		}
555 555
 	    }
556
-	    if (orig_length && size > orig_length + 20)
556
+	    if (orig_length && size > orig_length + 20) {
557
+		cli_dbgmsg("cli_pdf: orig length: %ld, length: %ld, size: %ld\n",
558
+			   orig_length, length, size);
557 559
 		pdfobj_flag(pdf, obj, BAD_STREAMLEN);
560
+	    }
558 561
 	    if (!length)
559 562
 		length = size;
560 563
 
... ...
@@ -580,10 +608,14 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
580 580
 						   (unsigned char*)ascii_decoded);
581 581
 	    }
582 582
 	    if (ascii_decoded_size < 0) {
583
-		pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);
583
+		/* don't flag for images or truncated objs*/
584
+		if (!(obj->flags &
585
+		      ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED))))
586
+		    pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);
584 587
 		cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff);
585
-		rc = CL_CLEAN;
586
-		break;
588
+		free(ascii_decoded);
589
+		ascii_decoded = NULL;
590
+		/* attempt to directly flatedecode it */
587 591
 	    }
588 592
 	    /* either direct or ascii-decoded input */
589 593
 	    if (!ascii_decoded)
... ...
@@ -591,7 +623,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
591 591
 	    flate_in = ascii_decoded ? ascii_decoded : start+p_stream;
592 592
 
593 593
 	    if (obj->flags & (1 << OBJ_FILTER_FLATE)) {
594
-		cli_dbgmsg("cli_pdf: deflate len %d (orig %d)\n", ascii_decoded_size, orig_length);
594
+		cli_dbgmsg("cli_pdf: deflate len %ld (orig %ld)\n", ascii_decoded_size, (long)orig_length);
595 595
 		rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum);
596 596
 	    } else {
597 597
 		if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size)
... ...
@@ -686,10 +718,11 @@ static struct pdfname_action pdfname_actions[] = {
686 686
     {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
687 687
     {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
688 688
     {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
689
-    {"AHx", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
689
+    {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
690 690
     {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE},
691 691
     {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
692 692
     {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
693
+    {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE},
693 694
     {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
694 695
     {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
695 696
     {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
... ...
@@ -704,6 +737,7 @@ static struct pdfname_action pdfname_actions[] = {
704 704
     {"Standard", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_FILTER},
705 705
     {"Sig",    OBJ_SIGNED, STATE_ANY, STATE_NONE},
706 706
     {"V",     OBJ_SIGNED, STATE_ANY, STATE_NONE},
707
+    {"R",     OBJ_SIGNED, STATE_ANY, STATE_NONE},
707 708
     {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED},
708 709
     {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER},
709 710
     {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT},
... ...
@@ -751,6 +785,7 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
751 751
 	if (*state == STATE_FILTER &&
752 752
 	    act->set_objflag !=OBJ_DICT &&
753 753
 	    (obj->flags & (1 << act->set_objflag))) {
754
+	    cli_dbgmsg("cli_pdf: duplicate stream filter %s\n", pdfname);
754 755
 	    pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
755 756
 	}
756 757
 	obj->flags |= 1 << act->set_objflag;
... ...
@@ -774,8 +809,8 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
774 774
     const char *q = obj->start + pdf->map;
775 775
     const char *dict, *start;
776 776
     off_t dict_length;
777
-    off_t bytesleft = pdf->offset - obj->start;
778
-    unsigned i;
777
+    off_t bytesleft = obj_size(pdf, obj, 1);
778
+    unsigned i, filters=0;
779 779
     enum objstate objstate = STATE_NONE;
780 780
 
781 781
     if (bytesleft < 0)
... ...
@@ -795,7 +830,7 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
795 795
     } while (!q3 || q3[1] != '<');
796 796
     dict = q3+2;
797 797
     q = dict;
798
-    bytesleft = pdf->offset - obj->start - (q3 - start);
798
+    bytesleft = obj_size(pdf, obj, 1) - (q3 - start);
799 799
     /* find end of dictionary */
800 800
     do {
801 801
 	q2 = pdf_nextobject(q, bytesleft);
... ...
@@ -830,7 +865,8 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
830 830
 		escapes = 1;
831 831
 		continue;
832 832
 	    }
833
-	    if (*q == ' ' || *q == '\r' || *q == '\n' || *q == '/' || *q == '>' || *q == ']')
833
+	    if (*q == ' ' || *q == '\t' || *q == '\r' || *q == '\n' ||
834
+		*q == '/' || *q == '>' || *q == ']')
834 835
 		break;
835 836
 	    pdfname[i] = *q;
836 837
 	}
... ...
@@ -873,6 +909,18 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
873 873
 	    objstate = STATE_NONE;
874 874
 	}
875 875
     }
876
+    for (i=0;i<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);i++) {
877
+	const struct pdfname_action *act = &pdfname_actions[i];
878
+	if ((obj->flags & (1 << act->set_objflag)) &&
879
+	    act->from_state == STATE_FILTER &&
880
+	    act->to_state == STATE_FILTER &&
881
+	    act->set_objflag != OBJ_FILTER_CRYPT) {
882
+	    filters++;
883
+	}
884
+    }
885
+    if (filters > 2) { /* more than 2 non-crypt filters */
886
+	pdfobj_flag(pdf, obj, MANY_FILTERS);
887
+    }
876 888
     if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS))
877 889
 	obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN);
878 890
     if (obj->flags & (1 << OBJ_FILTER_UNKNOWN))
... ...
@@ -1013,15 +1061,12 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
1013 1013
 
1014 1014
     if (pdf.flags) {
1015 1015
 	cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
1016
-	if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
1017
-	    /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
1018
-	    *ctx->virname = "Heuristics.PDF.ObfuscatedNameObject";
1019
-	    rc = CL_VIRUS;
1020
-	}
1021 1016
 #if 0
1022 1017
 	/* TODO: find both trailers, and /Encrypt settings */
1023 1018
 	if (pdf.flags & (1 << LINEARIZED_PDF))
1024 1019
 	    pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
1020
+	if (pdf.flags & (1 << MANY_FILTERS))
1021
+	    pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
1025 1022
 	if (pdf.flags &
1026 1023
 	    ((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) |
1027 1024
 	    (1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)|
... ...
@@ -1029,6 +1074,11 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
1029 1029
 	    rc = CL_EUNPACK;
1030 1030
 	}
1031 1031
 #endif
1032
+	if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
1033
+	    /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
1034
+	    *ctx->virname = "Heuristics.PDF.ObfuscatedNameObject";
1035
+	    rc = CL_VIRUS;
1036
+	}
1032 1037
     }
1033 1038
     cli_dbgmsg("cli_pdf: returning %d\n", rc);
1034 1039
     free(pdf.objs);
... ...
@@ -1640,8 +1690,11 @@ static int asciihexdecode(const char *buf, off_t len, char *output)
1640 1640
 	    continue;
1641 1641
 	if (buf[i] == '>')
1642 1642
 	    break;
1643
-	if (cli_hex2str_to(buf+i, output+j++, 2) == -1)
1644
-	    j--;
1643
+	if (cli_hex2str_to(buf+i, output+j++, 2) == -1) {
1644
+	    if (len - i < 4)
1645
+		continue;
1646
+	    return -1;
1647
+	}
1645 1648
 	i++;
1646 1649
     }
1647 1650
     return j;