Also don't dump images by default; this will be overridable from bytecode.
Török Edvin authored on 2010/08/02 04:14:44... | ... |
@@ -77,6 +77,7 @@ enum pdf_flag { |
77 | 77 |
ESCAPED_COMMON_PDFNAME, |
78 | 78 |
HEX_JAVASCRIPT, |
79 | 79 |
UNKNOWN_FILTER, |
80 |
+ MANY_FILTERS, |
|
80 | 81 |
HAS_OPENACTION, |
81 | 82 |
BAD_STREAMLEN, |
82 | 83 |
ENCRYPTED_PDF, |
... | ... |
@@ -122,7 +123,9 @@ enum objflags { |
122 | 122 |
OBJ_JAVASCRIPT, |
123 | 123 |
OBJ_OPENACTION, |
124 | 124 |
OBJ_HASFILTERS, |
125 |
- OBJ_SIGNED |
|
125 |
+ OBJ_SIGNED, |
|
126 |
+ OBJ_IMAGE, |
|
127 |
+ OBJ_TRUNCATED |
|
126 | 128 |
}; |
127 | 129 |
|
128 | 130 |
struct pdf_obj { |
... | ... |
@@ -166,7 +169,7 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t byteslef |
166 | 166 |
q = q2; |
167 | 167 |
q2 = cli_memstr(q, bytesleft2, "endstream", 9); |
168 | 168 |
if (!q2) |
169 |
- return 0;/* no more objs */ |
|
169 |
+ q2 = q + bytesleft2-9; /* till EOF */ |
|
170 | 170 |
*endstream = q2 - start; |
171 | 171 |
return 1; |
172 | 172 |
} |
... | ... |
@@ -210,12 +213,17 @@ static int pdf_findobj(struct pdf_struct *pdf) |
210 | 210 |
off_t p_stream, p_endstream; |
211 | 211 |
q2 = pdf_nextobject(q, bytesleft); |
212 | 212 |
if (!q2) |
213 |
- return 0;/* no more objs */ |
|
213 |
+ q2 = pdf->map + pdf->size; |
|
214 | 214 |
bytesleft -= q2 - q; |
215 | 215 |
if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream)) { |
216 | 216 |
obj->flags |= 1 << OBJ_STREAM; |
217 |
- q2 = q-1 + p_endstream + 6; |
|
217 |
+ q2 = q-1 + p_endstream + 9; |
|
218 | 218 |
bytesleft -= q2 - q + 1; |
219 |
+ if (bytesleft < 0) { |
|
220 |
+ obj->flags |= 1 << OBJ_TRUNCATED; |
|
221 |
+ pdf->offset = pdf->size; |
|
222 |
+ return 1;/* truncated */ |
|
223 |
+ } |
|
219 | 224 |
} else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) { |
220 | 225 |
q2 = q3 + 6; |
221 | 226 |
pdf->offset = q2 - pdf->map; |
... | ... |
@@ -226,7 +234,9 @@ static int pdf_findobj(struct pdf_struct *pdf) |
226 | 226 |
} |
227 | 227 |
q = q2; |
228 | 228 |
} |
229 |
- return 0;/* no more objs */ |
|
229 |
+ obj->flags |= 1 << OBJ_TRUNCATED; |
|
230 |
+ pdf->offset = pdf->size; |
|
231 |
+ return 1;/* truncated */ |
|
230 | 232 |
} |
231 | 233 |
|
232 | 234 |
static int filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, |
... | ... |
@@ -300,6 +310,9 @@ static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_fl |
300 | 300 |
case LINEARIZED_PDF: |
301 | 301 |
s = "linearized PDF"; |
302 | 302 |
break; |
303 |
+ case MANY_FILTERS: |
|
304 |
+ s = "more than 2 filters per obj"; |
|
305 |
+ break; |
|
303 | 306 |
} |
304 | 307 |
cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff); |
305 | 308 |
} |
... | ... |
@@ -497,12 +510,19 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
497 | 497 |
int rc = CL_SUCCESS; |
498 | 498 |
char *ascii_decoded = NULL; |
499 | 499 |
|
500 |
+ /* TODO: call bytecode hook here, allow override dumpability */ |
|
500 | 501 |
if ((!(obj->flags & (1 << OBJ_STREAM)) || |
501 | 502 |
(obj->flags & (1 << OBJ_HASFILTERS))) |
502 | 503 |
&& !(obj->flags & DUMP_MASK)) { |
503 | 504 |
/* don't dump all streams */ |
504 | 505 |
return CL_CLEAN; |
505 | 506 |
} |
507 |
+#if 1 |
|
508 |
+ if (obj->flags & (1 << OBJ_IMAGE)) { |
|
509 |
+ /* don't dump / scan images */ |
|
510 |
+ return CL_CLEAN; |
|
511 |
+ } |
|
512 |
+#endif |
|
506 | 513 |
snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++); |
507 | 514 |
fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); |
508 | 515 |
if (fout < 0) { |
... | ... |
@@ -531,6 +551,11 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
531 | 531 |
if (length < 0) |
532 | 532 |
length = 0; |
533 | 533 |
orig_length = length; |
534 |
+ if (length > pdf->size || obj->start + p_stream + length > pdf->size) { |
|
535 |
+ cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n", |
|
536 |
+ p_stream, length, pdf->size); |
|
537 |
+ length = pdf->size - (obj->start + p_stream); |
|
538 |
+ } |
|
534 | 539 |
if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) { |
535 | 540 |
const char *q = start + p_endstream; |
536 | 541 |
length = size; |
... | ... |
@@ -553,8 +578,11 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
553 | 553 |
length = size; |
554 | 554 |
} |
555 | 555 |
} |
556 |
- if (orig_length && size > orig_length + 20) |
|
556 |
+ if (orig_length && size > orig_length + 20) { |
|
557 |
+ cli_dbgmsg("cli_pdf: orig length: %ld, length: %ld, size: %ld\n", |
|
558 |
+ orig_length, length, size); |
|
557 | 559 |
pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
560 |
+ } |
|
558 | 561 |
if (!length) |
559 | 562 |
length = size; |
560 | 563 |
|
... | ... |
@@ -580,10 +608,14 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
580 | 580 |
(unsigned char*)ascii_decoded); |
581 | 581 |
} |
582 | 582 |
if (ascii_decoded_size < 0) { |
583 |
- pdfobj_flag(pdf, obj, BAD_ASCIIDECODE); |
|
583 |
+ /* don't flag for images or truncated objs*/ |
|
584 |
+ if (!(obj->flags & |
|
585 |
+ ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED)))) |
|
586 |
+ pdfobj_flag(pdf, obj, BAD_ASCIIDECODE); |
|
584 | 587 |
cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff); |
585 |
- rc = CL_CLEAN; |
|
586 |
- break; |
|
588 |
+ free(ascii_decoded); |
|
589 |
+ ascii_decoded = NULL; |
|
590 |
+ /* attempt to directly flatedecode it */ |
|
587 | 591 |
} |
588 | 592 |
/* either direct or ascii-decoded input */ |
589 | 593 |
if (!ascii_decoded) |
... | ... |
@@ -591,7 +623,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
591 | 591 |
flate_in = ascii_decoded ? ascii_decoded : start+p_stream; |
592 | 592 |
|
593 | 593 |
if (obj->flags & (1 << OBJ_FILTER_FLATE)) { |
594 |
- cli_dbgmsg("cli_pdf: deflate len %d (orig %d)\n", ascii_decoded_size, orig_length); |
|
594 |
+ cli_dbgmsg("cli_pdf: deflate len %ld (orig %ld)\n", ascii_decoded_size, (long)orig_length); |
|
595 | 595 |
rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum); |
596 | 596 |
} else { |
597 | 597 |
if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size) |
... | ... |
@@ -686,10 +718,11 @@ static struct pdfname_action pdfname_actions[] = { |
686 | 686 |
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER}, |
687 | 687 |
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER}, |
688 | 688 |
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER}, |
689 |
- {"AHx", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER}, |
|
689 |
+ {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER}, |
|
690 | 690 |
{"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE}, |
691 | 691 |
{"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER}, |
692 | 692 |
{"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER}, |
693 |
+ {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE}, |
|
693 | 694 |
{"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER}, |
694 | 695 |
{"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER}, |
695 | 696 |
{"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER}, |
... | ... |
@@ -704,6 +737,7 @@ static struct pdfname_action pdfname_actions[] = { |
704 | 704 |
{"Standard", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_FILTER}, |
705 | 705 |
{"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE}, |
706 | 706 |
{"V", OBJ_SIGNED, STATE_ANY, STATE_NONE}, |
707 |
+ {"R", OBJ_SIGNED, STATE_ANY, STATE_NONE}, |
|
707 | 708 |
{"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED}, |
708 | 709 |
{"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER}, |
709 | 710 |
{"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT}, |
... | ... |
@@ -751,6 +785,7 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, |
751 | 751 |
if (*state == STATE_FILTER && |
752 | 752 |
act->set_objflag !=OBJ_DICT && |
753 | 753 |
(obj->flags & (1 << act->set_objflag))) { |
754 |
+ cli_dbgmsg("cli_pdf: duplicate stream filter %s\n", pdfname); |
|
754 | 755 |
pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS); |
755 | 756 |
} |
756 | 757 |
obj->flags |= 1 << act->set_objflag; |
... | ... |
@@ -774,8 +809,8 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
774 | 774 |
const char *q = obj->start + pdf->map; |
775 | 775 |
const char *dict, *start; |
776 | 776 |
off_t dict_length; |
777 |
- off_t bytesleft = pdf->offset - obj->start; |
|
778 |
- unsigned i; |
|
777 |
+ off_t bytesleft = obj_size(pdf, obj, 1); |
|
778 |
+ unsigned i, filters=0; |
|
779 | 779 |
enum objstate objstate = STATE_NONE; |
780 | 780 |
|
781 | 781 |
if (bytesleft < 0) |
... | ... |
@@ -795,7 +830,7 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
795 | 795 |
} while (!q3 || q3[1] != '<'); |
796 | 796 |
dict = q3+2; |
797 | 797 |
q = dict; |
798 |
- bytesleft = pdf->offset - obj->start - (q3 - start); |
|
798 |
+ bytesleft = obj_size(pdf, obj, 1) - (q3 - start); |
|
799 | 799 |
/* find end of dictionary */ |
800 | 800 |
do { |
801 | 801 |
q2 = pdf_nextobject(q, bytesleft); |
... | ... |
@@ -830,7 +865,8 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
830 | 830 |
escapes = 1; |
831 | 831 |
continue; |
832 | 832 |
} |
833 |
- if (*q == ' ' || *q == '\r' || *q == '\n' || *q == '/' || *q == '>' || *q == ']') |
|
833 |
+ if (*q == ' ' || *q == '\t' || *q == '\r' || *q == '\n' || |
|
834 |
+ *q == '/' || *q == '>' || *q == ']') |
|
834 | 835 |
break; |
835 | 836 |
pdfname[i] = *q; |
836 | 837 |
} |
... | ... |
@@ -873,6 +909,18 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
873 | 873 |
objstate = STATE_NONE; |
874 | 874 |
} |
875 | 875 |
} |
876 |
+ for (i=0;i<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);i++) { |
|
877 |
+ const struct pdfname_action *act = &pdfname_actions[i]; |
|
878 |
+ if ((obj->flags & (1 << act->set_objflag)) && |
|
879 |
+ act->from_state == STATE_FILTER && |
|
880 |
+ act->to_state == STATE_FILTER && |
|
881 |
+ act->set_objflag != OBJ_FILTER_CRYPT) { |
|
882 |
+ filters++; |
|
883 |
+ } |
|
884 |
+ } |
|
885 |
+ if (filters > 2) { /* more than 2 non-crypt filters */ |
|
886 |
+ pdfobj_flag(pdf, obj, MANY_FILTERS); |
|
887 |
+ } |
|
876 | 888 |
if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS)) |
877 | 889 |
obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN); |
878 | 890 |
if (obj->flags & (1 << OBJ_FILTER_UNKNOWN)) |
... | ... |
@@ -1013,15 +1061,12 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
1013 | 1013 |
|
1014 | 1014 |
if (pdf.flags) { |
1015 | 1015 |
cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags); |
1016 |
- if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) { |
|
1017 |
- /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */ |
|
1018 |
- *ctx->virname = "Heuristics.PDF.ObfuscatedNameObject"; |
|
1019 |
- rc = CL_VIRUS; |
|
1020 |
- } |
|
1021 | 1016 |
#if 0 |
1022 | 1017 |
/* TODO: find both trailers, and /Encrypt settings */ |
1023 | 1018 |
if (pdf.flags & (1 << LINEARIZED_PDF)) |
1024 | 1019 |
pdf.flags &= ~ (1 << BAD_ASCIIDECODE); |
1020 |
+ if (pdf.flags & (1 << MANY_FILTERS)) |
|
1021 |
+ pdf.flags &= ~ (1 << BAD_ASCIIDECODE); |
|
1025 | 1022 |
if (pdf.flags & |
1026 | 1023 |
((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) | |
1027 | 1024 |
(1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)| |
... | ... |
@@ -1029,6 +1074,11 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
1029 | 1029 |
rc = CL_EUNPACK; |
1030 | 1030 |
} |
1031 | 1031 |
#endif |
1032 |
+ if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) { |
|
1033 |
+ /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */ |
|
1034 |
+ *ctx->virname = "Heuristics.PDF.ObfuscatedNameObject"; |
|
1035 |
+ rc = CL_VIRUS; |
|
1036 |
+ } |
|
1032 | 1037 |
} |
1033 | 1038 |
cli_dbgmsg("cli_pdf: returning %d\n", rc); |
1034 | 1039 |
free(pdf.objs); |
... | ... |
@@ -1640,8 +1690,11 @@ static int asciihexdecode(const char *buf, off_t len, char *output) |
1640 | 1640 |
continue; |
1641 | 1641 |
if (buf[i] == '>') |
1642 | 1642 |
break; |
1643 |
- if (cli_hex2str_to(buf+i, output+j++, 2) == -1) |
|
1644 |
- j--; |
|
1643 |
+ if (cli_hex2str_to(buf+i, output+j++, 2) == -1) { |
|
1644 |
+ if (len - i < 4) |
|
1645 |
+ continue; |
|
1646 |
+ return -1; |
|
1647 |
+ } |
|
1645 | 1648 |
i++; |
1646 | 1649 |
} |
1647 | 1650 |
return j; |