Usage: uncomment #define NOISY in pdf.c, rebuild, and scan some PDF files
with an empty DB. The scanner should then print info messages for each successful
extraction/decryption, and warnings wherever extraction or decryption fails.
... | ... |
@@ -58,6 +58,7 @@ static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $"; |
58 | 58 |
#include "sha256.h" |
59 | 59 |
#include "textnorm.h" |
60 | 60 |
|
61 |
+ |
|
61 | 62 |
#ifdef CL_DEBUG |
62 | 63 |
/*#define SAVE_TMP |
63 | 64 |
*Save the file being worked on in tmp */ |
... | ... |
@@ -122,6 +123,17 @@ struct pdf_struct { |
122 | 122 |
unsigned keylen; |
123 | 123 |
}; |
124 | 124 |
|
125 |
+/* define this to be noisy about things that we can't parse properly */ |
|
126 |
+/*#define NOISY*/ |
|
127 |
+ |
|
128 |
+#ifdef NOISY |
|
129 |
+#define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__) |
|
130 |
+#define noisy_warnmsg cli_warnmsg |
|
131 |
+#else |
|
132 |
+#define noisy_msg (void) |
|
133 |
+#define noisy_warnmsg (void) |
|
134 |
+#endif |
|
135 |
+ |
|
125 | 136 |
static const char *findNextNonWSBack(const char *q, const char *start) |
126 | 137 |
{ |
127 | 138 |
while (q > start && |
... | ... |
@@ -416,10 +428,18 @@ static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, |
416 | 416 |
else |
417 | 417 |
cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n", |
418 | 418 |
(unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff); |
419 |
+ if(stream.msg) |
|
420 |
+ noisy_warnmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n", |
|
421 |
+ (unsigned long)nbytes, |
|
422 |
+ stream.msg, obj->id>>8, obj->id&0xff); |
|
423 |
+ else |
|
424 |
+ noisy_warnmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n", |
|
425 |
+ (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff); |
|
419 | 426 |
/* mark stream as bad only if not encrypted */ |
420 | 427 |
inflateEnd(&stream); |
421 | 428 |
if (!nbytes) { |
422 | 429 |
cli_dbgmsg("cli_pdf: dumping raw stream (probably encrypted)\n"); |
430 |
+ noisy_warnmsg("cli_pdf: dumping raw stream, probably encrypted and we failed to decrypt\n"); |
|
423 | 431 |
if (filter_writen(pdf, obj, fout, buf, len, sum) != len) { |
424 | 432 |
cli_errmsg("cli_pdf: failed to write output file\n"); |
425 | 433 |
return CL_EWRITE; |
... | ... |
@@ -593,6 +613,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q |
593 | 593 |
} |
594 | 594 |
if (len < 32) { |
595 | 595 |
cli_dbgmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len); |
596 |
+ noisy_warnmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len); |
|
596 | 597 |
return; |
597 | 598 |
} |
598 | 599 |
if (has_iv) { |
... | ... |
@@ -618,6 +639,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q |
618 | 618 |
pad = q[-1]; |
619 | 619 |
if (pad > 0x10) { |
620 | 620 |
cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16); |
621 |
+ noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16); |
|
621 | 622 |
*length -= len; |
622 | 623 |
return; |
623 | 624 |
} |
... | ... |
@@ -625,6 +647,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q |
625 | 625 |
for (i=1;i<pad;i++) { |
626 | 626 |
if (q[i] != pad) { |
627 | 627 |
cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad); |
628 |
+ noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad); |
|
628 | 629 |
*length -= len; |
629 | 630 |
return; |
630 | 631 |
} |
... | ... |
@@ -644,14 +667,18 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of |
644 | 644 |
cli_md5_ctx md5; |
645 | 645 |
struct arc4_state arc4; |
646 | 646 |
|
647 |
- if (!length || !*length || !in) |
|
647 |
+ if (!length || !*length || !in) { |
|
648 |
+ noisy_warnmsg("decrypt failed for obj %u %u\n", id>>8, id&0xff); |
|
648 | 649 |
return NULL; |
650 |
+ } |
|
649 | 651 |
n = pdf->keylen + 5; |
650 | 652 |
if (enc_method == ENC_AESV2) |
651 | 653 |
n += 4; |
652 | 654 |
key = cli_malloc(n); |
653 |
- if (!key) |
|
655 |
+ if (!key) { |
|
656 |
+ noisy_warnmsg("decrypt_any: malloc failed\n"); |
|
654 | 657 |
return NULL; |
658 |
+ } |
|
655 | 659 |
|
656 | 660 |
memcpy(key, pdf->key, pdf->keylen); |
657 | 661 |
q = key + pdf->keylen; |
... | ... |
@@ -672,8 +699,10 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of |
672 | 672 |
n = 16; |
673 | 673 |
|
674 | 674 |
q = cli_malloc(*length); |
675 |
- if (!q) |
|
675 |
+ if (!q) { |
|
676 |
+ noisy_warnmsg("decrypt_any: malloc failed\n"); |
|
676 | 677 |
return NULL; |
678 |
+ } |
|
677 | 679 |
|
678 | 680 |
switch (enc_method) { |
679 | 681 |
case ENC_V2: |
... | ... |
@@ -681,26 +710,33 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of |
681 | 681 |
memcpy(q, in, *length); |
682 | 682 |
arc4_init(&arc4, result, n); |
683 | 683 |
arc4_apply(&arc4, q, *length); |
684 |
+ noisy_msg(pdf, "decrypted ARC4 data\n"); |
|
684 | 685 |
break; |
685 | 686 |
case ENC_AESV2: |
686 | 687 |
cli_dbgmsg("cli_pdf: enc is aesv2\n"); |
687 | 688 |
aes_decrypt(in, length, q, result, n, 1); |
689 |
+ noisy_msg(pdf, "decrypted AES(v2) data\n"); |
|
688 | 690 |
break; |
689 | 691 |
case ENC_AESV3: |
690 | 692 |
cli_dbgmsg("cli_pdf: enc is aesv3\n"); |
691 | 693 |
aes_decrypt(in, length, q, pdf->key, pdf->keylen, 1); |
694 |
+ noisy_msg(pdf, "decrypted AES(v3) data\n"); |
|
692 | 695 |
break; |
693 | 696 |
case ENC_IDENTITY: |
694 | 697 |
cli_dbgmsg("cli_pdf: enc is identity\n"); |
695 | 698 |
memcpy(q, in, *length); |
699 |
+ noisy_msg(pdf, "identity encryption\n"); |
|
696 | 700 |
break; |
697 | 701 |
case ENC_NONE: |
698 | 702 |
cli_dbgmsg("cli_pdf: enc is none\n"); |
703 |
+ noisy_msg(pdf, "encryption is none\n"); |
|
699 | 704 |
free(q); |
700 | 705 |
return NULL; |
701 | 706 |
case ENC_UNKNOWN: |
702 | 707 |
cli_dbgmsg("cli_pdf: enc is unknown\n"); |
703 | 708 |
free(q); |
709 |
+ noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n", |
|
710 |
+ id>>8,id&0xff); |
|
704 | 711 |
return NULL; |
705 | 712 |
} |
706 | 713 |
return q; |
... | ... |
@@ -759,7 +795,7 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf) |
759 | 759 |
char fullname[1024]; |
760 | 760 |
char outbuff[BUFSIZ]; |
761 | 761 |
char inbuf[BUFSIZ]; |
762 |
- int fout, n; |
|
762 |
+ int fout, n, rc; |
|
763 | 763 |
enum cstate st = CSTATE_NONE; |
764 | 764 |
|
765 | 765 |
snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u_c", pdf->dir, (pdf->files-1)); |
... | ... |
@@ -779,8 +815,13 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf) |
779 | 779 |
} |
780 | 780 |
cli_writen(fout, s.out, s.out_pos); |
781 | 781 |
|
782 |
+ lseek(fout, 0, SEEK_SET); |
|
783 |
+ rc = cli_magic_scandesc(fout, pdf->ctx); |
|
782 | 784 |
close(fout); |
783 |
- return CL_SUCCESS; |
|
785 |
+ if (!pdf->ctx->engine->keeptmp) |
|
786 |
+ if (cli_unlink(fullname) && rc != CL_VIRUS) |
|
787 |
+ rc = CL_EUNLINK; |
|
788 |
+ return rc; |
|
784 | 789 |
} |
785 | 790 |
|
786 | 791 |
static const char *pdf_getdict(const char *q0, int* len, const char *key); |
... | ... |
@@ -847,6 +888,8 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
847 | 847 |
if (length > pdf->size || obj->start + p_stream + length > pdf->size) { |
848 | 848 |
cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n", |
849 | 849 |
p_stream, length, pdf->size); |
850 |
+ noisy_warnmsg("length out of file, truncated: %ld + %ld > %ld\n", |
|
851 |
+ p_stream, length, pdf->size); |
|
850 | 852 |
length = pdf->size - (obj->start + p_stream); |
851 | 853 |
} |
852 | 854 |
if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) { |
... | ... |
@@ -898,9 +941,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
898 | 898 |
else { |
899 | 899 |
decrypted = decrypt_any(pdf, obj->id, flate_in, &length, |
900 | 900 |
enc); |
901 |
- if (!decrypted) |
|
902 |
- cli_warnmsg("cli_pdf:decrypt_any: malloc failed\n"); |
|
903 |
- else |
|
901 |
+ if (decrypted) |
|
904 | 902 |
flate_in = decrypted; |
905 | 903 |
} |
906 | 904 |
} |
... | ... |
@@ -949,7 +990,9 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
949 | 949 |
if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size) |
950 | 950 |
rc = CL_EWRITE; |
951 | 951 |
} |
952 |
- } |
|
952 |
+ } else |
|
953 |
+ noisy_warnmsg("cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff); |
|
954 |
+ |
|
953 | 955 |
} else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
954 | 956 |
const char *q2; |
955 | 957 |
const char *q = pdf->map+obj->start; |
... | ... |
@@ -1001,8 +1044,10 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
1001 | 1001 |
cli_dbgmsg("cli_pdf: encrypted string\n"); |
1002 | 1002 |
decrypted = decrypt_any(pdf, obj->id, out, &n, |
1003 | 1003 |
pdf->enc_method_string); |
1004 |
- if (decrypted) |
|
1004 |
+ if (decrypted) { |
|
1005 |
+ noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff); |
|
1005 | 1006 |
out = decrypted; |
1007 |
+ } |
|
1006 | 1008 |
} |
1007 | 1009 |
if (filter_writen(pdf, obj, fout, out, n, &sum) != n) { |
1008 | 1010 |
rc = CL_EWRITE; |
... | ... |
@@ -1024,8 +1069,10 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
1024 | 1024 |
out++; |
1025 | 1025 |
n--; |
1026 | 1026 |
decrypted = decrypt_any(pdf, obj->id, out, &n, pdf->enc_method_string); |
1027 |
- if (decrypted) |
|
1027 |
+ if (decrypted) { |
|
1028 |
+ noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff); |
|
1028 | 1029 |
out = decrypted; |
1030 |
+ } |
|
1029 | 1031 |
} |
1030 | 1032 |
decoded = cli_malloc(n); |
1031 | 1033 |
if (!decoded) { |
... | ... |
@@ -1068,6 +1115,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj) |
1068 | 1068 |
rc2 = pdf_scan_contents(fout, pdf); |
1069 | 1069 |
if (rc2 == CL_VIRUS) |
1070 | 1070 |
rc = rc2; |
1071 |
+ noisy_msg(pdf, "extracted text from obj %u %u\n", obj->id>>8, obj->id&0xff); |
|
1071 | 1072 |
} |
1072 | 1073 |
} |
1073 | 1074 |
close(fout); |
... | ... |
@@ -1634,6 +1682,7 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O, |
1634 | 1634 |
n = UE ? strlen(UE) : 0; |
1635 | 1635 |
if (n != 32) { |
1636 | 1636 |
cli_dbgmsg("cli_pdf: UE length is not 32: %d\n", n); |
1637 |
+ noisy_warnmsg("cli_pdf: UE length is not 32: %d\n", n); |
|
1637 | 1638 |
} else { |
1638 | 1639 |
pdf->keylen = 32; |
1639 | 1640 |
pdf->key = cli_malloc(32); |
... | ... |
@@ -1708,15 +1757,18 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O, |
1708 | 1708 |
password_empty = 1; |
1709 | 1709 |
} else { |
1710 | 1710 |
cli_dbgmsg("cli_pdf: invalid revision %d\n", R); |
1711 |
+ noisy_warnmsg("cli_pdf: invalid revision %d\n", R); |
|
1711 | 1712 |
} |
1712 | 1713 |
} |
1713 | 1714 |
if (password_empty) { |
1714 | 1715 |
cli_dbgmsg("cli_pdf: user password is empty\n"); |
1716 |
+ noisy_msg(pdf, "cli_pdf: encrypted PDF found, user password is empty, will attempt to decrypt\n"); |
|
1715 | 1717 |
/* The key we computed above is the key used to encrypt the streams. |
1716 | 1718 |
* We could decrypt it now if we wanted to */ |
1717 | 1719 |
pdf->flags |= 1 << DECRYPTABLE_PDF; |
1718 | 1720 |
} else { |
1719 | 1721 |
cli_dbgmsg("cli_pdf: user/owner password would be required for decryption\n"); |
1722 |
+ noisy_warnmsg("cli_pdf: encrypted PDF found, user password is NOT empty, cannot decrypt!\n"); |
|
1720 | 1723 |
/* the key is not valid, we would need the user or the owner password to |
1721 | 1724 |
* decrypt */ |
1722 | 1725 |
} |
... | ... |
@@ -1759,11 +1811,13 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1759 | 1759 |
return; |
1760 | 1760 |
if (!pdf->fileID) { |
1761 | 1761 |
cli_dbgmsg("cli_pdf: pdf_handle_enc no file ID\n"); |
1762 |
+ noisy_warnmsg("cli_pdf: pdf_handle_enc no file ID\n"); |
|
1762 | 1763 |
return; |
1763 | 1764 |
} |
1764 | 1765 |
obj = find_obj(pdf, pdf->objs, pdf->enc_objid); |
1765 | 1766 |
if (!obj) { |
1766 | 1767 |
cli_dbgmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff); |
1768 |
+ noisy_warnmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff); |
|
1767 | 1769 |
return; |
1768 | 1770 |
} |
1769 | 1771 |
len = obj_size(pdf, obj, 1); |
... | ... |
@@ -1778,12 +1832,14 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1778 | 1778 |
P = pdf_readint(q, len, "/P"); |
1779 | 1779 |
if (P == ~0u) { |
1780 | 1780 |
cli_dbgmsg("cli_pdf: invalid P\n"); |
1781 |
+ noisy_warnmsg("cli_pdf: invalid P\n"); |
|
1781 | 1782 |
break; |
1782 | 1783 |
} |
1783 | 1784 |
|
1784 | 1785 |
q2 = cli_memstr(q, len, "/Standard", 9); |
1785 | 1786 |
if (!q2) { |
1786 | 1787 |
cli_dbgmsg("cli_pdf: /Standard not found\n"); |
1788 |
+ noisy_warnmsg("cli_pdf: /Standard not found\n"); |
|
1787 | 1789 |
break; |
1788 | 1790 |
} |
1789 | 1791 |
/* we can have both of these: |
... | ... |
@@ -1801,6 +1857,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1801 | 1801 |
R = pdf_readint(q, len, "/R"); |
1802 | 1802 |
if (R == ~0u) { |
1803 | 1803 |
cli_dbgmsg("cli_pdf: invalid R\n"); |
1804 |
+ noisy_warnmsg("cli_pdf: invalid R\n"); |
|
1804 | 1805 |
break; |
1805 | 1806 |
} |
1806 | 1807 |
|
... | ... |
@@ -1847,6 +1904,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1847 | 1847 |
O = pdf_readstring(q, len, "/O", &n); |
1848 | 1848 |
if (!O || n < oulen) { |
1849 | 1849 |
cli_dbgmsg("cli_pdf: invalid O: %d\n", n); |
1850 |
+ noisy_warnmsg("cli_pdf: invalid O: %d\n", n); |
|
1850 | 1851 |
if (O) |
1851 | 1852 |
dbg_printhex("invalid O", O, n); |
1852 | 1853 |
break; |
... | ... |
@@ -1857,6 +1915,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1857 | 1857 |
break; |
1858 | 1858 |
if (i != n) { |
1859 | 1859 |
dbg_printhex("too long O", O, n); |
1860 |
+ noisy_warnmsg("cli_pdf: too long O: %d\n", n); |
|
1860 | 1861 |
break; |
1861 | 1862 |
} |
1862 | 1863 |
} |
... | ... |
@@ -1865,6 +1924,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1865 | 1865 |
U = pdf_readstring(q, len, "/U", &n); |
1866 | 1866 |
if (!U || n < oulen) { |
1867 | 1867 |
cli_dbgmsg("cli_pdf: invalid U: %d\n", n); |
1868 |
+ noisy_warnmsg("cli_pdf: invalid U: %d\n", n); |
|
1868 | 1869 |
if (U) |
1869 | 1870 |
dbg_printhex("invalid U", U, n); |
1870 | 1871 |
break; |
... | ... |
@@ -1881,6 +1941,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf) |
1881 | 1881 |
cli_dbgmsg("cli_pdf: Encrypt R: %d, P %x, length: %d\n", R, P, length); |
1882 | 1882 |
if (length % 8) { |
1883 | 1883 |
cli_dbgmsg("cli_pdf: wrong key length, not multiple of 8\n"); |
1884 |
+ noisy_warnmsg("cli_pdf: wrong key length, not multiple of 8\n"); |
|
1884 | 1885 |
break; |
1885 | 1886 |
} |
1886 | 1887 |
check_user_password(pdf, R, O, U, P, EM, UE, length, oulen); |
... | ... |
@@ -1919,6 +1980,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
1919 | 1919 |
pdfver = cli_memstr(pdfver, versize, "%PDF-", 5); |
1920 | 1920 |
if (!pdfver) { |
1921 | 1921 |
cli_dbgmsg("cli_pdf: no PDF- header found\n"); |
1922 |
+ noisy_warnmsg("cli_pdf: no PDF- header found\n"); |
|
1922 | 1923 |
return CL_SUCCESS; |
1923 | 1924 |
} |
1924 | 1925 |
/* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future |