Browse code

add an #ifdef NOISY to pdf.c

Usage: uncomment the #define NOISY line in pdf.c, rebuild, and scan some
PDF files with an empty DB. The scanner should then print info messages for
each successful extraction/decryption, and warnings wherever it fails.

Török Edvin authored on 2011/12/29 02:05:57
Showing 1 changed file
... ...
@@ -58,6 +58,7 @@ static	char	const	rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
58 58
 #include "sha256.h"
59 59
 #include "textnorm.h"
60 60
 
61
+
61 62
 #ifdef	CL_DEBUG
62 63
 /*#define	SAVE_TMP	
63 64
  *Save the file being worked on in tmp */
... ...
@@ -122,6 +123,17 @@ struct pdf_struct {
122 122
     unsigned keylen;
123 123
 };
124 124
 
125
+/* define this to be noisy about things that we can't parse properly */
126
+/*#define NOISY*/
127
+
128
+#ifdef NOISY
129
+#define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__)
130
+#define noisy_warnmsg cli_warnmsg
131
+#else
132
+#define noisy_msg (void)
133
+#define noisy_warnmsg (void)
134
+#endif
135
+
125 136
 static const char *findNextNonWSBack(const char *q, const char *start)
126 137
 {
127 138
     while (q > start &&
... ...
@@ -416,10 +428,18 @@ static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj,
416 416
 		else
417 417
 		    cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
418 418
 			       (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff);
419
+		if(stream.msg)
420
+		    noisy_warnmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n",
421
+			       (unsigned long)nbytes,
422
+			       stream.msg, obj->id>>8, obj->id&0xff);
423
+		else
424
+		    noisy_warnmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
425
+			       (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff);
419 426
 		/* mark stream as bad only if not encrypted */
420 427
 		inflateEnd(&stream);
421 428
 		if (!nbytes) {
422 429
 		    cli_dbgmsg("cli_pdf: dumping raw stream (probably encrypted)\n");
430
+		    noisy_warnmsg("cli_pdf: dumping raw stream, probably encrypted and we failed to decrypt'n");
423 431
 		    if (filter_writen(pdf, obj, fout, buf, len, sum) != len) {
424 432
 			cli_errmsg("cli_pdf: failed to write output file\n");
425 433
 			return CL_EWRITE;
... ...
@@ -593,6 +613,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q
593 593
     }
594 594
     if (len < 32) {
595 595
 	cli_dbgmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len);
596
+	noisy_warnmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len);
596 597
 	return;
597 598
     }
598 599
     if (has_iv) {
... ...
@@ -618,6 +639,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q
618 618
 	pad = q[-1];
619 619
 	if (pad > 0x10) {
620 620
 	    cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16);
621
+	    noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16);
621 622
 	    *length -= len;
622 623
 	    return;
623 624
 	}
... ...
@@ -625,6 +647,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q
625 625
 	for (i=1;i<pad;i++) {
626 626
 	    if (q[i] != pad) {
627 627
 		cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad);
628
+		noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x != %x\n",q[i],pad);
628 629
 		*length -= len;
629 630
 		return;
630 631
 	    }
... ...
@@ -644,14 +667,18 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of
644 644
     cli_md5_ctx md5;
645 645
     struct arc4_state arc4;
646 646
 
647
-    if (!length || !*length || !in)
647
+    if (!length || !*length || !in) {
648
+	noisy_warnmsg("decrypt failed for obj %u %u\n", id>>8, id&0xff);
648 649
 	return NULL;
650
+    }
649 651
     n = pdf->keylen + 5;
650 652
     if (enc_method == ENC_AESV2)
651 653
 	n += 4;
652 654
     key = cli_malloc(n);
653
-    if (!key)
655
+    if (!key) {
656
+	noisy_warnmsg("decrypt_any: malloc failed\n");
654 657
 	return NULL;
658
+    }
655 659
 
656 660
     memcpy(key, pdf->key, pdf->keylen);
657 661
     q = key + pdf->keylen;
... ...
@@ -672,8 +699,10 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of
672 672
 	n = 16;
673 673
 
674 674
     q = cli_malloc(*length);
675
-    if (!q)
675
+    if (!q) {
676
+	noisy_warnmsg("decrypt_any: malloc failed\n");
676 677
 	return NULL;
678
+    }
677 679
 
678 680
     switch (enc_method) {
679 681
 	case ENC_V2:
... ...
@@ -681,26 +710,33 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of
681 681
 	    memcpy(q, in, *length);
682 682
 	    arc4_init(&arc4, result, n);
683 683
 	    arc4_apply(&arc4, q, *length);
684
+	    noisy_msg(pdf, "decrypted ARC4 data\n");
684 685
 	    break;
685 686
 	case ENC_AESV2:
686 687
 	    cli_dbgmsg("cli_pdf: enc is aesv2\n");
687 688
 	    aes_decrypt(in, length, q, result, n, 1);
689
+	    noisy_msg(pdf, "decrypted AES(v2) data\n");
688 690
 	    break;
689 691
 	case ENC_AESV3:
690 692
 	    cli_dbgmsg("cli_pdf: enc is aesv3\n");
691 693
 	    aes_decrypt(in, length, q, pdf->key, pdf->keylen, 1);
694
+	    noisy_msg(pdf, "decrypted AES(v3) data\n");
692 695
 	    break;
693 696
 	case ENC_IDENTITY:
694 697
 	    cli_dbgmsg("cli_pdf: enc is identity\n");
695 698
 	    memcpy(q, in, *length);
699
+	    noisy_msg(pdf, "identity encryption\n");
696 700
 	    break;
697 701
 	case ENC_NONE:
698 702
 	    cli_dbgmsg("cli_pdf: enc is none\n");
703
+	    noisy_msg(pdf, "encryption is none\n");
699 704
 	    free(q);
700 705
 	    return NULL;
701 706
 	case ENC_UNKNOWN:
702 707
 	    cli_dbgmsg("cli_pdf: enc is unknown\n");
703 708
 	    free(q);
709
+	    noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n",
710
+		       id>>8,id&0xff);
704 711
 	    return NULL;
705 712
     }
706 713
     return q;
... ...
@@ -759,7 +795,7 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
759 759
     char fullname[1024];
760 760
     char outbuff[BUFSIZ];
761 761
     char inbuf[BUFSIZ];
762
-    int fout, n;
762
+    int fout, n, rc;
763 763
     enum cstate st = CSTATE_NONE;
764 764
 
765 765
     snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u_c", pdf->dir, (pdf->files-1));
... ...
@@ -779,8 +815,13 @@ static int pdf_scan_contents(int fd, struct pdf_struct *pdf)
779 779
     }
780 780
     cli_writen(fout, s.out, s.out_pos);
781 781
 
782
+    lseek(fout, 0, SEEK_SET);
783
+    rc = cli_magic_scandesc(fout, pdf->ctx);
782 784
     close(fout);
783
-    return CL_SUCCESS;
785
+    if (!pdf->ctx->engine->keeptmp)
786
+	if (cli_unlink(fullname) && rc != CL_VIRUS)
787
+	    rc = CL_EUNLINK;
788
+    return rc;
784 789
 }
785 790
 
786 791
 static const char *pdf_getdict(const char *q0, int* len, const char *key);
... ...
@@ -847,6 +888,8 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
847 847
 	    if (length > pdf->size || obj->start + p_stream + length > pdf->size) {
848 848
 		cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n",
849 849
 			   p_stream, length, pdf->size);
850
+		noisy_warnmsg("length out of file, truncated: %ld + %ld > %ld\n",
851
+			   p_stream, length, pdf->size);
850 852
 		length = pdf->size - (obj->start + p_stream);
851 853
 	    }
852 854
 	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) {
... ...
@@ -898,9 +941,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
898 898
 		else {
899 899
 		    decrypted = decrypt_any(pdf, obj->id, flate_in, &length,
900 900
 					    enc);
901
-		    if (!decrypted)
902
-			cli_warnmsg("cli_pdf:decrypt_any: malloc failed\n");
903
-		    else
901
+		    if (decrypted)
904 902
 			flate_in = decrypted;
905 903
 		}
906 904
 	    }
... ...
@@ -949,7 +990,9 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
949 949
 		if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size)
950 950
 		    rc = CL_EWRITE;
951 951
 	    }
952
-	}
952
+	} else
953
+	    noisy_warnmsg("cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff);
954
+
953 955
     } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
954 956
 	const char *q2;
955 957
 	const char *q = pdf->map+obj->start;
... ...
@@ -1001,8 +1044,10 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
1001 1001
 		cli_dbgmsg("cli_pdf: encrypted string\n");
1002 1002
 		decrypted = decrypt_any(pdf, obj->id, out, &n,
1003 1003
 					pdf->enc_method_string);
1004
-		if (decrypted)
1004
+		if (decrypted) {
1005
+		    noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1005 1006
 		    out = decrypted;
1007
+		}
1006 1008
 	    }
1007 1009
 	    if (filter_writen(pdf, obj, fout, out, n, &sum) != n) {
1008 1010
 		rc = CL_EWRITE;
... ...
@@ -1024,8 +1069,10 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
1024 1024
 		out++;
1025 1025
 		n--;
1026 1026
 		decrypted = decrypt_any(pdf, obj->id, out, &n, pdf->enc_method_string);
1027
-		if (decrypted)
1027
+		if (decrypted) {
1028
+		    noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff);
1028 1029
 		    out = decrypted;
1030
+		}
1029 1031
 	    }
1030 1032
 	    decoded = cli_malloc(n);
1031 1033
 	    if (!decoded) {
... ...
@@ -1068,6 +1115,7 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
1068 1068
 	    rc2 = pdf_scan_contents(fout, pdf);
1069 1069
 	    if (rc2 == CL_VIRUS)
1070 1070
 		rc = rc2;
1071
+	    noisy_msg(pdf, "extracted text from obj %u %u\n", obj->id>>8, obj->id&0xff);
1071 1072
 	}
1072 1073
     }
1073 1074
     close(fout);
... ...
@@ -1634,6 +1682,7 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
1634 1634
 	    n = UE ? strlen(UE) : 0;
1635 1635
 	    if (n != 32) {
1636 1636
 		cli_dbgmsg("cli_pdf: UE length is not 32: %d\n", n);
1637
+		noisy_warnmsg("cli_pdf: UE length is not 32: %d\n", n);
1637 1638
 	    } else {
1638 1639
 		pdf->keylen = 32;
1639 1640
 		pdf->key = cli_malloc(32);
... ...
@@ -1708,15 +1757,18 @@ static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
1708 1708
 		password_empty = 1;
1709 1709
 	} else {
1710 1710
 	    cli_dbgmsg("cli_pdf: invalid revision %d\n", R);
1711
+	    noisy_warnmsg("cli_pdf: invalid revision %d\n", R);
1711 1712
 	}
1712 1713
     }
1713 1714
     if (password_empty) {
1714 1715
 	cli_dbgmsg("cli_pdf: user password is empty\n");
1716
+	noisy_msg(pdf, "cli_pdf: encrypted PDF found, user password is empty, will attempt to decrypt\n");
1715 1717
 	/* The key we computed above is the key used to encrypt the streams.
1716 1718
 	 * We could decrypt it now if we wanted to */
1717 1719
 	pdf->flags |= 1 << DECRYPTABLE_PDF;
1718 1720
     } else {
1719 1721
 	cli_dbgmsg("cli_pdf: user/owner password would be required for decryption\n");
1722
+	noisy_warnmsg("cli_pdf: encrypted PDF found, user password is NOT empty, cannot decrypt!\n");
1720 1723
 	/* the key is not valid, we would need the user or the owner password to
1721 1724
 	 * decrypt */
1722 1725
     }
... ...
@@ -1759,11 +1811,13 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1759 1759
 	return;
1760 1760
     if (!pdf->fileID) {
1761 1761
 	cli_dbgmsg("cli_pdf: pdf_handle_enc no file ID\n");
1762
+	noisy_warnmsg("cli_pdf: pdf_handle_enc no file ID\n");
1762 1763
 	return;
1763 1764
     }
1764 1765
     obj = find_obj(pdf, pdf->objs, pdf->enc_objid);
1765 1766
     if (!obj) {
1766 1767
 	cli_dbgmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
1768
+	noisy_warnmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff);
1767 1769
 	return;
1768 1770
     }
1769 1771
     len = obj_size(pdf, obj, 1);
... ...
@@ -1778,12 +1832,14 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1778 1778
 	P = pdf_readint(q, len, "/P");
1779 1779
 	if (P == ~0u) {
1780 1780
 	    cli_dbgmsg("cli_pdf: invalid P\n");
1781
+	    noisy_warnmsg("cli_pdf: invalid P\n");
1781 1782
 	    break;
1782 1783
 	}
1783 1784
 
1784 1785
 	q2 = cli_memstr(q, len, "/Standard", 9);
1785 1786
 	if (!q2) {
1786 1787
 	    cli_dbgmsg("cli_pdf: /Standard not found\n");
1788
+	    noisy_warnmsg("cli_pdf: /Standard not found\n");
1787 1789
 	    break;
1788 1790
 	}
1789 1791
 	/* we can have both of these:
... ...
@@ -1801,6 +1857,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1801 1801
 	R = pdf_readint(q, len, "/R");
1802 1802
 	if (R == ~0u) {
1803 1803
 	    cli_dbgmsg("cli_pdf: invalid R\n");
1804
+	    noisy_warnmsg("cli_pdf: invalid R\n");
1804 1805
 	    break;
1805 1806
 	}
1806 1807
 
... ...
@@ -1847,6 +1904,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1847 1847
 	O = pdf_readstring(q, len, "/O", &n);
1848 1848
 	if (!O || n < oulen) {
1849 1849
 	    cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
1850
+	    cli_dbgmsg("cli_pdf: invalid O: %d\n", n);
1850 1851
 	    if (O)
1851 1852
 		dbg_printhex("invalid O", O, n);
1852 1853
 	    break;
... ...
@@ -1857,6 +1915,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1857 1857
 		    break;
1858 1858
 	    if (i != n) {
1859 1859
 		dbg_printhex("too long O", O, n);
1860
+		noisy_warnmsg("too long O", O, n);
1860 1861
 		break;
1861 1862
 	    }
1862 1863
 	}
... ...
@@ -1865,6 +1924,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1865 1865
 	U = pdf_readstring(q, len, "/U", &n);
1866 1866
 	if (!U || n < oulen) {
1867 1867
 	    cli_dbgmsg("cli_pdf: invalid U: %d\n", n);
1868
+	    noisy_warnmsg("cli_pdf: invalid U: %d\n", n);
1868 1869
 	    if (U)
1869 1870
 		dbg_printhex("invalid U", U, n);
1870 1871
 	    break;
... ...
@@ -1881,6 +1941,7 @@ static void pdf_handle_enc(struct pdf_struct *pdf)
1881 1881
 	cli_dbgmsg("cli_pdf: Encrypt R: %d, P %x, length: %d\n", R, P, length);
1882 1882
 	if (length % 8) {
1883 1883
 	    cli_dbgmsg("cli_pdf: wrong key length, not multiple of 8\n");
1884
+	    noisy_warnmsg("cli_pdf: wrong key length, not multiple of 8\n");
1884 1885
 	    break;
1885 1886
 	}
1886 1887
 	check_user_password(pdf, R, O, U, P, EM, UE, length, oulen);
... ...
@@ -1919,6 +1980,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
1919 1919
     pdfver = cli_memstr(pdfver, versize, "%PDF-", 5);
1920 1920
     if (!pdfver) {
1921 1921
 	cli_dbgmsg("cli_pdf: no PDF- header found\n");
1922
+	noisy_warnmsg("cli_pdf: no PDF- header found\n");
1922 1923
 	return CL_SUCCESS;
1923 1924
     }
1924 1925
     /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future