Browse code

Start basic PDF stats code for use later

Shawn Webb authored on 2014/04/17 03:23:16
Showing 1 changed file
... ...
@@ -72,6 +72,31 @@ static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
72 72
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
73 73
 static	const	char	*pdf_nextobject(const char *ptr, size_t len);
74 74
 
/* PDF statistics callbacks */
/*
 * Forward declarations of the struct tags used by the callback prototypes.
 * All three are declared at file scope so that every prototype below refers
 * to the same types; re-declaring a tag that is already visible is harmless.
 *
 * NOTE(review): the callbacks take a `struct pdf_action *`, while the member
 * that stores them is declared inside `struct pdfname_action` -- confirm
 * which type the third argument is meant to have.
 */
struct pdf_struct;
struct pdf_obj;
struct pdf_action;

/* One callback per tracked PDF name; each receives the parser state, the
 * object being handled and the table entry that triggered the call. */
static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);

/* Dumps every collected counter through the debug log. */
static void print_pdf_stats(struct pdf_struct *pdf);
/* End PDF statistics callbacks */
99
+
75 100
 static int xrefCheck(const char *xref, const char *eof)
76 101
 {
77 102
     const char *q;
... ...
@@ -107,6 +132,31 @@ enum enc_method {
107 107
     ENC_AESV3
108 108
 };
109 109
 
/*
 * Per-document counters gathered while a PDF is parsed.  Each member is a
 * plain running total; the members appear in the same order in which they
 * are reported by print_pdf_stats().
 */
struct pdf_stats {
    /* Number of invalid objects */
    unsigned long ninvalidobjs;
    /* Number of javascript objects */
    unsigned long njs;
    /* Number of flate-encoded objects */
    unsigned long nflate;
    /* Number of ActiveX objects */
    unsigned long nactivex;
    /* Number of flash objects */
    unsigned long nflash;
    /* Number of colors */
    unsigned long ncolors;
    /* Number of ASCIIHexDecode-filtered objects */
    unsigned long nasciihexdecode;
    /* Number of ASCII85Decode-filtered objects */
    unsigned long nascii85decode;
    /* Number of embedded files */
    unsigned long nembeddedfile;
    /* Number of image objects */
    unsigned long nimage;
    /* Number of LZW-filtered objects */
    unsigned long nlzw;
    /* Number of RunLengthDecode-filtered objects */
    unsigned long nrunlengthdecode;
    /* Number of CCITT-filtered objects */
    unsigned long nfaxdecode;
    /* Number of JBIG2Decode-filtered objects */
    unsigned long njbig2decode;
    /* Number of DCTDecode-filtered objects */
    unsigned long ndctdecode;
    /* Number of JPXDecode-filtered objects */
    unsigned long njpxdecode;
    /* Number of Crypt-filtered objects */
    unsigned long ncrypt;
    /* Number of Standard-filtered objects */
    unsigned long nstandard;
    /* Number of Signed objects */
    unsigned long nsigned;
    /* Number of OpenAction objects */
    unsigned long nopenaction;
    /* Number of Launch objects */
    unsigned long nlaunch;
    /* Number of Page objects */
    unsigned long npage;
};
134
+
110 135
 struct pdf_struct {
111 136
     struct pdf_obj *objs;
112 137
     unsigned nobjs;
... ...
@@ -128,6 +178,7 @@ struct pdf_struct {
128 128
     unsigned fileIDlen;
129 129
     char *key;
130 130
     unsigned keylen;
131
+    struct pdf_stats stats;
131 132
 };
132 133
 
133 134
 /* define this to be noisy about things that we can't parse properly */
... ...
@@ -1282,42 +1333,43 @@ struct pdfname_action {
1282 1282
     enum pdf_objflags set_objflag;/* OBJ_DICT is noop */
1283 1283
     enum objstate from_state;/* STATE_NONE is noop */
1284 1284
     enum objstate to_state;
1285
+    void (*pdf_stats_cb)(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act);
1285 1286
 };
1286 1287
 
1287 1288
 static struct pdfname_action pdfname_actions[] = {
1288
-    {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
1289
-    {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
1290
-    {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
1291
-    {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
1292
-    {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE},
1293
-    {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
1294
-    {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
1295
-    {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE},
1296
-    {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
1297
-    {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
1298
-    {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
1299
-    {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
1300
-    {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
1301
-    {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
1302
-    {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
1303
-    {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
1304
-    {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
1305
-    {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
1306
-    {"Crypt",  OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
1307
-    {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER},
1308
-    {"Sig",    OBJ_SIGNED, STATE_ANY, STATE_NONE},
1309
-    {"V",     OBJ_SIGNED, STATE_ANY, STATE_NONE},
1310
-    {"R",     OBJ_SIGNED, STATE_ANY, STATE_NONE},
1311
-    {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED},
1312
-    {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER},
1313
-    {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT},
1314
-    {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
1315
-    {"S", OBJ_DICT, STATE_NONE, STATE_S},
1316
-    {"Type", OBJ_DICT, STATE_NONE, STATE_NONE},
1317
-    {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION},
1318
-    {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION},
1319
-    {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE},
1320
-    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS}
1289
+    {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, ASCIIHexDecode_cb},
1290
+    {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, ASCII85Decode_cb},
1291
+    {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, ASCII85Decode_cb},
1292
+    {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, ASCIIHexDecode_cb},
1293
+    {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, EmbeddedFile_cb},
1294
+    {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, FlateDecode_cb},
1295
+    {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, FlateDecode_cb},
1296
+    {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, Image_cb},
1297
+    {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, LZWDecode_cb},
1298
+    {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, LZWDecode_cb},
1299
+    {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, RunLengthDecode_cb},
1300
+    {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, RunLengthDecode_cb},
1301
+    {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, CCITTFaxDecode_cb},
1302
+    {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, CCITTFaxDecode_cb},
1303
+    {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, JBIG2Decode_cb},
1304
+    {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, DCTDecode_cb},
1305
+    {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, DCTDecode_cb},
1306
+    {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, JPXDecode_cb},
1307
+    {"Crypt",  OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, Crypt_cb},
1308
+    {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, Standard_cb},
1309
+    {"Sig",    OBJ_SIGNED, STATE_ANY, STATE_NONE, Sig_cb},
1310
+    {"V",     OBJ_SIGNED, STATE_ANY, STATE_NONE, NULL},
1311
+    {"R",     OBJ_SIGNED, STATE_ANY, STATE_NONE, NULL},
1312
+    {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NULL},
1313
+    {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NULL},
1314
+    {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT, JavaScript_cb},
1315
+    {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NULL},
1316
+    {"S", OBJ_DICT, STATE_NONE, STATE_S, NULL},
1317
+    {"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NULL},
1318
+    {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, OpenAction_cb},
1319
+    {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, Launch_cb},
1320
+    {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, Page_cb},
1321
+    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL}
1321 1322
 };
1322 1323
 
1323 1324
 #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
... ...
@@ -1345,6 +1397,9 @@ static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const ch
1345 1345
         return;
1346 1346
     }
1347 1347
 
1348
+    if ((act->pdf_stats_cb))
1349
+        act->pdf_stats_cb(pdf, obj, act);
1350
+
1348 1351
     if (escapes) {
1349 1352
         /* if a commonly used PDF name is escaped that is certainly
1350 1353
            suspicious. */
... ...
@@ -2475,6 +2530,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2475 2475
                 /* Don't halt on one bad object */
2476 2476
                 cli_dbgmsg("cli_pdf: bad format object, skipping to next\n");
2477 2477
                 badobjects++;
2478
+                pdf.stats.ninvalidobjs++;
2478 2479
                 rc = CL_CLEAN;
2479 2480
                 break;
2480 2481
             case CL_VIRUS:
... ...
@@ -2531,6 +2587,8 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2531 2531
         rc = CL_EFORMAT;
2532 2532
     }
2533 2533
 
2534
+    print_pdf_stats(&pdf);
2535
+
2534 2536
     cli_dbgmsg("cli_pdf: returning %d\n", rc);
2535 2537
     free(pdf.objs);
2536 2538
     free(pdf.fileID);
... ...
@@ -2718,3 +2776,178 @@ pdf_nextobject(const char *ptr, size_t len)
2718 2718
 
2719 2719
     return NULL;
2720 2720
 }
2721
+
2722
+/* PDF statistics */
2723
+static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2724
+{
2725
+    if (!(pdf))
2726
+        return;
2727
+
2728
+    pdf->stats.nasciihexdecode++;
2729
+}
2730
+
2731
+static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2732
+{
2733
+    if (!(pdf))
2734
+        return;
2735
+
2736
+    pdf->stats.nascii85decode++;
2737
+}
2738
+
2739
+static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2740
+{
2741
+    if (!(pdf))
2742
+        return;
2743
+
2744
+    pdf->stats.nembeddedfile++;
2745
+}
2746
+
2747
+static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2748
+{
2749
+    if (!(pdf))
2750
+        return;
2751
+
2752
+    pdf->stats.nflate++;
2753
+}
2754
+
2755
+static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2756
+{
2757
+    if (!(pdf))
2758
+        return;
2759
+
2760
+    pdf->stats.nimage++;
2761
+}
2762
+
2763
+static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2764
+{
2765
+    if (!(pdf))
2766
+        return;
2767
+
2768
+    pdf->stats.nlzw++;
2769
+}
2770
+
2771
+static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2772
+{
2773
+    if (!(pdf))
2774
+        return;
2775
+
2776
+    pdf->stats.nrunlengthdecode++;
2777
+}
2778
+
2779
+static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2780
+{
2781
+    if (!(pdf))
2782
+        return;
2783
+
2784
+    pdf->stats.nfaxdecode++;
2785
+}
2786
+
2787
+static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2788
+{
2789
+     if (!(pdf))
2790
+         return;
2791
+
2792
+     pdf->stats.njbig2decode++;
2793
+}
2794
+
2795
+static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2796
+{
2797
+    if (!(pdf))
2798
+        return;
2799
+
2800
+    pdf->stats.ndctdecode++;
2801
+}
2802
+
2803
+static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2804
+{
2805
+    if (!(pdf))
2806
+        return;
2807
+
2808
+    pdf->stats.njpxdecode++;
2809
+}
2810
+
2811
+static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2812
+{
2813
+    if (!(pdf))
2814
+        return;
2815
+
2816
+    pdf->stats.ncrypt++;
2817
+}
2818
+
2819
+static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2820
+{
2821
+    if (!(pdf))
2822
+        return;
2823
+
2824
+    pdf->stats.nstandard++;
2825
+}
2826
+
2827
+static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2828
+{
2829
+    if (!(pdf))
2830
+        return;
2831
+
2832
+    pdf->stats.nsigned++;
2833
+}
2834
+
2835
+static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2836
+{
2837
+    if (!(pdf))
2838
+        return;
2839
+
2840
+    pdf->stats.njs++;
2841
+}
2842
+
2843
+static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2844
+{
2845
+    if (!(pdf))
2846
+        return;
2847
+
2848
+    pdf->stats.nopenaction++;
2849
+}
2850
+
2851
+static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2852
+{
2853
+    if (!(pdf))
2854
+        return;
2855
+
2856
+    pdf->stats.nlaunch++;
2857
+}
2858
+
2859
+static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2860
+{
2861
+    if (!(pdf))
2862
+        return;
2863
+
2864
+    pdf->stats.npage++;
2865
+}
2866
+
2867
+static void print_pdf_stats(struct pdf_struct *pdf)
2868
+{
2869
+    if (!(pdf))
2870
+        return;
2871
+
2872
+    cli_dbgmsg("Statistics collected from PDF:\n");
2873
+    cli_dbgmsg("    Invalid Objects:\t\t\t\t%lu\n", pdf->stats.ninvalidobjs);
2874
+    cli_dbgmsg("    Number of JavaScript Objects:\t\t%lu\n", pdf->stats.njs);
2875
+    cli_dbgmsg("    Number of Inflate-Encoded Objects:\t\t%lu\n", pdf->stats.nflate);
2876
+    cli_dbgmsg("    Number of ActiveX Objects:\t\t\t%lu\n", pdf->stats.nactivex);
2877
+    cli_dbgmsg("    Number of Flash Objects:\t\t\t%lu\n", pdf->stats.nflash);
2878
+    cli_dbgmsg("    Number of Declared Colors:\t\t\t%lu\n", pdf->stats.ncolors);
2879
+    cli_dbgmsg("    Number of ASCIIHexEncoded Objects:\t\t%lu\n", pdf->stats.nasciihexdecode);
2880
+    cli_dbgmsg("    Number of ASCII85Encoded Objects:\t\t%lu\n", pdf->stats.nascii85decode);
2881
+    cli_dbgmsg("    Number of Embedded Files:\t\t\t%lu\n", pdf->stats.nembeddedfile);
2882
+    cli_dbgmsg("    Number of Image Objects:\t\t\t%lu\n", pdf->stats.nimage);
2883
+    cli_dbgmsg("    Number of LZW-Encoded Objects:\t\t%lu\n", pdf->stats.nlzw);
2884
+    cli_dbgmsg("    Number of RunLengthEncoded Objects:\t%lu\n", pdf->stats.nrunlengthdecode);
2885
+    cli_dbgmsg("    Number of Fax-Encoded Objects:\t\t%lu\n", pdf->stats.nfaxdecode);
2886
+    cli_dbgmsg("    Number of JBIG2-Encoded Objects:\t\t%lu\n", pdf->stats.njbig2decode);
2887
+    cli_dbgmsg("    Number of DCT-Encoded Objects:\t\t%lu\n", pdf->stats.ndctdecode);
2888
+    cli_dbgmsg("    Number of JPX-Encoded Objects:\t\t%lu\n", pdf->stats.njpxdecode);
2889
+    cli_dbgmsg("    Number of Crypt-Encoded Objects:\t\t%lu\n", pdf->stats.ncrypt);
2890
+    cli_dbgmsg("    Number of Standard-Filtered Objects:\t%lu\n", pdf->stats.nstandard);
2891
+    cli_dbgmsg("    Number of Signed Objects:\t\t\t%lu\n", pdf->stats.nsigned);
2892
+    cli_dbgmsg("    Number of Open Actions:\t\t\t%lu\n", pdf->stats.nopenaction);
2893
+    cli_dbgmsg("    Number of Launch Objects:\t\t\t%lu\n", pdf->stats.nlaunch);
2894
+    cli_dbgmsg("    Number of Objects with /Pages:\t\t%lu\n", pdf->stats.npage);
2895
+}