Browse code

Parse the Author, Producer, Creator, CreationDate, and ModDate PDF flags

Shawn Webb authored on 2014/05/24 03:06:35
Showing 1 changed file
... ...
@@ -45,6 +45,8 @@ static	char	const	rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
45 45
 #endif
46 46
 #include <zlib.h>
47 47
 
48
+#include <iconv.h>
49
+
48 50
 #include <openssl/ssl.h>
49 51
 #include <openssl/err.h>
50 52
 #include "libclamav/crypto.h"
... ...
@@ -71,6 +73,7 @@ static	int	asciihexdecode(const char *buf, off_t len, char *output);
71 71
 static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
72 72
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
73 73
 static	const	char	*pdf_nextobject(const char *ptr, size_t len);
74
+static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str);
74 75
 
75 76
 /* PDF statistics callbacks and related */
76 77
 struct pdf_struct;
... ...
@@ -97,6 +100,11 @@ static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_acti
97 97
 static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
98 98
 static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
99 99
 static void print_pdf_stats(struct pdf_struct *);
100
+static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
101
+static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
102
+static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
103
+static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
104
+static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
100 105
 /* End PDF statistics callbacks and related */
101 106
 
102 107
 static int xrefCheck(const char *xref, const char *eof)
... ...
@@ -157,6 +165,11 @@ struct pdf_stats {
157 157
     int32_t nopenaction;      /* Number of OpenAction objects */
158 158
     int32_t nlaunch;          /* Number of Launch objects */
159 159
     int32_t npage;            /* Number of Page objects */
160
+    char *author;             /* Author of the PDF */
161
+    char *creator;            /* Application used to create the PDF */
162
+    char *producer;           /* Application used to produce the PDF */
163
+    char *creationdate;       /* Date the PDF was created */
164
+    char *modificationdate;   /* Date the PDF was modified */
160 165
 };
161 166
 
162 167
 struct pdf_struct {
... ...
@@ -1371,7 +1384,12 @@ static struct pdfname_action pdfname_actions[] = {
1371 1371
     {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, OpenAction_cb},
1372 1372
     {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, Launch_cb},
1373 1373
     {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, Page_cb},
1374
-    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL}
1374
+    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL},
1375
+    {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, Author_cb},
1376
+    {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, Producer_cb},
1377
+    {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, CreationDate_cb},
1378
+    {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, ModificationDate_cb},
1379
+    {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, Creator_cb}
1375 1380
 };
1376 1381
 
1377 1382
 #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
... ...
@@ -2783,6 +2801,130 @@ pdf_nextobject(const char *ptr, size_t len)
2783 2783
     return NULL;
2784 2784
 }
2785 2785
 
2786
/*
 * Locate the literal-string value of dictionary key `key` inside
 * buf[0..size).
 *
 * An occurrence of the key only counts when it is followed (after optional
 * PDF white space) by '('; this rejects longer names that merely start with
 * `key` and values that are not literal strings, instead of picking up the
 * next '(' that happens to appear later in the object.
 *
 * The closing ')' is found with a forward, byte-level scan: the byte after a
 * backslash never terminates the string, and balanced unescaped parentheses
 * are allowed inside the value.
 *
 * On success returns a pointer to the first byte after the opening '(' and
 * stores the number of raw (still escaped) value bytes in *vallen.
 * Returns NULL when no complete literal string follows any occurrence of
 * the key. Never reads outside buf[0..size).
 */
static const char *pdf_find_literal_string(const char *buf, size_t size, const char *key, size_t *vallen)
{
    const char *end = buf + size;
    const char *p;
    size_t keylen = strlen(key);

    if (keylen == 0 || size < keylen)
        return NULL;

    for (p = buf; (size_t)(end - p) >= keylen; p++) {
        const char *v, *start;
        unsigned int depth = 1;

        if (memcmp(p, key, keylen) != 0)
            continue;

        /* Skip white space (NUL, HT, LF, FF, CR, SP) between key and value */
        v = p + keylen;
        while (v < end && (*v == '\0' || *v == '\t' || *v == '\n' ||
                           *v == '\f' || *v == '\r' || *v == ' '))
            v++;

        if (v == end || *v != '(')
            continue;

        start = ++v;
        while (v < end) {
            char c = *v++;

            if (c == '\\') {
                /* Escaped byte: skip it so "\)" and "\\" are handled */
                if (v < end)
                    v++;
            } else if (c == '(') {
                depth++;
            } else if (c == ')' && --depth == 0) {
                *vallen = (size_t)(v - 1 - start);
                return start;
            }
        }

        /* Ran off the end of the object: the string is unterminated */
        return NULL;
    }

    return NULL;
}

/*
 * Extract the literal-string value of dictionary key `str` (for example
 * "/Author") from the PDF object occupying objstart[0..objsize).
 *
 * If the value contains only non-NUL 7-bit bytes it is returned verbatim
 * (escape sequences are NOT decoded). Otherwise the value is converted to
 * ASCII with iconv, trying "UTF-8" and then "UTF-16" as the source encoding;
 * the first attempt that produces any output wins and a partial conversion
 * is accepted.
 *
 * Returns a NUL-terminated heap string that the caller must free(), or NULL
 * when the key is missing, the value is empty or unterminated, allocation
 * fails, or no encoding produced output.
 */
static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str)
{
    static const char *const encodings[] = {
        "UTF-8",
        "UTF-16",
        NULL
    };
    const char *val;
    char *inbuf, *outbuf, *res = NULL;
    size_t len = 0, i;
    int likelyutf = 0;

    if (!objstart || !str)
        return NULL;

    val = pdf_find_literal_string(objstart, objsize, str, &len);
    if (!val || len == 0)
        return NULL;

    /* High-bit or NUL bytes mean the value is not plain ASCII */
    for (i = 0; i < len; i++) {
        unsigned char c = (unsigned char)val[i];

        if (c > 0x7f || c == '\0') {
            likelyutf = 1;
            break;
        }
    }

    if (!likelyutf) {
        /* +1 so the zero-filled buffer keeps a terminating NUL */
        res = cli_calloc(1, len + 1);
        if (!(res))
            return NULL;

        memcpy(res, val, len);
        return res;
    }

    inbuf = cli_calloc(1, len);
    if (!(inbuf))
        return NULL;

    memcpy(inbuf, val, len);

    /* +1: iconv may fill all `len` bytes and the terminator must still fit */
    outbuf = cli_calloc(1, len + 1);
    if (!(outbuf)) {
        free(inbuf);
        return NULL;
    }

    for (i = 0; encodings[i] != NULL; i++) {
        /* iconv advances these, so every attempt starts from fresh copies */
        char *in = inbuf, *out = outbuf;
        size_t inleft = len, outleft = len;
        iconv_t cd;

        cd = iconv_open("ASCII", encodings[i]);
        if (cd == (iconv_t)(-1)) {
            cli_errmsg("Could not initialize iconv\n");
            continue;
        }

        /* Return value intentionally unchecked: partial output is accepted */
        iconv(cd, &in, &inleft, &out, &outleft);
        iconv_close(cd);

        if (outleft == len) {
            /* Decoding unsuccessful right from the start */
            continue;
        }

        outbuf[len - outleft] = '\0';
        res = strdup(outbuf);
        break;
    }

    free(inbuf);
    free(outbuf);

    return res;
}
2909
+
2786 2910
 /* PDF statistics */
2787 2911
 static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2788 2912
 {
... ...
@@ -2928,34 +3070,74 @@ static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_acti
2928 2928
     pdf->stats.npage++;
2929 2929
 }
2930 2930
 
2931
+static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2932
+{
2933
+    if (!(pdf))
2934
+        return;
2935
+
2936
+    pdf->stats.author = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author");
2937
+}
2938
+
2939
+static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2940
+{
2941
+    if (!(pdf))
2942
+        return;
2943
+
2944
+    pdf->stats.creator = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator");
2945
+}
2946
+
2947
+static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2948
+{
2949
+    if (!(pdf))
2950
+        return;
2951
+
2952
+    pdf->stats.modificationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate");
2953
+}
2954
+
2955
+static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2956
+{
2957
+    if (!(pdf))
2958
+        return;
2959
+
2960
+    pdf->stats.creationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate");
2961
+}
2962
+
2963
+static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
2964
+{
2965
+    if (!(pdf))
2966
+        return;
2967
+
2968
+    pdf->stats.producer = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer");
2969
+}
2970
+
2931 2971
 static void print_pdf_stats(struct pdf_struct *pdf)
2932 2972
 {
2933 2973
     if (!(pdf))
2934 2974
         return;
2935 2975
 
2936 2976
     cli_dbgmsg("Statistics collected from PDF:\n");
2937
-    cli_dbgmsg("    Invalid Objects:\t\t\t\t%lu\n", pdf->stats.ninvalidobjs);
2938
-    cli_dbgmsg("    Number of JavaScript Objects:\t\t%lu\n", pdf->stats.njs);
2939
-    cli_dbgmsg("    Number of Inflate-Encoded Objects:\t\t%lu\n", pdf->stats.nflate);
2940
-    cli_dbgmsg("    Number of ActiveX Objects:\t\t\t%lu\n", pdf->stats.nactivex);
2941
-    cli_dbgmsg("    Number of Flash Objects:\t\t\t%lu\n", pdf->stats.nflash);
2942
-    cli_dbgmsg("    Number of Declared Colors:\t\t\t%lu\n", pdf->stats.ncolors);
2943
-    cli_dbgmsg("    Number of ASCIIHexEncoded Objects:\t\t%lu\n", pdf->stats.nasciihexdecode);
2944
-    cli_dbgmsg("    Number of ASCII85Encoded Objects:\t\t%lu\n", pdf->stats.nascii85decode);
2945
-    cli_dbgmsg("    Number of Embedded Files:\t\t\t%lu\n", pdf->stats.nembeddedfile);
2946
-    cli_dbgmsg("    Number of Image Objects:\t\t\t%lu\n", pdf->stats.nimage);
2947
-    cli_dbgmsg("    Number of LZW-Encoded Objects:\t\t%lu\n", pdf->stats.nlzw);
2948
-    cli_dbgmsg("    Number of RunLengthEncoded Objects:\t%lu\n", pdf->stats.nrunlengthdecode);
2949
-    cli_dbgmsg("    Number of Fax-Encoded Objects:\t\t%lu\n", pdf->stats.nfaxdecode);
2950
-    cli_dbgmsg("    Number of JBIG2-Encoded Objects:\t\t%lu\n", pdf->stats.njbig2decode);
2951
-    cli_dbgmsg("    Number of DCT-Encoded Objects:\t\t%lu\n", pdf->stats.ndctdecode);
2952
-    cli_dbgmsg("    Number of JPX-Encoded Objects:\t\t%lu\n", pdf->stats.njpxdecode);
2953
-    cli_dbgmsg("    Number of Crypt-Encoded Objects:\t\t%lu\n", pdf->stats.ncrypt);
2954
-    cli_dbgmsg("    Number of Standard-Filtered Objects:\t%lu\n", pdf->stats.nstandard);
2955
-    cli_dbgmsg("    Number of Signed Objects:\t\t\t%lu\n", pdf->stats.nsigned);
2956
-    cli_dbgmsg("    Number of Open Actions:\t\t\t%lu\n", pdf->stats.nopenaction);
2957
-    cli_dbgmsg("    Number of Launch Objects:\t\t\t%lu\n", pdf->stats.nlaunch);
2958
-    cli_dbgmsg("    Number of Objects with /Pages:\t\t%lu\n", pdf->stats.npage);
2977
+    cli_dbgmsg("    Invalid Objects:\t\t\t\t%u\n", pdf->stats.ninvalidobjs);
2978
+    cli_dbgmsg("    Number of JavaScript Objects:\t\t%u\n", pdf->stats.njs);
2979
+    cli_dbgmsg("    Number of Inflate-Encoded Objects:\t\t%u\n", pdf->stats.nflate);
2980
+    cli_dbgmsg("    Number of ActiveX Objects:\t\t\t%u\n", pdf->stats.nactivex);
2981
+    cli_dbgmsg("    Number of Flash Objects:\t\t\t%u\n", pdf->stats.nflash);
2982
+    cli_dbgmsg("    Number of Declared Colors:\t\t\t%u\n", pdf->stats.ncolors);
2983
+    cli_dbgmsg("    Number of ASCIIHexEncoded Objects:\t\t%u\n", pdf->stats.nasciihexdecode);
2984
+    cli_dbgmsg("    Number of ASCII85Encoded Objects:\t\t%u\n", pdf->stats.nascii85decode);
2985
+    cli_dbgmsg("    Number of Embedded Files:\t\t\t%u\n", pdf->stats.nembeddedfile);
2986
+    cli_dbgmsg("    Number of Image Objects:\t\t\t%u\n", pdf->stats.nimage);
2987
+    cli_dbgmsg("    Number of LZW-Encoded Objects:\t\t%u\n", pdf->stats.nlzw);
2988
+    cli_dbgmsg("    Number of RunLengthEncoded Objects:\t%u\n", pdf->stats.nrunlengthdecode);
2989
+    cli_dbgmsg("    Number of Fax-Encoded Objects:\t\t%u\n", pdf->stats.nfaxdecode);
2990
+    cli_dbgmsg("    Number of JBIG2-Encoded Objects:\t\t%u\n", pdf->stats.njbig2decode);
2991
+    cli_dbgmsg("    Number of DCT-Encoded Objects:\t\t%u\n", pdf->stats.ndctdecode);
2992
+    cli_dbgmsg("    Number of JPX-Encoded Objects:\t\t%u\n", pdf->stats.njpxdecode);
2993
+    cli_dbgmsg("    Number of Crypt-Encoded Objects:\t\t%u\n", pdf->stats.ncrypt);
2994
+    cli_dbgmsg("    Number of Standard-Filtered Objects:\t%u\n", pdf->stats.nstandard);
2995
+    cli_dbgmsg("    Number of Signed Objects:\t\t\t%u\n", pdf->stats.nsigned);
2996
+    cli_dbgmsg("    Number of Open Actions:\t\t\t%u\n", pdf->stats.nopenaction);
2997
+    cli_dbgmsg("    Number of Launch Objects:\t\t\t%u\n", pdf->stats.nlaunch);
2998
+    cli_dbgmsg("    Number of Objects with /Pages:\t\t%u\n", pdf->stats.npage);
2959 2999
 }
2960 3000
 
2961 3001
 static void pdf_export_json(struct pdf_struct *pdf)
... ...
@@ -2977,6 +3159,41 @@ static void pdf_export_json(struct pdf_struct *pdf)
2977 2977
         return;
2978 2978
 
2979 2979
     json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj);
2980
+    if (pdf->stats.author) {
2981
+        cli_jsonstr(pdfobj, "Author", pdf->stats.author);
2982
+
2983
+        free(pdf->stats.author);
2984
+        pdf->stats.author = NULL;
2985
+    }
2986
+
2987
+    if (pdf->stats.creator) {
2988
+        cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
2989
+
2990
+        free(pdf->stats.creator);
2991
+        pdf->stats.creator = NULL;
2992
+    }
2993
+
2994
+    if (pdf->stats.producer) {
2995
+        cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
2996
+
2997
+        free(pdf->stats.producer);
2998
+        pdf->stats.producer = NULL;
2999
+    }
3000
+
3001
+    if (pdf->stats.modificationdate) {
3002
+        cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3003
+
3004
+        free(pdf->stats.modificationdate);
3005
+        pdf->stats.modificationdate = NULL;
3006
+    }
3007
+
3008
+    if (pdf->stats.creationdate) {
3009
+        cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3010
+
3011
+        free(pdf->stats.creationdate);
3012
+        pdf->stats.creationdate = NULL;
3013
+    }
3014
+
2980 3015
     if (pdf->stats.ninvalidobjs)
2981 3016
         cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
2982 3017
     if (pdf->stats.njs)