...
|
...
|
@@ -45,6 +45,8 @@ static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
|
45
|
45
|
#endif
|
46
|
46
|
#include <zlib.h>
|
47
|
47
|
|
|
48
|
+#include <iconv.h>
|
|
49
|
+
|
48
|
50
|
#include <openssl/ssl.h>
|
49
|
51
|
#include <openssl/err.h>
|
50
|
52
|
#include "libclamav/crypto.h"
|
...
|
...
|
@@ -71,6 +73,7 @@ static int asciihexdecode(const char *buf, off_t len, char *output);
|
71
|
71
|
static int ascii85decode(const char *buf, off_t len, unsigned char *output);
|
72
|
72
|
static const char *pdf_nextlinestart(const char *ptr, size_t len);
|
73
|
73
|
static const char *pdf_nextobject(const char *ptr, size_t len);
|
|
74
|
+static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str);
|
74
|
75
|
|
75
|
76
|
/* PDF statistics callbacks and related */
|
76
|
77
|
struct pdf_struct;
|
...
|
...
|
@@ -97,6 +100,11 @@ static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_acti
|
97
|
97
|
static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
98
|
98
|
static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
99
|
99
|
static void print_pdf_stats(struct pdf_struct *);
|
|
100
|
+static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
|
101
|
+static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
|
102
|
+static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
|
103
|
+static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
|
104
|
+static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdf_action *);
|
100
|
105
|
/* End PDF statistics callbacks and related */
|
101
|
106
|
|
102
|
107
|
static int xrefCheck(const char *xref, const char *eof)
|
...
|
...
|
@@ -157,6 +165,11 @@ struct pdf_stats {
|
157
|
157
|
int32_t nopenaction; /* Number of OpenAction objects */
|
158
|
158
|
int32_t nlaunch; /* Number of Launch objects */
|
159
|
159
|
int32_t npage; /* Number of Page objects */
|
|
160
|
+ char *author; /* Author of the PDF */
|
|
161
|
+ char *creator; /* Application used to create the PDF */
|
|
162
|
+ char *producer; /* Application used to produce the PDF */
|
|
163
|
+ char *creationdate; /* Date the PDF was created */
|
|
164
|
+ char *modificationdate; /* Date the PDF was modified */
|
160
|
165
|
};
|
161
|
166
|
|
162
|
167
|
struct pdf_struct {
|
...
|
...
|
@@ -1371,7 +1384,12 @@ static struct pdfname_action pdfname_actions[] = {
|
1371
|
1371
|
{"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, OpenAction_cb},
|
1372
|
1372
|
{"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, Launch_cb},
|
1373
|
1373
|
{"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, Page_cb},
|
1374
|
|
- {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL}
|
|
1374
|
+ {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NULL},
|
|
1375
|
+ {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, Author_cb},
|
|
1376
|
+ {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, Producer_cb},
|
|
1377
|
+ {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, CreationDate_cb},
|
|
1378
|
+ {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, ModificationDate_cb},
|
|
1379
|
+ {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, Creator_cb}
|
1375
|
1380
|
};
|
1376
|
1381
|
|
1377
|
1382
|
#define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT))
|
...
|
...
|
@@ -2783,6 +2801,130 @@ pdf_nextobject(const char *ptr, size_t len)
|
2783
|
2783
|
return NULL;
|
2784
|
2784
|
}
|
2785
|
2785
|
|
|
2786
|
/*
 * Extract the literal string "( ... )" that follows the key `str` inside a
 * PDF object.
 *
 * objstart  first byte of the object data
 * objsize   number of readable bytes starting at objstart
 * str       dictionary key to look for, e.g. "/Author"
 *
 * Returns a NUL-terminated heap buffer that the caller must free(), or NULL
 * when the key or a complete string is not found, the string is empty, or an
 * allocation fails.
 *
 * Strings made only of 7-bit, non-NUL bytes are returned verbatim (escape
 * sequences are NOT decoded).  Anything else is run through iconv, trying
 * each entry of `encodings` in turn, and converted to ASCII; a partially
 * converted prefix is accepted, as long as at least one byte was produced.
 *
 * The scan stops at the first '(' after the key; it does not verify that
 * the parenthesis actually belongs to this key's value.
 */
static char *pdf_parse_string(const char *objstart, size_t objsize, const char *str)
{
    static const char *const encodings[] = { "UTF-8", "UTF-16", NULL };
    const char *end, *key, *start, *closing, *lowest;
    char *inbuf, *outbuf, *res;
    size_t keylen, len, i;
    int likelyutf = 0;

    if (objstart == NULL || str == NULL)
        return NULL;

    keylen = strlen(str);
    /* Need room for the key plus at least "(x)". */
    if (keylen == 0 || objsize < keylen + 3)
        return NULL;

    end = objstart + objsize;

    /* Locate the key without ever comparing past the end of the object. */
    key = NULL;
    for (i = 0; i + keylen <= objsize; i++) {
        if (memcmp(objstart + i, str, keylen) == 0) {
            key = objstart + i;
            break;
        }
    }
    if (key == NULL)
        return NULL;

    /* Find the opening parenthesis of the value. */
    start = memchr(key + keylen, '(', (size_t)(end - (key + keylen)));
    if (start == NULL)
        return NULL;
    start++; /* first byte of the string contents */

    /*
     * Find the closing parenthesis.  A ')' counts as escaped when a
     * backslash sits within `lookback` bytes before it: 1 byte for plain
     * text, 3 bytes once the data looks multi-byte (the backslash and the
     * parenthesis may then be separated by padding bytes).  `lowest` stops
     * a backslash that already escaped one ')' from escaping the next one.
     */
    closing = start;
    lowest  = start;
    for (;;) {
        size_t lookback, back;
        int escaped = 0;

        while (closing < end && *closing != ')') {
            if ((unsigned char)*closing > 0x7f || *closing == '\0')
                likelyutf = 1;
            closing++;
        }
        if (closing >= end)
            return NULL; /* unterminated string */

        lookback = likelyutf ? 3 : 1;
        for (back = 1; back <= lookback && (size_t)(closing - lowest) >= back; back++) {
            if (closing[-(ptrdiff_t)back] == '\\') {
                escaped = 1;
                break;
            }
        }
        if (!escaped)
            break;

        closing++;
        lowest = closing;
    }

    len = (size_t)(closing - start);
    if (len == 0)
        return NULL;

    if (!likelyutf) {
        /* +1 so the zero-filled buffer always ends in a terminator. */
        res = cli_calloc(1, len + 1);
        if (res == NULL)
            return NULL;

        memcpy(res, start, len);
        return res;
    }

    /* iconv wants a mutable input pointer, so work on a private copy. */
    inbuf = cli_calloc(1, len);
    if (inbuf == NULL)
        return NULL;
    memcpy(inbuf, start, len);

    /* ASCII output never exceeds the input size; +1 for the terminator. */
    outbuf = cli_calloc(1, len + 1);
    if (outbuf == NULL) {
        free(inbuf);
        return NULL;
    }

    res = NULL;
    for (i = 0; encodings[i] != NULL; i++) {
        char *in       = inbuf;
        char *out      = outbuf;
        size_t inleft  = len;
        size_t outleft = len;
        iconv_t cd;

        cd = iconv_open("ASCII", encodings[i]);
        if (cd == (iconv_t)(-1)) {
            cli_errmsg("Could not initialize iconv\n");
            continue;
        }

        /* Return value deliberately ignored: a converted prefix is enough. */
        (void)iconv(cd, &in, &inleft, &out, &outleft);
        iconv_close(cd);

        if (outleft == len) {
            /* Decoding unsuccessful right from the start */
            continue;
        }

        outbuf[len - outleft] = '\0';
        res    = outbuf; /* hand the buffer over to the caller */
        outbuf = NULL;
        break;
    }

    free(inbuf);
    free(outbuf);

    return res;
}
|
|
2909
|
+
|
2786
|
2910
|
/* PDF statistics */
|
2787
|
2911
|
static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
2788
|
2912
|
{
|
...
|
...
|
@@ -2928,34 +3070,74 @@ static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_acti
|
2928
|
2928
|
pdf->stats.npage++;
|
2929
|
2929
|
}
|
2930
|
2930
|
|
|
2931
|
+static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
|
2932
|
+{
|
|
2933
|
+ if (!(pdf))
|
|
2934
|
+ return;
|
|
2935
|
+
|
|
2936
|
+ pdf->stats.author = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author");
|
|
2937
|
+}
|
|
2938
|
+
|
|
2939
|
+static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
|
2940
|
+{
|
|
2941
|
+ if (!(pdf))
|
|
2942
|
+ return;
|
|
2943
|
+
|
|
2944
|
+ pdf->stats.creator = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator");
|
|
2945
|
+}
|
|
2946
|
+
|
|
2947
|
+static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
|
2948
|
+{
|
|
2949
|
+ if (!(pdf))
|
|
2950
|
+ return;
|
|
2951
|
+
|
|
2952
|
+ pdf->stats.modificationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate");
|
|
2953
|
+}
|
|
2954
|
+
|
|
2955
|
+static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
|
2956
|
+{
|
|
2957
|
+ if (!(pdf))
|
|
2958
|
+ return;
|
|
2959
|
+
|
|
2960
|
+ pdf->stats.creationdate = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate");
|
|
2961
|
+}
|
|
2962
|
+
|
|
2963
|
+static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
|
|
2964
|
+{
|
|
2965
|
+ if (!(pdf))
|
|
2966
|
+ return;
|
|
2967
|
+
|
|
2968
|
+ pdf->stats.producer = pdf_parse_string(obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer");
|
|
2969
|
+}
|
|
2970
|
+
|
2931
|
2971
|
static void print_pdf_stats(struct pdf_struct *pdf)
|
2932
|
2972
|
{
|
2933
|
2973
|
if (!(pdf))
|
2934
|
2974
|
return;
|
2935
|
2975
|
|
2936
|
2976
|
cli_dbgmsg("Statistics collected from PDF:\n");
|
2937
|
|
- cli_dbgmsg(" Invalid Objects:\t\t\t\t%lu\n", pdf->stats.ninvalidobjs);
|
2938
|
|
- cli_dbgmsg(" Number of JavaScript Objects:\t\t%lu\n", pdf->stats.njs);
|
2939
|
|
- cli_dbgmsg(" Number of Inflate-Encoded Objects:\t\t%lu\n", pdf->stats.nflate);
|
2940
|
|
- cli_dbgmsg(" Number of ActiveX Objects:\t\t\t%lu\n", pdf->stats.nactivex);
|
2941
|
|
- cli_dbgmsg(" Number of Flash Objects:\t\t\t%lu\n", pdf->stats.nflash);
|
2942
|
|
- cli_dbgmsg(" Number of Declared Colors:\t\t\t%lu\n", pdf->stats.ncolors);
|
2943
|
|
- cli_dbgmsg(" Number of ASCIIHexEncoded Objects:\t\t%lu\n", pdf->stats.nasciihexdecode);
|
2944
|
|
- cli_dbgmsg(" Number of ASCII85Encoded Objects:\t\t%lu\n", pdf->stats.nascii85decode);
|
2945
|
|
- cli_dbgmsg(" Number of Embedded Files:\t\t\t%lu\n", pdf->stats.nembeddedfile);
|
2946
|
|
- cli_dbgmsg(" Number of Image Objects:\t\t\t%lu\n", pdf->stats.nimage);
|
2947
|
|
- cli_dbgmsg(" Number of LZW-Encoded Objects:\t\t%lu\n", pdf->stats.nlzw);
|
2948
|
|
- cli_dbgmsg(" Number of RunLengthEncoded Objects:\t%lu\n", pdf->stats.nrunlengthdecode);
|
2949
|
|
- cli_dbgmsg(" Number of Fax-Encoded Objects:\t\t%lu\n", pdf->stats.nfaxdecode);
|
2950
|
|
- cli_dbgmsg(" Number of JBIG2-Encoded Objects:\t\t%lu\n", pdf->stats.njbig2decode);
|
2951
|
|
- cli_dbgmsg(" Number of DCT-Encoded Objects:\t\t%lu\n", pdf->stats.ndctdecode);
|
2952
|
|
- cli_dbgmsg(" Number of JPX-Encoded Objects:\t\t%lu\n", pdf->stats.njpxdecode);
|
2953
|
|
- cli_dbgmsg(" Number of Crypt-Encoded Objects:\t\t%lu\n", pdf->stats.ncrypt);
|
2954
|
|
- cli_dbgmsg(" Number of Standard-Filtered Objects:\t%lu\n", pdf->stats.nstandard);
|
2955
|
|
- cli_dbgmsg(" Number of Signed Objects:\t\t\t%lu\n", pdf->stats.nsigned);
|
2956
|
|
- cli_dbgmsg(" Number of Open Actions:\t\t\t%lu\n", pdf->stats.nopenaction);
|
2957
|
|
- cli_dbgmsg(" Number of Launch Objects:\t\t\t%lu\n", pdf->stats.nlaunch);
|
2958
|
|
- cli_dbgmsg(" Number of Objects with /Pages:\t\t%lu\n", pdf->stats.npage);
|
|
2977
|
+ cli_dbgmsg(" Invalid Objects:\t\t\t\t%u\n", pdf->stats.ninvalidobjs);
|
|
2978
|
+ cli_dbgmsg(" Number of JavaScript Objects:\t\t%u\n", pdf->stats.njs);
|
|
2979
|
+ cli_dbgmsg(" Number of Inflate-Encoded Objects:\t\t%u\n", pdf->stats.nflate);
|
|
2980
|
+ cli_dbgmsg(" Number of ActiveX Objects:\t\t\t%u\n", pdf->stats.nactivex);
|
|
2981
|
+ cli_dbgmsg(" Number of Flash Objects:\t\t\t%u\n", pdf->stats.nflash);
|
|
2982
|
+ cli_dbgmsg(" Number of Declared Colors:\t\t\t%u\n", pdf->stats.ncolors);
|
|
2983
|
+ cli_dbgmsg(" Number of ASCIIHexEncoded Objects:\t\t%u\n", pdf->stats.nasciihexdecode);
|
|
2984
|
+ cli_dbgmsg(" Number of ASCII85Encoded Objects:\t\t%u\n", pdf->stats.nascii85decode);
|
|
2985
|
+ cli_dbgmsg(" Number of Embedded Files:\t\t\t%u\n", pdf->stats.nembeddedfile);
|
|
2986
|
+ cli_dbgmsg(" Number of Image Objects:\t\t\t%u\n", pdf->stats.nimage);
|
|
2987
|
+ cli_dbgmsg(" Number of LZW-Encoded Objects:\t\t%u\n", pdf->stats.nlzw);
|
|
2988
|
+ cli_dbgmsg(" Number of RunLengthEncoded Objects:\t%u\n", pdf->stats.nrunlengthdecode);
|
|
2989
|
+ cli_dbgmsg(" Number of Fax-Encoded Objects:\t\t%u\n", pdf->stats.nfaxdecode);
|
|
2990
|
+ cli_dbgmsg(" Number of JBIG2-Encoded Objects:\t\t%u\n", pdf->stats.njbig2decode);
|
|
2991
|
+ cli_dbgmsg(" Number of DCT-Encoded Objects:\t\t%u\n", pdf->stats.ndctdecode);
|
|
2992
|
+ cli_dbgmsg(" Number of JPX-Encoded Objects:\t\t%u\n", pdf->stats.njpxdecode);
|
|
2993
|
+ cli_dbgmsg(" Number of Crypt-Encoded Objects:\t\t%u\n", pdf->stats.ncrypt);
|
|
2994
|
+ cli_dbgmsg(" Number of Standard-Filtered Objects:\t%u\n", pdf->stats.nstandard);
|
|
2995
|
+ cli_dbgmsg(" Number of Signed Objects:\t\t\t%u\n", pdf->stats.nsigned);
|
|
2996
|
+ cli_dbgmsg(" Number of Open Actions:\t\t\t%u\n", pdf->stats.nopenaction);
|
|
2997
|
+ cli_dbgmsg(" Number of Launch Objects:\t\t\t%u\n", pdf->stats.nlaunch);
|
|
2998
|
+ cli_dbgmsg(" Number of Objects with /Pages:\t\t%u\n", pdf->stats.npage);
|
2959
|
2999
|
}
|
2960
|
3000
|
|
2961
|
3001
|
static void pdf_export_json(struct pdf_struct *pdf)
|
...
|
...
|
@@ -2977,6 +3159,41 @@ static void pdf_export_json(struct pdf_struct *pdf)
|
2977
|
2977
|
return;
|
2978
|
2978
|
|
2979
|
2979
|
json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj);
|
|
2980
|
+ if (pdf->stats.author) {
|
|
2981
|
+ cli_jsonstr(pdfobj, "Author", pdf->stats.author);
|
|
2982
|
+
|
|
2983
|
+ free(pdf->stats.author);
|
|
2984
|
+ pdf->stats.author = NULL;
|
|
2985
|
+ }
|
|
2986
|
+
|
|
2987
|
+ if (pdf->stats.creator) {
|
|
2988
|
+ cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
|
|
2989
|
+
|
|
2990
|
+ free(pdf->stats.creator);
|
|
2991
|
+ pdf->stats.creator = NULL;
|
|
2992
|
+ }
|
|
2993
|
+
|
|
2994
|
+ if (pdf->stats.producer) {
|
|
2995
|
+ cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
|
|
2996
|
+
|
|
2997
|
+ free(pdf->stats.producer);
|
|
2998
|
+ pdf->stats.producer = NULL;
|
|
2999
|
+ }
|
|
3000
|
+
|
|
3001
|
+ if (pdf->stats.modificationdate) {
|
|
3002
|
+ cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
|
|
3003
|
+
|
|
3004
|
+ free(pdf->stats.modificationdate);
|
|
3005
|
+ pdf->stats.modificationdate = NULL;
|
|
3006
|
+ }
|
|
3007
|
+
|
|
3008
|
+ if (pdf->stats.creationdate) {
|
|
3009
|
+ cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
|
|
3010
|
+
|
|
3011
|
+ free(pdf->stats.creationdate);
|
|
3012
|
+ pdf->stats.creationdate = NULL;
|
|
3013
|
+ }
|
|
3014
|
+
|
2980
|
3015
|
if (pdf->stats.ninvalidobjs)
|
2981
|
3016
|
cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
|
2982
|
3017
|
if (pdf->stats.njs)
|