... | ... |
@@ -746,7 +746,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q |
746 | 746 |
} |
747 | 747 |
|
748 | 748 |
|
749 |
-static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method) |
|
749 |
+char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method) |
|
750 | 750 |
{ |
751 | 751 |
unsigned char *key, *q, result[16]; |
752 | 752 |
unsigned n; |
... | ... |
@@ -846,7 +846,7 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of |
846 | 846 |
return (char *)q; |
847 | 847 |
} |
848 | 848 |
|
849 |
-static enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj) |
|
849 |
+enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj) |
|
850 | 850 |
{ |
851 | 851 |
if (obj->flags & (1 << OBJ_EMBEDDED_FILE)) |
852 | 852 |
return pdf->enc_method_embeddedfile; |
... | ... |
@@ -2244,7 +2244,7 @@ static enum enc_method parse_enc_method(const char *dict, unsigned len, const ch |
2244 | 2244 |
return ret; |
2245 | 2245 |
} |
2246 | 2246 |
|
2247 |
-static void pdf_handle_enc(struct pdf_struct *pdf) |
|
2247 |
+void pdf_handle_enc(struct pdf_struct *pdf) |
|
2248 | 2248 |
{ |
2249 | 2249 |
struct pdf_obj *obj; |
2250 | 2250 |
uint32_t len, n, R, P, length, EM = 1, i, oulen; |
... | ... |
@@ -2612,6 +2612,9 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
2612 | 2612 |
if (rc == -1) |
2613 | 2613 |
pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS; |
2614 | 2614 |
|
2615 |
+ /* needs to be here for JSON output decryption */ |
|
2616 |
+ pdf_handle_enc(&pdf); |
|
2617 |
+ |
|
2615 | 2618 |
/* must parse after finding all objs, so we can flag indirect objects */ |
2616 | 2619 |
for (i=0;i<pdf.nobjs;i++) { |
2617 | 2620 |
struct pdf_obj *obj = &pdf.objs[i]; |
... | ... |
@@ -2632,7 +2635,6 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
2632 | 2632 |
pdf_parseobj(&pdf, obj); |
2633 | 2633 |
} |
2634 | 2634 |
|
2635 |
- pdf_handle_enc(&pdf); |
|
2636 | 2635 |
if (pdf.flags & (1 << ENCRYPTED_PDF)) |
2637 | 2636 |
cli_dbgmsg("cli_pdf: encrypted pdf found, %s!\n", |
2638 | 2637 |
(pdf.flags & (1 << DECRYPTABLE_PDF)) ? |
... | ... |
@@ -144,6 +144,10 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) |
144 | 144 |
int pdf_findobj(struct pdf_struct *pdf); |
145 | 145 |
struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid); |
146 | 146 |
|
147 |
+void pdf_handle_enc(struct pdf_struct *pdf); |
|
148 |
+char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method); |
|
149 |
+enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj); |
|
150 |
+ |
|
147 | 151 |
char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar); |
148 | 152 |
struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar); |
149 | 153 |
struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar); |
... | ... |
@@ -75,10 +75,9 @@ char *pdf_convert_utf(char *begin, size_t sz) |
75 | 75 |
{ |
76 | 76 |
char *res=NULL; |
77 | 77 |
char *buf, *outbuf; |
78 |
- size_t sz2, i; |
|
79 | 78 |
#if HAVE_ICONV |
80 | 79 |
char *p1, *p2; |
81 |
- size_t inlen, outlen; |
|
80 |
+ size_t inlen, outlen, i, sz2; |
|
82 | 81 |
char *encodings[] = { |
83 | 82 |
"UTF-16", |
84 | 83 |
NULL |
... | ... |
@@ -90,59 +89,6 @@ char *pdf_convert_utf(char *begin, size_t sz) |
90 | 90 |
if (!(buf)) |
91 | 91 |
return NULL; |
92 | 92 |
|
93 |
- /* convert PDF specific escape sequences, like octal sequences */ |
|
94 |
- sz2 = 0; |
|
95 |
- for (i = 0; i < sz; ++i) { |
|
96 |
- if ((i+1 < sz) && begin[i] == '\\') { |
|
97 |
- if ((i+3 < sz) && |
|
98 |
- (isdigit(begin[i+1]) && isdigit(begin[i+2]) && isdigit(begin[i+3]))) { |
|
99 |
- /* octal sequence */ |
|
100 |
- char octal[4], *check; |
|
101 |
- unsigned long value; |
|
102 |
- |
|
103 |
- memcpy(octal, &begin[i+1], 3); |
|
104 |
- octal[3] = '\0'; |
|
105 |
- |
|
106 |
- value = (char)strtoul(octal, &check, 8); |
|
107 |
- /* check if all characters were converted */ |
|
108 |
- if (check == &octal[3]) |
|
109 |
- buf[sz2++] = value; |
|
110 |
- i += 3; |
|
111 |
- } else { |
|
112 |
- /* other sequences */ |
|
113 |
- switch(begin[i+1]) { |
|
114 |
- case 'n': |
|
115 |
- buf[sz2++] = 0x0a; |
|
116 |
- break; |
|
117 |
- case 'r': |
|
118 |
- buf[sz2++] = 0x0d; |
|
119 |
- break; |
|
120 |
- case 't': |
|
121 |
- buf[sz2++] = 0x09; |
|
122 |
- break; |
|
123 |
- case 'b': |
|
124 |
- buf[sz2++] = 0x08; |
|
125 |
- break; |
|
126 |
- case 'f': |
|
127 |
- buf[sz2++] = 0x0c; |
|
128 |
- break; |
|
129 |
- case '(': |
|
130 |
- buf[sz2++] = 0x28; |
|
131 |
- break; |
|
132 |
- case ')': |
|
133 |
- buf[sz2++] = 0x29; |
|
134 |
- break; |
|
135 |
- case '\\': |
|
136 |
- buf[sz2++] = 0x5c; |
|
137 |
- break; |
|
138 |
- default: |
|
139 |
- /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */ |
|
140 |
- break; |
|
141 |
- } |
|
142 |
- } |
|
143 |
- } else |
|
144 |
- buf[sz2++] = begin[i]; |
|
145 |
- } |
|
146 | 93 |
#if HAVE_ICONV |
147 | 94 |
//memcpy(buf, begin, sz); |
148 | 95 |
p1 = buf; |
... | ... |
@@ -277,13 +223,145 @@ int is_object_reference(char *begin, char **endchar, uint32_t *id) |
277 | 277 |
return 0; |
278 | 278 |
} |
279 | 279 |
|
280 |
+static char *pdf_decrypt_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, off_t *length) |
|
281 |
+{ |
|
282 |
+ enum enc_method enc; |
|
283 |
+ |
|
284 |
+ /* handled only once in cli_pdf() */ |
|
285 |
+ //pdf_handle_enc(pdf); |
|
286 |
+ if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
|
287 |
+ enc = get_enc_method(pdf, obj); |
|
288 |
+ return decrypt_any(pdf, obj->id, in, length, enc); |
|
289 |
+ } |
|
290 |
+ return NULL; |
|
291 |
+} |
|
292 |
+ |
|
293 |
+static char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len) |
|
294 |
+{ |
|
295 |
+ char *wrkstr, *output = NULL; |
|
296 |
+ size_t wrklen = len, outlen; |
|
297 |
+ unsigned int i, likelyutf = 0; |
|
298 |
+ |
|
299 |
+ /* get a working copy */ |
|
300 |
+ wrkstr = cli_calloc(len+1, sizeof(char)); |
|
301 |
+ if (!wrkstr) |
|
302 |
+ return NULL; |
|
303 |
+ memcpy(wrkstr, in, len); |
|
304 |
+ |
|
305 |
+ cli_errmsg("pdf_final: start(%d): %s\n", wrklen, wrkstr); |
|
306 |
+ |
|
307 |
+ /* convert PDF specific escape sequences, like octal sequences */ |
|
308 |
+ /* TODO: replace the escape sequences directly in the wrkstr */ |
|
309 |
+ if (strchr(wrkstr, '\\')) { |
|
310 |
+ output = cli_calloc(wrklen+1, sizeof(char)); |
|
311 |
+ if (!output) |
|
312 |
+ return NULL; |
|
313 |
+ |
|
314 |
+ outlen = 0; |
|
315 |
+ for (i = 0; i < wrklen; ++i) { |
|
316 |
+ if ((i+1 < wrklen) && wrkstr[i] == '\\') { |
|
317 |
+ if ((i+3 < wrklen) && |
|
318 |
+ (isdigit(wrkstr[i+1]) && isdigit(wrkstr[i+2]) && isdigit(wrkstr[i+3]))) { |
|
319 |
+ /* octal sequence */ |
|
320 |
+ char octal[4], *check; |
|
321 |
+ unsigned long value; |
|
322 |
+ |
|
323 |
+ memcpy(octal, &wrkstr[i+1], 3); |
|
324 |
+ octal[3] = '\0'; |
|
325 |
+ |
|
326 |
+ value = (char)strtoul(octal, &check, 8); |
|
327 |
+ /* check if all characters were converted */ |
|
328 |
+ if (check == &octal[3]) |
|
329 |
+ output[outlen++] = value; |
|
330 |
+ i += 3; /* 4 with for loop [\ddd] */ |
|
331 |
+ } else { |
|
332 |
+ /* other sequences */ |
|
333 |
+ switch(wrkstr[i+1]) { |
|
334 |
+ case 'n': |
|
335 |
+ output[outlen++] = 0x0a; |
|
336 |
+ break; |
|
337 |
+ case 'r': |
|
338 |
+ output[outlen++] = 0x0d; |
|
339 |
+ break; |
|
340 |
+ case 't': |
|
341 |
+ output[outlen++] = 0x09; |
|
342 |
+ break; |
|
343 |
+ case 'b': |
|
344 |
+ output[outlen++] = 0x08; |
|
345 |
+ break; |
|
346 |
+ case 'f': |
|
347 |
+ output[outlen++] = 0x0c; |
|
348 |
+ break; |
|
349 |
+ case '(': |
|
350 |
+ output[outlen++] = 0x28; |
|
351 |
+ break; |
|
352 |
+ case ')': |
|
353 |
+ output[outlen++] = 0x29; |
|
354 |
+ break; |
|
355 |
+ case '\\': |
|
356 |
+ output[outlen++] = 0x5c; |
|
357 |
+ break; |
|
358 |
+ default: |
|
359 |
+ /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */ |
|
360 |
+ break; |
|
361 |
+ } |
|
362 |
+ i += 1; /* 2 with for loop [\c] */ |
|
363 |
+ } |
|
364 |
+ } else { |
|
365 |
+ output[outlen++] = wrkstr[i]; |
|
366 |
+ } |
|
367 |
+ } |
|
368 |
+ |
|
369 |
+ free(wrkstr); |
|
370 |
+ wrkstr = cli_strdup(output); |
|
371 |
+ free(output); |
|
372 |
+ wrklen = outlen; |
|
373 |
+ } |
|
374 |
+ |
|
375 |
+ cli_errmsg("pdf_final: escaped(%d): %s\n", wrklen, wrkstr); |
|
376 |
+ |
|
377 |
+ /* check for encryption and decrypt */ |
|
378 |
+ if (pdf->flags & (1 << ENCRYPTED_PDF)) |
|
379 |
+ { |
|
380 |
+ off_t tmpsz = (off_t)wrklen; |
|
381 |
+ output = pdf_decrypt_string(pdf, obj, wrkstr, &tmpsz); |
|
382 |
+ outlen = (size_t)tmpsz; |
|
383 |
+ free(wrkstr); |
|
384 |
+ if (output) { |
|
385 |
+ wrkstr = output; |
|
386 |
+ wrklen = outlen; |
|
387 |
+ } else { |
|
388 |
+ return NULL; |
|
389 |
+ } |
|
390 |
+ } |
|
391 |
+ |
|
392 |
+ cli_errmsg("pdf_final: decrypt(%d): %s\n", wrklen, wrkstr); |
|
393 |
+ |
|
394 |
+ /* check for UTF-* and convert to UTF-8 */ |
|
395 |
+ for (i = 0; i < wrklen; ++i) { |
|
396 |
+ if (((unsigned char)wrkstr[i] > (unsigned char)0x7f) || (wrkstr[i] == '\0')) { |
|
397 |
+ likelyutf = 1; |
|
398 |
+ break; |
|
399 |
+ } |
|
400 |
+ } |
|
401 |
+ |
|
402 |
+ if (likelyutf) { |
|
403 |
+ output = pdf_convert_utf(wrkstr, wrklen); |
|
404 |
+ free(wrkstr); |
|
405 |
+ wrkstr = output; |
|
406 |
+ } |
|
407 |
+ |
|
408 |
+ cli_errmsg("pdf_final: postutf(%d): %s\n", wrklen, wrkstr); |
|
409 |
+ |
|
410 |
+ return wrkstr; |
|
411 |
+} |
|
412 |
+ |
|
280 | 413 |
char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar) |
281 | 414 |
{ |
282 | 415 |
const char *q = objstart; |
283 | 416 |
char *p1, *p2; |
284 | 417 |
size_t len, checklen; |
285 |
- char *res; |
|
286 |
- int likelyutf = 0; |
|
418 |
+ char *res = NULL; |
|
287 | 419 |
uint32_t objid; |
288 | 420 |
size_t i; |
289 | 421 |
|
... | ... |
@@ -297,8 +375,6 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
297 | 297 |
* Fourth, Attempt to decode from UTF-* to UTF-8 |
298 | 298 |
*/ |
299 | 299 |
|
300 |
- res = NULL; |
|
301 |
- |
|
302 | 300 |
if (str) { |
303 | 301 |
checklen = strlen(str); |
304 | 302 |
|
... | ... |
@@ -414,18 +490,10 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
414 | 414 |
free(begin); |
415 | 415 |
break; |
416 | 416 |
default: |
417 |
- for (i=0; i < objsize2; i++) { |
|
418 |
- if (p3[i] >= 0x7f) { |
|
419 |
- likelyutf=1; |
|
420 |
- break; |
|
421 |
- } |
|
422 |
- } |
|
423 |
- |
|
424 |
- res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL; |
|
425 |
- |
|
426 |
- if (!(res)) { |
|
427 |
- res = begin; |
|
428 |
- res[objsize2] = '\0'; |
|
417 |
+ res = pdf_finalize_string(pdf, obj, begin, objsize2); |
|
418 |
+ if (!res) { |
|
419 |
+ /* WE NEED TO BASE64 ENCODE IT! */ |
|
420 |
+ return NULL; /* for now, just return NULL */ |
|
429 | 421 |
} else { |
430 | 422 |
free(begin); |
431 | 423 |
} |
... | ... |
@@ -474,9 +542,6 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
474 | 474 |
while (p2 < objstart + objsize) { |
475 | 475 |
int shouldbreak=0; |
476 | 476 |
|
477 |
- if (!likelyutf && (*((unsigned char *)p2) > (unsigned char)0x7f || *p2 == '\0')) |
|
478 |
- likelyutf = 1; |
|
479 |
- |
|
480 | 477 |
switch (*p2) { |
481 | 478 |
case '\\': |
482 | 479 |
p2++; |
... | ... |
@@ -499,26 +564,20 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
499 | 499 |
|
500 | 500 |
len = (size_t)(p2 - p1) + 1; |
501 | 501 |
|
502 |
- if (likelyutf == 0) { |
|
503 |
- /* We're not UTF-*, so just make a copy of the string and return that */ |
|
504 |
- res = cli_calloc(1, len+1); |
|
505 |
- if (!(res)) |
|
506 |
- return NULL; |
|
502 |
+ /* EXPERIMENTAL */ |
|
507 | 503 |
|
508 |
- memcpy(res, p1, len); |
|
509 |
- res[len] = '\0'; |
|
510 |
- if (endchar) |
|
511 |
- *endchar = p2; |
|
512 |
- |
|
513 |
- return res; |
|
504 |
+ res = pdf_finalize_string(pdf, obj, p1, len); |
|
505 |
+ if (!res) { |
|
506 |
+ /* WE NEED TO BASE64 ENCODE IT! */ |
|
507 |
+ return NULL; /* for now, just return NULL */ |
|
514 | 508 |
} |
515 | 509 |
|
516 |
- res = pdf_convert_utf(p1, len); |
|
517 |
- |
|
518 | 510 |
if (res && endchar) |
519 | 511 |
*endchar = p2; |
520 | 512 |
|
521 | 513 |
return res; |
514 |
+ |
|
515 |
+ /* EXPERIMENTAL */ |
|
522 | 516 |
} |
523 | 517 |
|
524 | 518 |
struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar) |