Browse code

pdf: string decryption and code clean-up; pdfng: fixed a bug in escape sequence handling

Kevin Lin authored on 2015/03/21 04:10:52
Showing 3 changed files
... ...
@@ -746,7 +746,7 @@ static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q
746 746
 }
747 747
 
748 748
 
749
-static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method)
749
+char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method)
750 750
 {
751 751
     unsigned char *key, *q, result[16];
752 752
     unsigned n;
... ...
@@ -846,7 +846,7 @@ static char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, of
846 846
     return (char *)q;
847 847
 }
848 848
 
849
-static enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj)
849
+enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj)
850 850
 {
851 851
     if (obj->flags & (1 << OBJ_EMBEDDED_FILE))
852 852
         return pdf->enc_method_embeddedfile;
... ...
@@ -2244,7 +2244,7 @@ static enum enc_method parse_enc_method(const char *dict, unsigned len, const ch
2244 2244
     return ret;
2245 2245
 }
2246 2246
 
2247
-static void pdf_handle_enc(struct pdf_struct *pdf)
2247
+void pdf_handle_enc(struct pdf_struct *pdf)
2248 2248
 {
2249 2249
     struct pdf_obj *obj;
2250 2250
     uint32_t len, n, R, P, length, EM = 1, i, oulen;
... ...
@@ -2612,6 +2612,9 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2612 2612
     if (rc == -1)
2613 2613
         pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
2614 2614
 
2615
+    /* needs to be here for JSON output decryption */
2616
+    pdf_handle_enc(&pdf);
2617
+
2615 2618
     /* must parse after finding all objs, so we can flag indirect objects */
2616 2619
     for (i=0;i<pdf.nobjs;i++) {
2617 2620
         struct pdf_obj *obj = &pdf.objs[i];
... ...
@@ -2632,7 +2635,6 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2632 2632
         pdf_parseobj(&pdf, obj);
2633 2633
     }
2634 2634
 
2635
-    pdf_handle_enc(&pdf);
2636 2635
     if (pdf.flags & (1 << ENCRYPTED_PDF))
2637 2636
         cli_dbgmsg("cli_pdf: encrypted pdf found, %s!\n",
2638 2637
                (pdf.flags & (1 << DECRYPTABLE_PDF)) ?
... ...
@@ -144,6 +144,10 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
144 144
 int pdf_findobj(struct pdf_struct *pdf);
145 145
 struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid);
146 146
 
147
+void pdf_handle_enc(struct pdf_struct *pdf);
148
+char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method);
149
+enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj);
150
+
147 151
 char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
148 152
 struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
149 153
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
... ...
@@ -75,10 +75,9 @@ char *pdf_convert_utf(char *begin, size_t sz)
75 75
 {
76 76
     char *res=NULL;
77 77
     char *buf, *outbuf;
78
-    size_t sz2, i;
79 78
 #if HAVE_ICONV
80 79
     char *p1, *p2;
81
-    size_t inlen, outlen;
80
+    size_t inlen, outlen, i, sz2;
82 81
     char *encodings[] = {
83 82
         "UTF-16",
84 83
         NULL
... ...
@@ -90,59 +89,6 @@ char *pdf_convert_utf(char *begin, size_t sz)
90 90
     if (!(buf))
91 91
         return NULL;
92 92
 
93
-    /* convert PDF specific escape sequences, like octal sequences */
94
-    sz2 = 0;
95
-    for (i = 0; i < sz; ++i) {
96
-        if ((i+1 < sz) && begin[i] == '\\') {
97
-            if ((i+3 < sz) &&
98
-                (isdigit(begin[i+1]) && isdigit(begin[i+2]) && isdigit(begin[i+3]))) {
99
-                /* octal sequence */
100
-                char octal[4], *check;
101
-                unsigned long value;
102
-
103
-                memcpy(octal, &begin[i+1], 3);
104
-                octal[3] = '\0';
105
-
106
-                value = (char)strtoul(octal, &check, 8);
107
-                /* check if all characters were converted */
108
-                if (check == &octal[3])
109
-                    buf[sz2++] = value;
110
-                i += 3;
111
-            } else {
112
-                /* other sequences */
113
-                switch(begin[i+1]) {
114
-                case 'n':
115
-                    buf[sz2++] = 0x0a;
116
-                    break;
117
-                case 'r':
118
-                    buf[sz2++] = 0x0d;
119
-                    break;
120
-                case 't':
121
-                    buf[sz2++] = 0x09;
122
-                    break;
123
-                case 'b':
124
-                    buf[sz2++] = 0x08;
125
-                    break;
126
-                case 'f':
127
-                    buf[sz2++] = 0x0c;
128
-                    break;
129
-                case '(':
130
-                    buf[sz2++] = 0x28;
131
-                    break;
132
-                case ')':
133
-                    buf[sz2++] = 0x29;
134
-                    break;
135
-                case '\\':
136
-                    buf[sz2++] = 0x5c;
137
-                    break;
138
-                default:
139
-                    /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */
140
-                    break;
141
-                }
142
-            }
143
-        } else
144
-            buf[sz2++] = begin[i]; 
145
-    }
146 93
 #if HAVE_ICONV
147 94
     //memcpy(buf, begin, sz);
148 95
     p1 = buf;
... ...
@@ -277,13 +223,145 @@ int is_object_reference(char *begin, char **endchar, uint32_t *id)
277 277
     return 0;
278 278
 }
279 279
 
280
+static char *pdf_decrypt_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, off_t *length)
281
+{
282
+    enum enc_method enc;
283
+
284
+    /* handled only once in cli_pdf() */
285
+    //pdf_handle_enc(pdf);
286
+    if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
287
+        enc = get_enc_method(pdf, obj);
288
+        return decrypt_any(pdf, obj->id, in, length, enc);
289
+    }
290
+    return NULL;
291
+}
292
+
293
+static char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len)
294
+{
295
+    char *wrkstr, *output = NULL;
296
+    size_t wrklen = len, outlen;
297
+    unsigned int i, likelyutf = 0;
298
+
299
+    /* get a working copy */
300
+    wrkstr = cli_calloc(len+1, sizeof(char));
301
+    if (!wrkstr)
302
+        return NULL;
303
+    memcpy(wrkstr, in, len);
304
+
305
+    cli_errmsg("pdf_final: start(%d):   %s\n", wrklen, wrkstr);
306
+
307
+    /* convert PDF specific escape sequences, like octal sequences */
308
+    /* TODO: replace the escape sequences directly in the wrkstr   */
309
+    if (strchr(wrkstr, '\\')) {
310
+        output = cli_calloc(wrklen+1, sizeof(char));
311
+        if (!output)
312
+            return NULL;
313
+
314
+        outlen = 0;
315
+        for (i = 0; i < wrklen; ++i) {
316
+            if ((i+1 < wrklen) && wrkstr[i] == '\\') {
317
+                if ((i+3 < wrklen) &&
318
+                    (isdigit(wrkstr[i+1]) && isdigit(wrkstr[i+2]) && isdigit(wrkstr[i+3]))) {
319
+                    /* octal sequence */
320
+                    char octal[4], *check;
321
+                    unsigned long value;
322
+
323
+                    memcpy(octal, &wrkstr[i+1], 3);
324
+                    octal[3] = '\0';
325
+
326
+                    value = (char)strtoul(octal, &check, 8);
327
+                    /* check if all characters were converted */
328
+                    if (check == &octal[3])
329
+                        output[outlen++] = value;
330
+                    i += 3; /* 4 with for loop [\ddd] */
331
+                } else {
332
+                    /* other sequences */
333
+                    switch(wrkstr[i+1]) {
334
+                    case 'n':
335
+                        output[outlen++] = 0x0a;
336
+                        break;
337
+                    case 'r':
338
+                        output[outlen++] = 0x0d;
339
+                        break;
340
+                    case 't':
341
+                        output[outlen++] = 0x09;
342
+                        break;
343
+                    case 'b':
344
+                        output[outlen++] = 0x08;
345
+                        break;
346
+                    case 'f':
347
+                        output[outlen++] = 0x0c;
348
+                        break;
349
+                    case '(':
350
+                        output[outlen++] = 0x28;
351
+                        break;
352
+                    case ')':
353
+                        output[outlen++] = 0x29;
354
+                        break;
355
+                    case '\\':
356
+                        output[outlen++] = 0x5c;
357
+                        break;
358
+                    default:
359
+                        /* IGNORE THE REVERSE SOLIDUS - PDF3000-2008 */
360
+                        break;
361
+                    }
362
+                    i += 1; /* 2 with for loop [\c] */
363
+                }
364
+            } else {
365
+                output[outlen++] = wrkstr[i];
366
+            }
367
+        }
368
+
369
+        free(wrkstr);
370
+        wrkstr = cli_strdup(output);
371
+        free(output);
372
+        wrklen = outlen;
373
+    }
374
+
375
+    cli_errmsg("pdf_final: escaped(%d): %s\n", wrklen, wrkstr);
376
+
377
+    /* check for encryption and decrypt */
378
+    if (pdf->flags & (1 << ENCRYPTED_PDF))
379
+    {
380
+        off_t tmpsz = (off_t)wrklen;
381
+        output = pdf_decrypt_string(pdf, obj, wrkstr, &tmpsz);
382
+        outlen = (size_t)tmpsz;
383
+        free(wrkstr);
384
+        if (output) {
385
+            wrkstr = output;
386
+            wrklen = outlen;
387
+        } else {
388
+            return NULL;
389
+        }
390
+    }
391
+
392
+    cli_errmsg("pdf_final: decrypt(%d): %s\n", wrklen, wrkstr);
393
+
394
+    /* check for UTF-* and convert to UTF-8 */
395
+    for (i = 0; i < wrklen; ++i) {
396
+        if (((unsigned char)wrkstr[i] > (unsigned char)0x7f) || (wrkstr[i] == '\0')) {
397
+            likelyutf = 1;
398
+            break;
399
+        }
400
+    }
401
+
402
+    if (likelyutf) {
403
+        output = pdf_convert_utf(wrkstr, wrklen);
404
+        free(wrkstr);
405
+        wrkstr = output;
406
+    }
407
+
408
+    cli_errmsg("pdf_final: postutf(%d): %s\n", wrklen, wrkstr);
409
+
410
+    return wrkstr;
411
+}
412
+
280 413
 char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
281 414
 {
282 415
     const char *q = objstart;
283 416
     char *p1, *p2;
284 417
     size_t len, checklen;
285
-    char *res;
286
-    int likelyutf = 0;
418
+    char *res = NULL;
287 419
     uint32_t objid;
288 420
     size_t i;
289 421
 
... ...
@@ -297,8 +375,6 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
297 297
      * Fourth, Attempt to decode from UTF-* to UTF-8
298 298
      */
299 299
 
300
-    res = NULL;
301
-
302 300
     if (str) {
303 301
         checklen = strlen(str);
304 302
 
... ...
@@ -414,18 +490,10 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
414 414
                     free(begin);
415 415
                     break;
416 416
                 default:
417
-                    for (i=0; i < objsize2; i++) {
418
-                        if (p3[i] >= 0x7f) {
419
-                            likelyutf=1;
420
-                            break;
421
-                        }
422
-                    }
423
-
424
-                    res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL;
425
-
426
-                    if (!(res)) {
427
-                        res = begin;
428
-                        res[objsize2] = '\0';
417
+                    res = pdf_finalize_string(pdf, obj, begin, objsize2);
418
+                    if (!res) {
419
+                        /* WE NEED TO BASE64 ENCODE IT! */
420
+                        return NULL; /* for now, just return NULL */
429 421
                     } else {
430 422
                         free(begin);
431 423
                     }
... ...
@@ -474,9 +542,6 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
474 474
     while (p2 < objstart + objsize) {
475 475
         int shouldbreak=0;
476 476
 
477
-        if (!likelyutf && (*((unsigned char *)p2) > (unsigned char)0x7f || *p2 == '\0'))
478
-            likelyutf = 1;
479
-
480 477
         switch (*p2) {
481 478
             case '\\':
482 479
                 p2++;
... ...
@@ -499,26 +564,20 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
499 499
 
500 500
     len = (size_t)(p2 - p1) + 1;
501 501
 
502
-    if (likelyutf == 0) {
503
-        /* We're not UTF-*, so just make a copy of the string and return that */
504
-        res = cli_calloc(1, len+1);
505
-        if (!(res))
506
-            return NULL;
502
+    /* EXPERIMENTAL */
507 503
 
508
-        memcpy(res, p1, len);
509
-        res[len] = '\0';
510
-        if (endchar)
511
-            *endchar = p2;
512
-
513
-        return res;
504
+    res = pdf_finalize_string(pdf, obj, p1, len);
505
+    if (!res) {
506
+        /* WE NEED TO BASE64 ENCODE IT! */
507
+        return NULL; /* for now, just return NULL */
514 508
     }
515 509
 
516
-    res = pdf_convert_utf(p1, len);
517
-
518 510
     if (res && endchar)
519 511
         *endchar = p2;
520 512
 
521 513
     return res;
514
+
515
+    /* EXPERIMENTAL */
522 516
 }
523 517
 
524 518
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)