Browse code

pdf: correctly handle encryption objects to decrypt

Kevin Lin authored on 2015/04/02 06:41:59
Showing 3 changed files
... ...
@@ -2612,9 +2612,6 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2612 2612
     if (rc == -1)
2613 2613
         pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
2614 2614
 
2615
-    /* needs to be here for JSON output decryption */
2616
-    pdf_handle_enc(&pdf);
2617
-
2618 2615
     /* must parse after finding all objs, so we can flag indirect objects */
2619 2616
     for (i=0;i<pdf.nobjs;i++) {
2620 2617
         struct pdf_obj *obj = &pdf.objs[i];
... ...
@@ -2635,6 +2632,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
2635 2635
         pdf_parseobj(&pdf, obj);
2636 2636
     }
2637 2637
 
2638
+    pdf_handle_enc(&pdf);
2638 2639
     if (pdf.flags & (1 << ENCRYPTED_PDF))
2639 2640
         cli_dbgmsg("cli_pdf: encrypted pdf found, %s!\n",
2640 2641
                (pdf.flags & (1 << DECRYPTABLE_PDF)) ?
... ...
@@ -3216,8 +3214,12 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3216 3216
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3217 3217
         return;
3218 3218
 
3219
-    if (!(pdf->stats.author))
3220
-        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author_b64));
3219
+    if (!(pdf->stats.author)) {
3220
+        pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry));
3221
+        if (!(pdf->stats.author))
3222
+            return;
3223
+        pdf->stats.author->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta));
3224
+    }
3221 3225
 }
3222 3226
 #endif
3223 3227
 
... ...
@@ -3232,8 +3234,12 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3232 3232
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3233 3233
         return;
3234 3234
 
3235
-    if (!(pdf->stats.creator))
3236
-        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator_b64));
3235
+    if (!(pdf->stats.creator)) {
3236
+        pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry));
3237
+        if (!(pdf->stats.creator))
3238
+            return;
3239
+        pdf->stats.creator->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta));
3240
+    }
3237 3241
 }
3238 3242
 #endif
3239 3243
 
... ...
@@ -3248,8 +3254,12 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
3248 3248
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3249 3249
         return;
3250 3250
 
3251
-    if (!(pdf->stats.modificationdate))
3252
-        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate_b64));
3251
+    if (!(pdf->stats.modificationdate)) {
3252
+        pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
3253
+        if (!(pdf->stats.modificationdate))
3254
+            return;
3255
+        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
3256
+    }
3253 3257
 }
3254 3258
 #endif
3255 3259
 
... ...
@@ -3264,8 +3274,12 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
3264 3264
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3265 3265
         return;
3266 3266
 
3267
-    if (!(pdf->stats.creationdate))
3268
-        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate_b64));
3267
+    if (!(pdf->stats.creationdate)) {
3268
+        pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry));
3269
+        if (!(pdf->stats.creationdate))
3270
+            return;
3271
+        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
3272
+    }
3269 3273
 }
3270 3274
 #endif
3271 3275
 
... ...
@@ -3280,8 +3294,12 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3280 3280
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3281 3281
         return;
3282 3282
 
3283
-    if (!(pdf->stats.producer))
3284
-        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer_b64));
3283
+    if (!(pdf->stats.producer)) {
3284
+        pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry));
3285
+        if (!(pdf->stats.producer))
3286
+            return;
3287
+        pdf->stats.producer->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta));
3288
+    }
3285 3289
 }
3286 3290
 #endif
3287 3291
 
... ...
@@ -3296,8 +3314,12 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3296 3296
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3297 3297
         return;
3298 3298
 
3299
-    if (!(pdf->stats.title))
3300
-        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title_b64));
3299
+    if (!(pdf->stats.title)) {
3300
+        pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry));
3301
+        if (!(pdf->stats.title))
3302
+            return;
3303
+        pdf->stats.title->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta));
3304
+    }
3301 3305
 }
3302 3306
 #endif
3303 3307
 
... ...
@@ -3312,8 +3334,12 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3312 3312
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3313 3313
         return;
3314 3314
 
3315
-    if (!(pdf->stats.keywords))
3316
-        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords_b64));
3315
+    if (!(pdf->stats.keywords)) {
3316
+        pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry));
3317
+        if (!(pdf->stats.keywords))
3318
+            return;
3319
+        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta));
3320
+    }
3317 3321
 }
3318 3322
 #endif
3319 3323
 
... ...
@@ -3328,8 +3354,12 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3328 3328
     if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES))
3329 3329
         return;
3330 3330
 
3331
-    if (!(pdf->stats.subject))
3332
-        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject_b64));
3331
+    if (!(pdf->stats.subject)) {
3332
+        pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry));
3333
+        if (!(pdf->stats.subject))
3334
+            return;
3335
+        pdf->stats.subject->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta));
3336
+    }
3333 3337
 }
3334 3338
 #endif
3335 3339
 
... ...
@@ -3514,124 +3544,164 @@ static void pdf_export_json(struct pdf_struct *pdf)
3514 3514
     }
3515 3515
 
3516 3516
     if (pdf->stats.author) {
3517
-        if (pdf->stats.author_b64) {
3518
-            cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3519
-            cli_jsonbool(pdfobj, "Author_base64", 1);
3520
-        } else {
3521
-            if (cli_isutf8(pdf->stats.author, strlen(pdf->stats.author)))
3522
-                cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3523
-            else {
3524
-                char *b64 = (char *)cl_base64_encode(pdf->stats.author, strlen(pdf->stats.author));
3525
-                cli_jsonstr(pdfobj, "Author", b64);
3526
-                cli_jsonbool(pdfobj, "Author_base64", 1);
3527
-                free(b64);
3517
+        if (!pdf->stats.author->meta.success) {
3518
+            char *out = pdf_finalize_string(pdf, pdf->stats.author->meta.obj, pdf->stats.author->data, pdf->stats.author->meta.length);
3519
+            if (out) {
3520
+                free(pdf->stats.author->data);
3521
+                pdf->stats.author->data = out;
3522
+                pdf->stats.author->meta.length = strlen(out);
3523
+                pdf->stats.author->meta.success = 1;
3528 3524
             }
3529 3525
         }
3526
+
3527
+        if (pdf->stats.author->meta.success && cli_isutf8(pdf->stats.author->data, pdf->stats.author->meta.length)) {
3528
+            cli_jsonstr(pdfobj, "Author", pdf->stats.author->data);
3529
+        } else {
3530
+            char *b64 = (char *)cl_base64_encode(pdf->stats.author->data, pdf->stats.author->meta.length);
3531
+            cli_jsonstr(pdfobj, "Author", b64);
3532
+            cli_jsonbool(pdfobj, "Author_base64", 1);
3533
+            free(b64);
3534
+        }
3530 3535
     }
3531 3536
     if (pdf->stats.creator) {
3532
-        if (pdf->stats.creator_b64) {
3533
-            cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3534
-            cli_jsonbool(pdfobj, "Creator_base64", 1);
3535
-        } else {
3536
-            if (cli_isutf8(pdf->stats.creator, strlen(pdf->stats.creator)))
3537
-                cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3538
-            else {
3539
-                char *b64 = (char *)cl_base64_encode(pdf->stats.creator, strlen(pdf->stats.creator));
3540
-                cli_jsonstr(pdfobj, "Creator", b64);
3541
-                cli_jsonbool(pdfobj, "Creator_base64", 1);
3542
-                free(b64);
3537
+        if (!pdf->stats.creator->meta.success) {
3538
+            char *out = pdf_finalize_string(pdf, pdf->stats.creator->meta.obj, pdf->stats.creator->data, pdf->stats.creator->meta.length);
3539
+            if (out) {
3540
+                free(pdf->stats.creator->data);
3541
+                pdf->stats.creator->data = out;
3542
+                pdf->stats.creator->meta.length = strlen(out);
3543
+                pdf->stats.creator->meta.success = 1;
3543 3544
             }
3544 3545
         }
3546
+
3547
+        if (pdf->stats.creator->meta.success && cli_isutf8(pdf->stats.creator->data, pdf->stats.creator->meta.length)) {
3548
+            cli_jsonstr(pdfobj, "Creator", pdf->stats.creator->data);
3549
+        } else {
3550
+            char *b64 = (char *)cl_base64_encode(pdf->stats.creator->data, pdf->stats.creator->meta.length);
3551
+            cli_jsonstr(pdfobj, "Creator", b64);
3552
+            cli_jsonbool(pdfobj, "Creator_base64", 1);
3553
+            free(b64);
3554
+        }
3545 3555
     }
3546 3556
     if (pdf->stats.producer) {
3547
-        if (pdf->stats.producer_b64) {
3548
-            cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3549
-            cli_jsonbool(pdfobj, "Producer_base64", 1);
3550
-        } else {
3551
-            if (cli_isutf8(pdf->stats.producer, strlen(pdf->stats.producer)))
3552
-                cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3553
-            else {
3554
-                char *b64 = (char *)cl_base64_encode(pdf->stats.producer, strlen(pdf->stats.producer));
3555
-                cli_jsonstr(pdfobj, "Producer", b64);
3556
-                cli_jsonbool(pdfobj, "Producer_base64", 1);
3557
-                free(b64);
3557
+        if (!pdf->stats.producer->meta.success) {
3558
+            char *out = pdf_finalize_string(pdf, pdf->stats.producer->meta.obj, pdf->stats.producer->data, pdf->stats.producer->meta.length);
3559
+            if (out) {
3560
+                free(pdf->stats.producer->data);
3561
+                pdf->stats.producer->data = out;
3562
+                pdf->stats.producer->meta.length = strlen(out);
3563
+                pdf->stats.producer->meta.success = 1;
3558 3564
             }
3559 3565
         }
3566
+
3567
+        if (pdf->stats.producer->meta.success && cli_isutf8(pdf->stats.producer->data, pdf->stats.producer->meta.length)) {
3568
+            cli_jsonstr(pdfobj, "Producer", pdf->stats.producer->data);
3569
+        } else {
3570
+            char *b64 = (char *)cl_base64_encode(pdf->stats.producer->data, pdf->stats.producer->meta.length);
3571
+            cli_jsonstr(pdfobj, "Producer", b64);
3572
+            cli_jsonbool(pdfobj, "Producer_base64", 1);
3573
+            free(b64);
3574
+        }
3560 3575
     }
3561 3576
     if (pdf->stats.modificationdate) {
3562
-        if (pdf->stats.modificationdate_b64) {
3563
-            cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3564
-            cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3565
-        } else {
3566
-            if (cli_isutf8(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate)))
3567
-                cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3568
-            else {
3569
-                char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate));
3570
-                cli_jsonstr(pdfobj, "ModificationDate", b64);
3571
-                cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3572
-                free(b64);
3577
+        if (!pdf->stats.modificationdate->meta.success) {
3578
+            char *out = pdf_finalize_string(pdf, pdf->stats.modificationdate->meta.obj, pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
3579
+            if (out) {
3580
+                free(pdf->stats.modificationdate->data);
3581
+                pdf->stats.modificationdate->data = out;
3582
+                pdf->stats.modificationdate->meta.length = strlen(out);
3583
+                pdf->stats.modificationdate->meta.success = 1;
3573 3584
             }
3574 3585
         }
3586
+
3587
+        if (pdf->stats.modificationdate->meta.success && cli_isutf8(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length)) {
3588
+            cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate->data);
3589
+        } else {
3590
+            char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
3591
+            cli_jsonstr(pdfobj, "ModificationDate", b64);
3592
+            cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3593
+            free(b64);
3594
+        }
3575 3595
     }
3576 3596
     if (pdf->stats.creationdate) {
3577
-        if (pdf->stats.creationdate_b64) {
3578
-            cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3579
-            cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3580
-        } else {
3581
-            if (cli_isutf8(pdf->stats.creationdate, strlen(pdf->stats.creationdate)))
3582
-                cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3583
-            else {
3584
-                char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate, strlen(pdf->stats.creationdate));
3585
-                cli_jsonstr(pdfobj, "CreationDate", b64);
3586
-                cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3587
-                free(b64);
3597
+        if (!pdf->stats.creationdate->meta.success) {
3598
+            char *out = pdf_finalize_string(pdf, pdf->stats.creationdate->meta.obj, pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
3599
+            if (out) {
3600
+                free(pdf->stats.creationdate->data);
3601
+                pdf->stats.creationdate->data = out;
3602
+                pdf->stats.creationdate->meta.length = strlen(out);
3603
+                pdf->stats.creationdate->meta.success = 1;
3588 3604
             }
3589 3605
         }
3606
+
3607
+        if (pdf->stats.creationdate->meta.success && cli_isutf8(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length)) {
3608
+            cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate->data);
3609
+        } else {
3610
+            char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
3611
+            cli_jsonstr(pdfobj, "CreationDate", b64);
3612
+            cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3613
+            free(b64);
3614
+        }
3590 3615
     }
3591 3616
     if (pdf->stats.title) {
3592
-        if (pdf->stats.title_b64) {
3593
-            cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3594
-            cli_jsonbool(pdfobj, "Title_base64", 1);
3595
-        } else {
3596
-            if (cli_isutf8(pdf->stats.title, strlen(pdf->stats.title)))
3597
-                cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3598
-            else {
3599
-                char *b64 = (char *)cl_base64_encode(pdf->stats.title, strlen(pdf->stats.title));
3600
-                cli_jsonstr(pdfobj, "Title", b64);
3601
-                cli_jsonbool(pdfobj, "Title_base64", 1);
3602
-                free(b64);
3617
+        if (!pdf->stats.title->meta.success) {
3618
+            char *out = pdf_finalize_string(pdf, pdf->stats.title->meta.obj, pdf->stats.title->data, pdf->stats.title->meta.length);
3619
+            if (out) {
3620
+                free(pdf->stats.title->data);
3621
+                pdf->stats.title->data = out;
3622
+                pdf->stats.title->meta.length = strlen(out);
3623
+                pdf->stats.title->meta.success = 1;
3603 3624
             }
3604 3625
         }
3626
+
3627
+        if (pdf->stats.title->meta.success && cli_isutf8(pdf->stats.title->data, pdf->stats.title->meta.length)) {
3628
+            cli_jsonstr(pdfobj, "Title", pdf->stats.title->data);
3629
+        } else {
3630
+            char *b64 = (char *)cl_base64_encode(pdf->stats.title->data, pdf->stats.title->meta.length);
3631
+            cli_jsonstr(pdfobj, "Title", b64);
3632
+            cli_jsonbool(pdfobj, "Title_base64", 1);
3633
+            free(b64);
3634
+        }
3605 3635
     }
3606 3636
     if (pdf->stats.subject) {
3607
-        if (pdf->stats.subject_b64) {
3608
-            cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3609
-            cli_jsonbool(pdfobj, "Subject_base64", 1);
3610
-        } else {
3611
-            if (cli_isutf8(pdf->stats.subject, strlen(pdf->stats.subject)))
3612
-                cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3613
-            else {
3614
-                char *b64 = (char *)cl_base64_encode(pdf->stats.subject, strlen(pdf->stats.subject));
3615
-                cli_jsonstr(pdfobj, "Subject", b64);
3616
-                cli_jsonbool(pdfobj, "Subject_base64", 1);
3617
-                free(b64);
3637
+        if (!pdf->stats.subject->meta.success) {
3638
+            char *out = pdf_finalize_string(pdf, pdf->stats.subject->meta.obj, pdf->stats.subject->data, pdf->stats.subject->meta.length);
3639
+            if (out) {
3640
+                free(pdf->stats.subject->data);
3641
+                pdf->stats.subject->data = out;
3642
+                pdf->stats.subject->meta.length = strlen(out);
3643
+                pdf->stats.subject->meta.success = 1;
3618 3644
             }
3619 3645
         }
3646
+
3647
+        if (pdf->stats.subject->meta.success && cli_isutf8(pdf->stats.subject->data, pdf->stats.subject->meta.length)) {
3648
+            cli_jsonstr(pdfobj, "Subject", pdf->stats.subject->data);
3649
+        } else {
3650
+            char *b64 = (char *)cl_base64_encode(pdf->stats.subject->data, pdf->stats.subject->meta.length);
3651
+            cli_jsonstr(pdfobj, "Subject", b64);
3652
+            cli_jsonbool(pdfobj, "Subject_base64", 1);
3653
+            free(b64);
3654
+        }
3620 3655
     }
3621 3656
     if (pdf->stats.keywords) {
3622
-        if (pdf->stats.keywords_b64) {
3623
-            cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3624
-            cli_jsonbool(pdfobj, "Keywords_base64", 1);
3625
-        } else {
3626
-            if (cli_isutf8(pdf->stats.keywords, strlen(pdf->stats.keywords)))
3627
-                cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3628
-            else {
3629
-                char *b64 = (char *)cl_base64_encode(pdf->stats.keywords, strlen(pdf->stats.keywords));
3630
-                cli_jsonstr(pdfobj, "Keywords", b64);
3631
-                cli_jsonbool(pdfobj, "Keywords_base64", 1);
3632
-                free(b64);
3657
+        if (!pdf->stats.keywords->meta.success) {
3658
+            char *out = pdf_finalize_string(pdf, pdf->stats.keywords->meta.obj, pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
3659
+            if (out) {
3660
+                free(pdf->stats.keywords->data);
3661
+                pdf->stats.keywords->data = out;
3662
+                pdf->stats.keywords->meta.length = strlen(out);
3663
+                pdf->stats.keywords->meta.success = 1;
3633 3664
             }
3634 3665
         }
3666
+
3667
+        if (pdf->stats.keywords->meta.success && cli_isutf8(pdf->stats.keywords->data, pdf->stats.keywords->meta.length)) {
3668
+            cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords->data);
3669
+        } else {
3670
+            char *b64 = (char *)cl_base64_encode(pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
3671
+            cli_jsonstr(pdfobj, "Keywords", b64);
3672
+            cli_jsonbool(pdfobj, "Keywords_base64", 1);
3673
+            free(b64);
3674
+        }
3635 3675
     }
3636 3676
     if (pdf->stats.ninvalidobjs)
3637 3677
         cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
... ...
@@ -3713,41 +3783,57 @@ static void pdf_export_json(struct pdf_struct *pdf)
3713 3713
 
3714 3714
 cleanup:
3715 3715
     if ((pdf->stats.author)) {
3716
+        if (pdf->stats.author->data)
3717
+            free(pdf->stats.author->data);
3716 3718
         free(pdf->stats.author);
3717 3719
         pdf->stats.author = NULL;
3718 3720
     }
3719 3721
 
3720 3722
     if (pdf->stats.creator) {
3723
+        if (pdf->stats.creator->data)
3724
+            free(pdf->stats.creator->data);
3721 3725
         free(pdf->stats.creator);
3722 3726
         pdf->stats.creator = NULL;
3723 3727
     }
3724 3728
 
3725 3729
     if (pdf->stats.producer) {
3730
+        if (pdf->stats.producer->data)
3731
+            free(pdf->stats.producer->data);
3726 3732
         free(pdf->stats.producer);
3727 3733
         pdf->stats.producer = NULL;
3728 3734
     }
3729 3735
 
3730 3736
     if (pdf->stats.modificationdate) {
3737
+        if (pdf->stats.modificationdate->data)
3738
+            free(pdf->stats.modificationdate->data);
3731 3739
         free(pdf->stats.modificationdate);
3732 3740
         pdf->stats.modificationdate = NULL;
3733 3741
     }
3734 3742
 
3735 3743
     if (pdf->stats.creationdate) {
3744
+        if (pdf->stats.creationdate->data)
3745
+            free(pdf->stats.creationdate->data);
3736 3746
         free(pdf->stats.creationdate);
3737 3747
         pdf->stats.creationdate = NULL;
3738 3748
     }
3739 3749
 
3740 3750
     if (pdf->stats.title) {
3751
+        if (pdf->stats.title->data)
3752
+            free(pdf->stats.title->data);
3741 3753
         free(pdf->stats.title);
3742 3754
         pdf->stats.title = NULL;
3743 3755
     }
3744 3756
 
3745 3757
     if (pdf->stats.subject) {
3758
+        if (pdf->stats.subject->data)
3759
+            free(pdf->stats.subject->data);
3746 3760
         free(pdf->stats.subject);
3747 3761
         pdf->stats.subject = NULL;
3748 3762
     }
3749 3763
 
3750 3764
     if (pdf->stats.keywords) {
3765
+        if (pdf->stats.keywords->data)
3766
+            free(pdf->stats.keywords->data);
3751 3767
         free(pdf->stats.keywords);
3752 3768
         pdf->stats.keywords = NULL;
3753 3769
     }
... ...
@@ -62,6 +62,17 @@ struct pdf_dict {
62 62
     struct pdf_dict_node *tail;
63 63
 };
64 64
 
65
+struct pdf_stats_entry { /* one document-information string (author, title, ...) plus its parse state */
66
+    char *data; /* heap buffer holding the string; freed in the pdf_export_json cleanup path */
67
+
68
+    /* filled in by pdf_parse_string when it is handed a non-NULL metadata pointer */
69
+    struct pdf_stats_metadata {
70
+        int length; /* byte length of data: strlen() of the finalized string, or the raw size when finalize failed */
71
+        struct pdf_obj *obj; /* object the string was parsed from; handed back to pdf_finalize_string on retry */
72
+        int success; /* 1 once pdf_finalize_string has produced data, 0 while data is still the raw copy */
73
+    } meta;
74
+};
75
+
65 76
 struct pdf_stats {
66 77
     int32_t ninvalidobjs;     /* Number of invalid objects */
67 78
     int32_t njs;              /* Number of javascript objects */
... ...
@@ -88,22 +99,14 @@ struct pdf_stats {
88 88
     int32_t nrichmedia;       /* Number of RichMedia objects */
89 89
     int32_t nacroform;        /* Number of AcroForm objects */
90 90
     int32_t nxfa;             /* Number of XFA objects */
91
-    char *author;             /* Author of the PDF */
92
-    int8_t author_b64;
93
-    char *creator;            /* Application used to create the PDF */
94
-    int8_t creator_b64;
95
-    char *producer;           /* Application used to produce the PDF */
96
-    int8_t producer_b64;
97
-    char *creationdate;       /* Date the PDF was created */
98
-    int8_t creationdate_b64;
99
-    char *modificationdate;   /* Date the PDF was modified */
100
-    int8_t modificationdate_b64;
101
-    char *title;              /* Title of the PDF */
102
-    int8_t title_b64;
103
-    char *subject;            /* Subject of the PDF */
104
-    int8_t subject_b64;
105
-    char *keywords;           /* Keywords of the PDF */
106
-    int8_t keywords_b64;
91
+    struct pdf_stats_entry *author;             /* Author of the PDF */
92
+    struct pdf_stats_entry *creator;            /* Application used to create the PDF */
93
+    struct pdf_stats_entry *producer;           /* Application used to produce the PDF */
94
+    struct pdf_stats_entry *creationdate;       /* Date the PDF was created */
95
+    struct pdf_stats_entry *modificationdate;   /* Date the PDF was modified */
96
+    struct pdf_stats_entry *title;              /* Title of the PDF */
97
+    struct pdf_stats_entry *subject;            /* Subject of the PDF */
98
+    struct pdf_stats_entry *keywords;           /* Keywords of the PDF */
107 99
 };
108 100
 
109 101
 
... ...
@@ -156,7 +159,8 @@ void pdf_handle_enc(struct pdf_struct *pdf);
156 156
 char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method);
157 157
 enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj);
158 158
 
159
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *b64);
159
+char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len);
160
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, struct pdf_stats_metadata *stats);
160 161
 struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
161 162
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
162 163
 int is_object_reference(char *begin, char **endchar, uint32_t *id);
... ...
@@ -237,12 +237,15 @@ static char *pdf_decrypt_string(struct pdf_struct *pdf, struct pdf_obj *obj, con
237 237
     return NULL;
238 238
 }
239 239
 
240
-static char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len)
240
+char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *in, size_t len)
241 241
 {
242 242
     char *wrkstr, *output = NULL;
243 243
     size_t wrklen = len, outlen;
244 244
     unsigned int i, likelyutf = 0;
245 245
 
246
+    if (!in)
247
+        return NULL;
248
+
246 249
     /* get a working copy */
247 250
     wrkstr = cli_calloc(len+1, sizeof(char));
248 251
     if (!wrkstr)
... ...
@@ -363,7 +366,7 @@ static char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, co
363 363
     return wrkstr;
364 364
 }
365 365
 
366
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *b64)
366
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, struct pdf_stats_metadata *meta)
367 367
 {
368 368
     const char *q = objstart;
369 369
     char *p1, *p2;
... ...
@@ -493,14 +496,27 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
493 493
             switch (*p3) {
494 494
                 case '(':
495 495
                 case '<':
496
-                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL, b64);
496
+                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL, meta);
497 497
                     break;
498 498
                 default:
499 499
                     res = pdf_finalize_string(pdf, obj, begin, objsize2);
500 500
                     if (!res) {
501
-                        res = (char *)cl_base64_encode(begin, objsize2);
502
-                        if (b64) *b64 = 1;
503
-                    } 
501
+                        res = cli_calloc(1, objsize2+1);
502
+                        if (!(res))
503
+                            return NULL;
504
+                        memcpy(res, begin, objsize2);
505
+                        res[objsize2] = '\0';
506
+
507
+                        if (meta) {
508
+                            meta->length = objsize2;
509
+                            meta->obj = obj;
510
+                            meta->success = 0;
511
+                        }
512
+                    } else if (meta) {
513
+                        meta->length = strlen(res);
514
+                        meta->obj = obj;
515
+                        meta->success = 1;
516
+                    }
504 517
             }
505 518
             free(begin);
506 519
         }
... ...
@@ -571,8 +587,21 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
571 571
 
572 572
     res = pdf_finalize_string(pdf, obj, p1, len);
573 573
     if (!res) {
574
-        res = (char *)cl_base64_encode(p1, len);
575
-        if (b64) *b64 = 1;
574
+        res = cli_calloc(1, len+1);
575
+        if (!(res))
576
+            return NULL;
577
+        memcpy(res, p1, len);
578
+        res[len] = '\0';
579
+
580
+        if (meta) {
581
+            meta->length = len;
582
+            meta->obj = obj;
583
+            meta->success = 0;
584
+        }
585
+    } else if (meta) {
586
+        meta->length = strlen(res);
587
+        meta->obj = obj;
588
+        meta->success = 1;
576 589
     }
577 590
 
578 591
     if (res && endchar)