Browse code

pdf: base64 encode strings that fail to finalize

Kevin Lin authored on 2015/03/21 05:36:41
Showing 3 changed files
... ...
@@ -3217,7 +3217,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3217 3217
         return;
3218 3218
 
3219 3219
     if (!(pdf->stats.author))
3220
-        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL);
3220
+        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author_b64));
3221 3221
 }
3222 3222
 #endif
3223 3223
 
... ...
@@ -3233,7 +3233,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3233 3233
         return;
3234 3234
 
3235 3235
     if (!(pdf->stats.creator))
3236
-        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL);
3236
+        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator_b64));
3237 3237
 }
3238 3238
 #endif
3239 3239
 
... ...
@@ -3249,7 +3249,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
3249 3249
         return;
3250 3250
 
3251 3251
     if (!(pdf->stats.modificationdate))
3252
-        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL);
3252
+        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate_b64));
3253 3253
 }
3254 3254
 #endif
3255 3255
 
... ...
@@ -3265,7 +3265,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
3265 3265
         return;
3266 3266
 
3267 3267
     if (!(pdf->stats.creationdate))
3268
-        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL);
3268
+        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate_b64));
3269 3269
 }
3270 3270
 #endif
3271 3271
 
... ...
@@ -3281,7 +3281,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3281 3281
         return;
3282 3282
 
3283 3283
     if (!(pdf->stats.producer))
3284
-        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL);
3284
+        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer_b64));
3285 3285
 }
3286 3286
 #endif
3287 3287
 
... ...
@@ -3297,7 +3297,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3297 3297
         return;
3298 3298
 
3299 3299
     if (!(pdf->stats.title))
3300
-        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL);
3300
+        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title_b64));
3301 3301
 }
3302 3302
 #endif
3303 3303
 
... ...
@@ -3313,7 +3313,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3313 3313
         return;
3314 3314
 
3315 3315
     if (!(pdf->stats.keywords))
3316
-        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL);
3316
+        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords_b64));
3317 3317
 }
3318 3318
 #endif
3319 3319
 
... ...
@@ -3329,7 +3329,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3329 3329
         return;
3330 3330
 
3331 3331
     if (!(pdf->stats.subject))
3332
-        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL);
3332
+        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject_b64));
3333 3333
 }
3334 3334
 #endif
3335 3335
 
... ...
@@ -3514,83 +3514,123 @@ static void pdf_export_json(struct pdf_struct *pdf)
3514 3514
     }
3515 3515
 
3516 3516
     if (pdf->stats.author) {
3517
-        if (cli_isutf8(pdf->stats.author, strlen(pdf->stats.author)))
3517
+        if (pdf->stats.author_b64) {
3518 3518
             cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3519
-        else {
3520
-            char *b64 = (char *)cl_base64_encode(pdf->stats.author, strlen(pdf->stats.author));
3521
-            cli_jsonstr(pdfobj, "Author", b64);
3522 3519
             cli_jsonbool(pdfobj, "Author_base64", 1);
3523
-            free(b64);
3520
+        } else {
3521
+            if (cli_isutf8(pdf->stats.author, strlen(pdf->stats.author)))
3522
+                cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3523
+            else {
3524
+                char *b64 = (char *)cl_base64_encode(pdf->stats.author, strlen(pdf->stats.author));
3525
+                cli_jsonstr(pdfobj, "Author", b64);
3526
+                cli_jsonbool(pdfobj, "Author_base64", 1);
3527
+                free(b64);
3528
+            }
3524 3529
         }
3525 3530
     }
3526 3531
     if (pdf->stats.creator) {
3527
-        if (cli_isutf8(pdf->stats.creator, strlen(pdf->stats.creator)))
3532
+        if (pdf->stats.creator_b64) {
3528 3533
             cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3529
-        else {
3530
-            char *b64 = (char *)cl_base64_encode(pdf->stats.creator, strlen(pdf->stats.creator));
3531
-            cli_jsonstr(pdfobj, "Creator", b64);
3532 3534
             cli_jsonbool(pdfobj, "Creator_base64", 1);
3533
-            free(b64);
3535
+        } else {
3536
+            if (cli_isutf8(pdf->stats.creator, strlen(pdf->stats.creator)))
3537
+                cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3538
+            else {
3539
+                char *b64 = (char *)cl_base64_encode(pdf->stats.creator, strlen(pdf->stats.creator));
3540
+                cli_jsonstr(pdfobj, "Creator", b64);
3541
+                cli_jsonbool(pdfobj, "Creator_base64", 1);
3542
+                free(b64);
3543
+            }
3534 3544
         }
3535 3545
     }
3536 3546
     if (pdf->stats.producer) {
3537
-        if (cli_isutf8(pdf->stats.producer, strlen(pdf->stats.producer)))
3547
+        if (pdf->stats.producer_b64) {
3538 3548
             cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3539
-        else {
3540
-            char *b64 = (char *)cl_base64_encode(pdf->stats.producer, strlen(pdf->stats.producer));
3541
-            cli_jsonstr(pdfobj, "Producer", b64);
3542 3549
             cli_jsonbool(pdfobj, "Producer_base64", 1);
3543
-            free(b64);
3550
+        } else {
3551
+            if (cli_isutf8(pdf->stats.producer, strlen(pdf->stats.producer)))
3552
+                cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3553
+            else {
3554
+                char *b64 = (char *)cl_base64_encode(pdf->stats.producer, strlen(pdf->stats.producer));
3555
+                cli_jsonstr(pdfobj, "Producer", b64);
3556
+                cli_jsonbool(pdfobj, "Producer_base64", 1);
3557
+                free(b64);
3558
+            }
3544 3559
         }
3545 3560
     }
3546 3561
     if (pdf->stats.modificationdate) {
3547
-        if (cli_isutf8(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate)))
3562
+        if (pdf->stats.modificationdate_b64) {
3548 3563
             cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3549
-        else {
3550
-            char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate));
3551
-            cli_jsonstr(pdfobj, "ModificationDate", b64);
3552 3564
             cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3553
-            free(b64);
3565
+        } else {
3566
+            if (cli_isutf8(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate)))
3567
+                cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3568
+            else {
3569
+                char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate));
3570
+                cli_jsonstr(pdfobj, "ModificationDate", b64);
3571
+                cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3572
+                free(b64);
3573
+            }
3554 3574
         }
3555 3575
     }
3556 3576
     if (pdf->stats.creationdate) {
3557
-        if (cli_isutf8(pdf->stats.creationdate, strlen(pdf->stats.creationdate)))
3577
+        if (pdf->stats.creationdate_b64) {
3558 3578
             cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3559
-        else {
3560
-            char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate, strlen(pdf->stats.creationdate));
3561
-            cli_jsonstr(pdfobj, "CreationDate", b64);
3562 3579
             cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3563
-            free(b64);
3580
+        } else {
3581
+            if (cli_isutf8(pdf->stats.creationdate, strlen(pdf->stats.creationdate)))
3582
+                cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3583
+            else {
3584
+                char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate, strlen(pdf->stats.creationdate));
3585
+                cli_jsonstr(pdfobj, "CreationDate", b64);
3586
+                cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3587
+                free(b64);
3588
+            }
3564 3589
         }
3565 3590
     }
3566 3591
     if (pdf->stats.title) {
3567
-        if (cli_isutf8(pdf->stats.title, strlen(pdf->stats.title)))
3592
+        if (pdf->stats.title_b64) {
3568 3593
             cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3569
-        else {
3570
-            char *b64 = (char *)cl_base64_encode(pdf->stats.title, strlen(pdf->stats.title));
3571
-            cli_jsonstr(pdfobj, "Title", b64);
3572 3594
             cli_jsonbool(pdfobj, "Title_base64", 1);
3573
-            free(b64);
3595
+        } else {
3596
+            if (cli_isutf8(pdf->stats.title, strlen(pdf->stats.title)))
3597
+                cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3598
+            else {
3599
+                char *b64 = (char *)cl_base64_encode(pdf->stats.title, strlen(pdf->stats.title));
3600
+                cli_jsonstr(pdfobj, "Title", b64);
3601
+                cli_jsonbool(pdfobj, "Title_base64", 1);
3602
+                free(b64);
3603
+            }
3574 3604
         }
3575 3605
     }
3576 3606
     if (pdf->stats.subject) {
3577
-        if (cli_isutf8(pdf->stats.subject, strlen(pdf->stats.subject)))
3607
+        if (pdf->stats.subject_b64) {
3578 3608
             cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3579
-        else {
3580
-            char *b64 = (char *)cl_base64_encode(pdf->stats.subject, strlen(pdf->stats.subject));
3581
-            cli_jsonstr(pdfobj, "Subject", b64);
3582 3609
             cli_jsonbool(pdfobj, "Subject_base64", 1);
3583
-            free(b64);
3610
+        } else {
3611
+            if (cli_isutf8(pdf->stats.subject, strlen(pdf->stats.subject)))
3612
+                cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3613
+            else {
3614
+                char *b64 = (char *)cl_base64_encode(pdf->stats.subject, strlen(pdf->stats.subject));
3615
+                cli_jsonstr(pdfobj, "Subject", b64);
3616
+                cli_jsonbool(pdfobj, "Subject_base64", 1);
3617
+                free(b64);
3618
+            }
3584 3619
         }
3585 3620
     }
3586 3621
     if (pdf->stats.keywords) {
3587
-        if (cli_isutf8(pdf->stats.keywords, strlen(pdf->stats.keywords)))
3622
+        if (pdf->stats.keywords_b64) {
3588 3623
             cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3589
-        else {
3590
-            char *b64 = (char *)cl_base64_encode(pdf->stats.keywords, strlen(pdf->stats.keywords));
3591
-            cli_jsonstr(pdfobj, "Keywords", b64);
3592 3624
             cli_jsonbool(pdfobj, "Keywords_base64", 1);
3593
-            free(b64);
3625
+        } else {
3626
+            if (cli_isutf8(pdf->stats.keywords, strlen(pdf->stats.keywords)))
3627
+                cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3628
+            else {
3629
+                char *b64 = (char *)cl_base64_encode(pdf->stats.keywords, strlen(pdf->stats.keywords));
3630
+                cli_jsonstr(pdfobj, "Keywords", b64);
3631
+                cli_jsonbool(pdfobj, "Keywords_base64", 1);
3632
+                free(b64);
3633
+            }
3594 3634
         }
3595 3635
     }
3596 3636
     if (pdf->stats.ninvalidobjs)
... ...
@@ -89,13 +89,21 @@ struct pdf_stats {
89 89
     int32_t nacroform;        /* Number of AcroForm objects */
90 90
     int32_t nxfa;             /* Number of XFA objects */
91 91
     char *author;             /* Author of the PDF */
92
+    int8_t author_b64;
92 93
     char *creator;            /* Application used to create the PDF */
94
+    int8_t creator_b64;
93 95
     char *producer;           /* Application used to produce the PDF */
96
+    int8_t producer_b64;
94 97
     char *creationdate;       /* Date the PDF was created */
98
+    int8_t creationdate_b64;
95 99
     char *modificationdate;   /* Date the PDF was modified */
100
+    int8_t modificationdate_b64;
96 101
     char *title;              /* Title of the PDF */
102
+    int8_t title_b64;
97 103
     char *subject;            /* Subject of the PDF */
104
+    int8_t subject_b64;
98 105
     char *keywords;           /* Keywords of the PDF */
106
+    int8_t keywords_b64;
99 107
 };
100 108
 
101 109
 
... ...
@@ -148,7 +156,7 @@ void pdf_handle_enc(struct pdf_struct *pdf);
148 148
 char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, off_t *length, enum enc_method enc_method);
149 149
 enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj);
150 150
 
151
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
151
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *b64);
152 152
 struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
153 153
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
154 154
 int is_object_reference(char *begin, char **endchar, uint32_t *id);
... ...
@@ -356,7 +356,7 @@ static char *pdf_finalize_string(struct pdf_struct *pdf, struct pdf_obj *obj, co
356 356
     return wrkstr;
357 357
 }
358 358
 
359
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
359
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *b64)
360 360
 {
361 361
     const char *q = objstart;
362 362
     char *p1, *p2;
... ...
@@ -486,18 +486,16 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
486 486
             switch (*p3) {
487 487
                 case '(':
488 488
                 case '<':
489
-                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL);
490
-                    free(begin);
489
+                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL, b64);
491 490
                     break;
492 491
                 default:
493 492
                     res = pdf_finalize_string(pdf, obj, begin, objsize2);
494 493
                     if (!res) {
495
-                        /* WE NEED TO BASE64 ENCODE IT! */
496
-                        return NULL; /* for now, just return NULL */
497
-                    } else {
498
-                        free(begin);
499
-                    }
494
+                        res = (char *)cl_base64_encode(begin, objsize2);
495
+                        if (b64) *b64 = 1;
496
+                    } 
500 497
             }
498
+            free(begin);
501 499
         }
502 500
 
503 501
         close(fd);
... ...
@@ -564,20 +562,16 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
564 564
 
565 565
     len = (size_t)(p2 - p1) + 1;
566 566
 
567
-    /* EXPERIMENTAL */
568
-
569 567
     res = pdf_finalize_string(pdf, obj, p1, len);
570 568
     if (!res) {
571
-        /* WE NEED TO BASE64 ENCODE IT! */
572
-        return NULL; /* for now, just return NULL */
569
+        res = (char *)cl_base64_encode(p1, len);
570
+        if (b64) *b64 = 1;
573 571
     }
574 572
 
575 573
     if (res && endchar)
576 574
         *endchar = p2;
577 575
 
578 576
     return res;
579
-
580
-    /* EXPERIMENTAL */
581 577
 }
582 578
 
583 579
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
... ...
@@ -734,7 +728,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
734 734
 
735 735
         switch (begin[0]) {
736 736
             case '(':
737
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
737
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
738 738
                 begin = p1+2;
739 739
                 break;
740 740
             case '[':
... ...
@@ -750,7 +744,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
750 750
                     }
751 751
                 }
752 752
 
753
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
753
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
754 754
                 begin = p1+2;
755 755
                 break;
756 756
             default:
... ...
@@ -932,7 +926,7 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
932 932
 
933 933
                 /* Not a dictionary. Intentionally fall through. */
934 934
             case '(':
935
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin);
935
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin, NULL);
936 936
                 begin += 2;
937 937
                 break;
938 938
             case '[':