Browse code

Revert "pdf strings are now base64 encoded if utf conversion fails"

This reverts commit 6c3cc09415c478fe1869c796fca035a8c6da2a5b.

Kevin Lin authored on 2015/03/03 09:05:09
Showing 3 changed files
... ...
@@ -3215,7 +3215,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam
3215 3215
         return;
3216 3216
 
3217 3217
     if (!(pdf->stats.author))
3218
-        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &pdf->stats.author_base64);
3218
+        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL);
3219 3219
 }
3220 3220
 #endif
3221 3221
 
... ...
@@ -3231,7 +3231,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3231 3231
         return;
3232 3232
 
3233 3233
     if (!(pdf->stats.creator))
3234
-        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &pdf->stats.creator_base64);
3234
+        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL);
3235 3235
 }
3236 3236
 #endif
3237 3237
 
... ...
@@ -3247,7 +3247,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
3247 3247
         return;
3248 3248
 
3249 3249
     if (!(pdf->stats.modificationdate))
3250
-        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &pdf->stats.modificationdate_base64);
3250
+        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL);
3251 3251
 }
3252 3252
 #endif
3253 3253
 
... ...
@@ -3263,7 +3263,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
3263 3263
         return;
3264 3264
 
3265 3265
     if (!(pdf->stats.creationdate))
3266
-        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &pdf->stats.creationdate_base64);
3266
+        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL);
3267 3267
 }
3268 3268
 #endif
3269 3269
 
... ...
@@ -3279,7 +3279,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3279 3279
         return;
3280 3280
 
3281 3281
     if (!(pdf->stats.producer))
3282
-        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &pdf->stats.producer_base64);
3282
+        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL);
3283 3283
 }
3284 3284
 #endif
3285 3285
 
... ...
@@ -3295,7 +3295,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname
3295 3295
         return;
3296 3296
 
3297 3297
     if (!(pdf->stats.title))
3298
-        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &pdf->stats.title_base64);
3298
+        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL);
3299 3299
 }
3300 3300
 #endif
3301 3301
 
... ...
@@ -3311,7 +3311,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn
3311 3311
         return;
3312 3312
 
3313 3313
     if (!(pdf->stats.keywords))
3314
-        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &pdf->stats.keywords_base64);
3314
+        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL);
3315 3315
 }
3316 3316
 #endif
3317 3317
 
... ...
@@ -3327,7 +3327,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
3327 3327
         return;
3328 3328
 
3329 3329
     if (!(pdf->stats.subject))
3330
-        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &pdf->stats.subject_base64);
3330
+        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL);
3331 3331
 }
3332 3332
 #endif
3333 3333
 
... ...
@@ -3511,46 +3511,22 @@ static void pdf_export_json(struct pdf_struct *pdf)
3511 3511
         goto cleanup;
3512 3512
     }
3513 3513
 
3514
-    if (pdf->stats.author) {
3514
+    if (pdf->stats.author)
3515 3515
         cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3516
-        if (pdf->stats.author_base64)
3517
-            cli_jsonbool(pdfobj, "Author_base64", 1);
3518
-    }
3519
-    if (pdf->stats.creator) {
3516
+    if (pdf->stats.creator)
3520 3517
         cli_jsonstr(pdfobj, "Creator", pdf->stats.creator);
3521
-        if (pdf->stats.creator_base64)
3522
-            cli_jsonbool(pdfobj, "Creator_base64", 1);
3523
-    }
3524
-    if (pdf->stats.producer) {
3518
+    if (pdf->stats.producer)
3525 3519
         cli_jsonstr(pdfobj, "Producer", pdf->stats.producer);
3526
-        if (pdf->stats.producer_base64)
3527
-            cli_jsonbool(pdfobj, "Producer_base64", 1);
3528
-    }
3529
-    if (pdf->stats.modificationdate) {
3520
+    if (pdf->stats.modificationdate)
3530 3521
         cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate);
3531
-        if (pdf->stats.modificationdate_base64)
3532
-            cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
3533
-    }
3534
-    if (pdf->stats.creationdate) {
3522
+    if (pdf->stats.creationdate)
3535 3523
         cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate);
3536
-        if (pdf->stats.creationdate_base64)
3537
-            cli_jsonbool(pdfobj, "CreationDate_base64", 1);
3538
-    }
3539
-    if (pdf->stats.title) {
3524
+    if (pdf->stats.title)
3540 3525
         cli_jsonstr(pdfobj, "Title", pdf->stats.title);
3541
-        if (pdf->stats.title_base64)
3542
-            cli_jsonbool(pdfobj, "Title_base64", 1);
3543
-    }
3544
-    if (pdf->stats.subject) {
3526
+    if (pdf->stats.subject)
3545 3527
         cli_jsonstr(pdfobj, "Subject", pdf->stats.subject);
3546
-        if (pdf->stats.subject_base64)
3547
-            cli_jsonbool(pdfobj, "Subject_base64", 1);
3548
-    }
3549
-    if (pdf->stats.keywords) {
3528
+    if (pdf->stats.keywords)
3550 3529
         cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords);
3551
-        if (pdf->stats.keywords_base64)
3552
-            cli_jsonbool(pdfobj, "Keywords_base64", 1);
3553
-    }
3554 3530
     if (pdf->stats.ninvalidobjs)
3555 3531
         cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
3556 3532
     if (pdf->stats.njs)
... ...
@@ -89,21 +89,13 @@ struct pdf_stats {
89 89
     int32_t nacroform;        /* Number of AcroForm objects */
90 90
     int32_t nxfa;             /* Number of XFA objects */
91 91
     char *author;             /* Author of the PDF */
92
-    int8_t author_base64;     /* Author string is base64 encoded */
93 92
     char *creator;            /* Application used to create the PDF */
94
-    int8_t creator_base64;    /* Author string is base64 encoded */
95 93
     char *producer;           /* Application used to produce the PDF */
96
-    int8_t producer_base64;   /* Application string is base64 encoded */
97
-    char *creationdate;         /* Date the PDF was created */
98
-    int8_t creationdate_base64; /* Date of creation string is base64 encoded */
99
-    char *modificationdate;         /* Date the PDF was modified */
100
-    int8_t modificationdate_base64; /* Date of modification string is base64 encoded */
94
+    char *creationdate;       /* Date the PDF was created */
95
+    char *modificationdate;   /* Date the PDF was modified */
101 96
     char *title;              /* Title of the PDF */
102
-    int8_t title_base64;      /* Title string is base64 encoded */
103 97
     char *subject;            /* Subject of the PDF */
104
-    int8_t subject_base64;    /* Subject string is base64 encoded */
105 98
     char *keywords;           /* Keywords of the PDF */
106
-    int8_t keywords_base64;   /* Keywords string is base64 encoded */
107 99
 };
108 100
 
109 101
 
... ...
@@ -152,7 +144,7 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
152 152
 int pdf_findobj(struct pdf_struct *pdf);
153 153
 struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid);
154 154
 
155
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *base64);
155
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
156 156
 struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
157 157
 struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
158 158
 int is_object_reference(char *begin, char **endchar, uint32_t *id);
... ...
@@ -188,6 +188,7 @@ char *pdf_convert_utf(char *begin, size_t sz)
188 188
 #endif
189 189
     free(buf);
190 190
     free(outbuf);
191
+
191 192
     return res;
192 193
 }
193 194
 
... ...
@@ -275,7 +276,7 @@ int is_object_reference(char *begin, char **endchar, uint32_t *id)
275 275
     return 0;
276 276
 }
277 277
 
278
-char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar, int8_t *base64)
278
+char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
279 279
 {
280 280
     const char *q = objstart;
281 281
     char *p1, *p2;
... ...
@@ -408,7 +409,7 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
408 408
             switch (*p3) {
409 409
                 case '(':
410 410
                 case '<':
411
-                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL, NULL);
411
+                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL);
412 412
                     free(begin);
413 413
                     break;
414 414
                 default:
... ...
@@ -422,14 +423,8 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
422 422
                     res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL;
423 423
 
424 424
                     if (!(res)) {
425
-                        if (base64) {
426
-                            res = (char*)cl_base64_encode(p1, len);
427
-                            if (res)
428
-                                *base64 = 1;
429
-                        } else {
430
-                            res = begin;
431
-                            res[objsize2] = '\0';
432
-                        }
425
+                        res = begin;
426
+                        res[objsize2] = '\0';
433 427
                     } else {
434 428
                         free(begin);
435 429
                     }
... ...
@@ -518,11 +513,6 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
518 518
     }
519 519
 
520 520
     res = pdf_convert_utf(p1, len);
521
-    if (!res && base64) {
522
-        res = (char*)cl_base64_encode(p1, len);
523
-        if (res)
524
-            *base64 = 1;
525
-    }
526 521
 
527 522
     if (res && endchar)
528 523
         *endchar = p2;
... ...
@@ -684,7 +674,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
684 684
 
685 685
         switch (begin[0]) {
686 686
             case '(':
687
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
687
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
688 688
                 begin = p1+2;
689 689
                 break;
690 690
             case '[':
... ...
@@ -700,7 +690,7 @@ struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, siz
700 700
                     }
701 701
                 }
702 702
 
703
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1, NULL);
703
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
704 704
                 begin = p1+2;
705 705
                 break;
706 706
             default:
... ...
@@ -882,7 +872,7 @@ struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, s
882 882
 
883 883
                 /* Not a dictionary. Intentionally fall through. */
884 884
             case '(':
885
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin, NULL);
885
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin);
886 886
                 begin += 2;
887 887
                 break;
888 888
             case '[':