Browse code

Squash a few bugs. Enhance the code that pulls out indirect object references with strings. Get at more PDF properties.

Shawn Webb authored on 2014/06/14 09:40:46
Showing 1 changed file
... ...
@@ -2895,13 +2895,6 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2895 2895
     int likelyutf = 0;
2896 2896
     unsigned int i;
2897 2897
 
2898
-    if (objsize < strlen(str) + 3)
2899
-        return NULL;
2900
-
2901
-    res = NULL;
2902
-
2903
-    checklen = strlen(str);
2904
-
2905 2898
     /* Yes, all of this is required to find the start and end of a potentially UTF-* string
2906 2899
      *
2907 2900
      * First, find the key of the key/value pair we're looking for in this object.
... ...
@@ -2911,32 +2904,41 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2911 2911
      * Fourth, Attempt to decode from UTF-* to ASCII
2912 2912
      */
2913 2913
 
2914
-    for (p1=(char *)q; (p1 - q) < objsize-checklen; p1++)
2915
-        if (!strncmp(p1, str, checklen))
2916
-            break;
2914
+    res = NULL;
2917 2915
 
2918
-    if (p1 - q > objsize - checklen || strncmp(p1, str, checklen))
2919
-        return NULL;
2916
+    if (str) {
2917
+        checklen = strlen(str);
2920 2918
 
2921
-    p1 += checklen;
2919
+        if (objsize < strlen(str) + 3)
2920
+            return NULL;
2922 2921
 
2923
-    while ((p1 - q) < objsize && *p1 != '(') {
2924
-        if (!isspace(p1[0]))
2925
-            break;
2926 2922
 
2927
-        p1++;
2923
+        for (p1=(char *)q; (p1 - q) < objsize-checklen; p1++)
2924
+            if (!strncmp(p1, str, checklen))
2925
+                break;
2926
+
2927
+        if (p1 - q == objsize - checklen || strncmp(p1, str, checklen))
2928
+            return NULL;
2929
+
2930
+        p1 += checklen;
2931
+    } else {
2932
+        p1 = q;
2928 2933
     }
2929 2934
 
2935
+    while ((p1 - q) < objsize && isspace(p1[0]))
2936
+        p1++;
2937
+
2930 2938
     if ((p1 - q) == objsize)
2931 2939
         return NULL;
2932 2940
 
2933 2941
     /* We should be at the start of the string, minus 1 */
2934 2942
 
2935 2943
     if (isdigit(p1[0])) {
2936
-        unsigned long objnum;
2944
+        unsigned long objnum, revnum;
2937 2945
         struct pdf_obj *newobj;
2938 2946
         char *end, *begin;
2939 2947
         STATBUF sb;
2948
+        uint32_t objflags;
2940 2949
         int fd;
2941 2950
 
2942 2951
         /*
... ...
@@ -2949,7 +2951,17 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2949 2949
         if ((end - p1) == 0)
2950 2950
             return NULL;
2951 2951
 
2952
-        newobj = find_obj(pdf, obj, objnum);
2952
+        /* Skip whitespace and get the revision number */
2953
+        p1 = end+1;
2954
+        while (p1 - q < objsize && isspace(p1[0]))
2955
+            p1++;
2956
+
2957
+        if (p1 - q == objsize)
2958
+            return NULL;
2959
+
2960
+        revnum = strtoul(p1, &end, 10);
2961
+
2962
+        newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff));
2953 2963
         if (!(newobj))
2954 2964
             return NULL;
2955 2965
 
... ...
@@ -2963,9 +2975,15 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2963 2963
         if (!(newobj->statsflags & OBJ_FLAG_PDFNAME_DONE))
2964 2964
             pdf_parseobj(pdf, newobj);
2965 2965
 
2966
+        /* Extract the object. Force pdf_extract_obj() to dump this object. */
2967
+        objflags = newobj->flags;
2968
+        newobj->flags |= (1 << OBJ_FORCEDUMP);
2969
+
2966 2970
         if (pdf_extract_obj(pdf, newobj, PDF_EXTRACT_OBJ_NONE) != CL_SUCCESS)
2967 2971
             return NULL;
2968 2972
 
2973
+        newobj->flags = objflags;
2974
+
2969 2975
         if (!(newobj->path))
2970 2976
             return NULL;
2971 2977
 
... ...
@@ -2982,23 +3000,35 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2982 2982
             free(newobj->path);
2983 2983
         }
2984 2984
 
2985
-        begin = calloc(1, sb.st_size);
2986
-        if (!(begin)) {
2987
-            close(fd);
2988
-            cli_unlink(newobj->path);
2989
-            free(newobj->path);
2990
-        }
2985
+        if (sb.st_size) {
2986
+            begin = calloc(1, sb.st_size);
2987
+            if (!(begin)) {
2988
+                close(fd);
2989
+                cli_unlink(newobj->path);
2990
+                free(newobj->path);
2991
+            }
2991 2992
 
2992
-        read(fd, res, sb.st_size);
2993
-        res = pdf_convert_utf(buf, sb.st_size);
2994
-        if (!(res))
2995
-            res = begin;
2996
-        else
2997
-            free(begin);
2993
+            read(fd, begin, sb.st_size);
2994
+
2995
+            switch (begin[0]) {
2996
+                case '(':
2997
+                case '<':
2998
+                    res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL);
2999
+                    free(begin);
3000
+                    break;
3001
+                default:
3002
+                    res = pdf_convert_utf(begin, sb.st_size);
3003
+                    if (!(res))
3004
+                        res = begin;
3005
+                    else
3006
+                        free(begin);
3007
+            }
3008
+        }
2998 3009
 
2999 3010
         close(fd);
3000 3011
         cli_unlink(newobj->path);
3001 3012
         free(newobj->path);
3013
+        newobj->path = NULL;
3002 3014
 
3003 3015
         return res;
3004 3016
     }
... ...
@@ -3064,7 +3094,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3064 3064
 
3065 3065
     len = inlen = outlen = (size_t)(p2 - p1) + 1;
3066 3066
 
3067
-    if (likelyutf == 0) {
3067
+    if (likelyutf == 0 && len) {
3068 3068
         /* We're not UTF-*, so just make a copy of the string and return that */
3069 3069
         res = cli_calloc(1, len);
3070 3070
         if (!(res))
... ...
@@ -3074,7 +3104,8 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3074 3074
         return res;
3075 3075
     }
3076 3076
 
3077
-    res = pdf_convert_utf(p1, len);
3077
+    if (len)
3078
+        res = pdf_convert_utf(p1, len);
3078 3079
 
3079 3080
     return res;
3080 3081
 }
... ...
@@ -3146,10 +3177,27 @@ static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struc
3146 3146
 
3147 3147
 static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
3148 3148
 {
3149
-     if (!(pdf))
3150
-         return;
3149
+#if HAVE_JSON
3150
+    struct json_object *pdfobj, *jbig2arr, *jbig2obj;
3151 3151
 
3152
-     pdf->stats.njbig2decode++;
3152
+    if (!(pdf))
3153
+        return;
3154
+
3155
+    if (!(pdf->ctx->wrkproperty))
3156
+        return;
3157
+
3158
+    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
3159
+    if (!(pdfobj))
3160
+        return;
3161
+
3162
+    jbig2arr = cli_jsonarray(pdfobj, "JBIG2Objects");
3163
+    if (!(jbig2arr))
3164
+        return;
3165
+
3166
+    cli_jsonint_array(jbig2arr, obj->id>>8);
3167
+
3168
+    pdf->stats.njbig2decode++;
3169
+#endif
3153 3170
 }
3154 3171
 
3155 3172
 static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
... ...
@@ -3358,13 +3406,11 @@ static void pdf_export_json(struct pdf_struct *pdf)
3358 3358
         goto cleanup;
3359 3359
     }
3360 3360
 
3361
-    pdfobj = json_object_new_object();
3361
+    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
3362 3362
     if (!(pdfobj)) {
3363 3363
         goto cleanup;
3364 3364
     }
3365 3365
 
3366
-    json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj);
3367
-
3368 3366
     if (pdf->stats.author)
3369 3367
         cli_jsonstr(pdfobj, "Author", pdf->stats.author);
3370 3368
     if (pdf->stats.creator)