... | ... |
@@ -2895,13 +2895,6 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
2895 | 2895 |
int likelyutf = 0; |
2896 | 2896 |
unsigned int i; |
2897 | 2897 |
|
2898 |
- if (objsize < strlen(str) + 3) |
|
2899 |
- return NULL; |
|
2900 |
- |
|
2901 |
- res = NULL; |
|
2902 |
- |
|
2903 |
- checklen = strlen(str); |
|
2904 |
- |
|
2905 | 2898 |
/* Yes, all of this is required to find the start and end of a potentially UTF-* string |
2906 | 2899 |
* |
2907 | 2900 |
* First, find the key of the key/value pair we're looking for in this object. |
... | ... |
@@ -2911,32 +2904,41 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
2911 | 2911 |
* Fourth, Attempt to decode from UTF-* to ASCII |
2912 | 2912 |
*/ |
2913 | 2913 |
|
2914 |
- for (p1=(char *)q; (p1 - q) < objsize-checklen; p1++) |
|
2915 |
- if (!strncmp(p1, str, checklen)) |
|
2916 |
- break; |
|
2914 |
+ res = NULL; |
|
2917 | 2915 |
|
2918 |
- if (p1 - q > objsize - checklen || strncmp(p1, str, checklen)) |
|
2919 |
- return NULL; |
|
2916 |
+ if (str) { |
|
2917 |
+ checklen = strlen(str); |
|
2920 | 2918 |
|
2921 |
- p1 += checklen; |
|
2919 |
+ if (objsize < strlen(str) + 3) |
|
2920 |
+ return NULL; |
|
2922 | 2921 |
|
2923 |
- while ((p1 - q) < objsize && *p1 != '(') { |
|
2924 |
- if (!isspace(p1[0])) |
|
2925 |
- break; |
|
2926 | 2922 |
|
2927 |
- p1++; |
|
2923 |
+ for (p1=(char *)q; (p1 - q) < objsize-checklen; p1++) |
|
2924 |
+ if (!strncmp(p1, str, checklen)) |
|
2925 |
+ break; |
|
2926 |
+ |
|
2927 |
+ if (p1 - q == objsize - checklen || strncmp(p1, str, checklen)) |
|
2928 |
+ return NULL; |
|
2929 |
+ |
|
2930 |
+ p1 += checklen; |
|
2931 |
+ } else { |
|
2932 |
+ p1 = q; |
|
2928 | 2933 |
} |
2929 | 2934 |
|
2935 |
+ while ((p1 - q) < objsize && isspace(p1[0])) |
|
2936 |
+ p1++; |
|
2937 |
+ |
|
2930 | 2938 |
if ((p1 - q) == objsize) |
2931 | 2939 |
return NULL; |
2932 | 2940 |
|
2933 | 2941 |
/* We should be at the start of the string, minus 1 */ |
2934 | 2942 |
|
2935 | 2943 |
if (isdigit(p1[0])) { |
2936 |
- unsigned long objnum; |
|
2944 |
+ unsigned long objnum, revnum; |
|
2937 | 2945 |
struct pdf_obj *newobj; |
2938 | 2946 |
char *end, *begin; |
2939 | 2947 |
STATBUF sb; |
2948 |
+ uint32_t objflags; |
|
2940 | 2949 |
int fd; |
2941 | 2950 |
|
2942 | 2951 |
/* |
... | ... |
@@ -2949,7 +2951,17 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
2949 | 2949 |
if ((end - p1) == 0) |
2950 | 2950 |
return NULL; |
2951 | 2951 |
|
2952 |
- newobj = find_obj(pdf, obj, objnum); |
|
2952 |
+ /* Skip whitespace and get the revision number */ |
|
2953 |
+ p1 = end+1; |
|
2954 |
+ while (p1 - q < objsize && isspace(p1[0])) |
|
2955 |
+ p1++; |
|
2956 |
+ |
|
2957 |
+ if (p1 - q == objsize) |
|
2958 |
+ return NULL; |
|
2959 |
+ |
|
2960 |
+ revnum = strtoul(p1, &end, 10); |
|
2961 |
+ |
|
2962 |
+ newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff)); |
|
2953 | 2963 |
if (!(newobj)) |
2954 | 2964 |
return NULL; |
2955 | 2965 |
|
... | ... |
@@ -2963,9 +2975,15 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
2963 | 2963 |
if (!(newobj->statsflags & OBJ_FLAG_PDFNAME_DONE)) |
2964 | 2964 |
pdf_parseobj(pdf, newobj); |
2965 | 2965 |
|
2966 |
+ /* Extract the object. Force pdf_extract_obj() to dump this object. */ |
|
2967 |
+ objflags = newobj->flags; |
|
2968 |
+ newobj->flags |= (1 << OBJ_FORCEDUMP); |
|
2969 |
+ |
|
2966 | 2970 |
if (pdf_extract_obj(pdf, newobj, PDF_EXTRACT_OBJ_NONE) != CL_SUCCESS) |
2967 | 2971 |
return NULL; |
2968 | 2972 |
|
2973 |
+ newobj->flags = objflags; |
|
2974 |
+ |
|
2969 | 2975 |
if (!(newobj->path)) |
2970 | 2976 |
return NULL; |
2971 | 2977 |
|
... | ... |
@@ -2982,23 +3000,35 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
2982 | 2982 |
free(newobj->path); |
2983 | 2983 |
} |
2984 | 2984 |
|
2985 |
- begin = calloc(1, sb.st_size); |
|
2986 |
- if (!(begin)) { |
|
2987 |
- close(fd); |
|
2988 |
- cli_unlink(newobj->path); |
|
2989 |
- free(newobj->path); |
|
2990 |
- } |
|
2985 |
+ if (sb.st_size) { |
|
2986 |
+ begin = calloc(1, sb.st_size); |
|
2987 |
+ if (!(begin)) { |
|
2988 |
+ close(fd); |
|
2989 |
+ cli_unlink(newobj->path); |
|
2990 |
+ free(newobj->path); |
|
2991 |
+ } |
|
2991 | 2992 |
|
2992 |
- read(fd, res, sb.st_size); |
|
2993 |
- res = pdf_convert_utf(buf, sb.st_size); |
|
2994 |
- if (!(res)) |
|
2995 |
- res = begin; |
|
2996 |
- else |
|
2997 |
- free(begin); |
|
2993 |
+ read(fd, begin, sb.st_size); |
|
2994 |
+ |
|
2995 |
+ switch (begin[0]) { |
|
2996 |
+ case '(': |
|
2997 |
+ case '<': |
|
2998 |
+ res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL); |
|
2999 |
+ free(begin); |
|
3000 |
+ break; |
|
3001 |
+ default: |
|
3002 |
+ res = pdf_convert_utf(begin, sb.st_size); |
|
3003 |
+ if (!(res)) |
|
3004 |
+ res = begin; |
|
3005 |
+ else |
|
3006 |
+ free(begin); |
|
3007 |
+ } |
|
3008 |
+ } |
|
2998 | 3009 |
|
2999 | 3010 |
close(fd); |
3000 | 3011 |
cli_unlink(newobj->path); |
3001 | 3012 |
free(newobj->path); |
3013 |
+ newobj->path = NULL; |
|
3002 | 3014 |
|
3003 | 3015 |
return res; |
3004 | 3016 |
} |
... | ... |
@@ -3064,7 +3094,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
3064 | 3064 |
|
3065 | 3065 |
len = inlen = outlen = (size_t)(p2 - p1) + 1; |
3066 | 3066 |
|
3067 |
- if (likelyutf == 0) { |
|
3067 |
+ if (likelyutf == 0 && len) { |
|
3068 | 3068 |
/* We're not UTF-*, so just make a copy of the string and return that */ |
3069 | 3069 |
res = cli_calloc(1, len); |
3070 | 3070 |
if (!(res)) |
... | ... |
@@ -3074,7 +3104,8 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const |
3074 | 3074 |
return res; |
3075 | 3075 |
} |
3076 | 3076 |
|
3077 |
- res = pdf_convert_utf(p1, len); |
|
3077 |
+ if (len) |
|
3078 |
+ res = pdf_convert_utf(p1, len); |
|
3078 | 3079 |
|
3079 | 3080 |
return res; |
3080 | 3081 |
} |
... | ... |
@@ -3146,10 +3177,27 @@ static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struc |
3146 | 3146 |
|
3147 | 3147 |
static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act) |
3148 | 3148 |
{ |
3149 |
- if (!(pdf)) |
|
3150 |
- return; |
|
3149 |
+#if HAVE_JSON |
|
3150 |
+ struct json_object *pdfobj, *jbig2arr, *jbig2obj; |
|
3151 | 3151 |
|
3152 |
- pdf->stats.njbig2decode++; |
|
3152 |
+ if (!(pdf)) |
|
3153 |
+ return; |
|
3154 |
+ |
|
3155 |
+ if (!(pdf->ctx->wrkproperty)) |
|
3156 |
+ return; |
|
3157 |
+ |
|
3158 |
+ pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
|
3159 |
+ if (!(pdfobj)) |
|
3160 |
+ return; |
|
3161 |
+ |
|
3162 |
+ jbig2arr = cli_jsonarray(pdfobj, "JBIG2Objects"); |
|
3163 |
+ if (!(jbig2arr)) |
|
3164 |
+ return; |
|
3165 |
+ |
|
3166 |
+ cli_jsonint_array(jbig2arr, obj->id>>8); |
|
3167 |
+ |
|
3168 |
+ pdf->stats.njbig2decode++; |
|
3169 |
+#endif |
|
3153 | 3170 |
} |
3154 | 3171 |
|
3155 | 3172 |
static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act) |
... | ... |
@@ -3358,13 +3406,11 @@ static void pdf_export_json(struct pdf_struct *pdf) |
3358 | 3358 |
goto cleanup; |
3359 | 3359 |
} |
3360 | 3360 |
|
3361 |
- pdfobj = json_object_new_object(); |
|
3361 |
+ pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
|
3362 | 3362 |
if (!(pdfobj)) { |
3363 | 3363 |
goto cleanup; |
3364 | 3364 |
} |
3365 | 3365 |
|
3366 |
- json_object_object_add(pdf->ctx->wrkproperty, "PDFStats", pdfobj); |
|
3367 |
- |
|
3368 | 3366 |
if (pdf->stats.author) |
3369 | 3367 |
cli_jsonstr(pdfobj, "Author", pdf->stats.author); |
3370 | 3368 |
if (pdf->stats.creator) |