Browse code

Add dictionary parsing support.

Shawn Webb authored on 2014/06/25 07:43:51
Showing 2 changed files
... ...
@@ -80,8 +80,10 @@ static	int	asciihexdecode(const char *buf, off_t len, char *output);
80 80
 static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
81 81
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
82 82
 static	const	char	*pdf_nextobject(const char *ptr, size_t len);
83
-static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str);
84
-static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin);
83
+static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
84
+static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
85
+static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
86
+static void pdf_free_dict(struct pdf_dict *dict);
85 87
 static void pdf_free_array(struct pdf_array *array);
86 88
 static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags);
87 89
 static char *pdf_convert_utf(char *begin, size_t sz);
... ...
@@ -2899,7 +2901,69 @@ static char *pdf_convert_utf(char *begin, size_t sz)
2899 2899
 #endif
2900 2900
 }
2901 2901
 
2902
-static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str)
2902
/*
 * Consume one unsigned decimal token of an indirect reference.
 *
 * Skips leading whitespace at *cursor, then requires a token that starts
 * with a digit and is terminated by whitespace strictly before `end`.
 * On success *cursor is advanced to the first character strtoul() did not
 * consume (so "12abc" leaves the cursor on 'a') and 1 is returned.
 * Returns 0, leaving *cursor untouched, if the buffer runs out, the token
 * does not start with a digit, or the value overflows an unsigned long.
 */
static int objref_take_number(char **cursor, char *end)
{
    char *tok = *cursor;
    char *stop;

    /* <ctype.h> functions require an unsigned char value (or EOF). */
    while (tok < end && isspace((unsigned char)tok[0]))
        tok++;

    if (tok >= end)
        return 0;

    /* isdigit() is the portable spelling; isnumber() only exists on BSD libcs. */
    if (!isdigit((unsigned char)tok[0]))
        return 0;

    /*
     * The buffer is not necessarily NUL-terminated, so insist on a
     * whitespace terminator before `end`; that guarantees strtoul() stops
     * inside the buffer.
     */
    stop = tok + 1;
    while (stop < end && !isspace((unsigned char)stop[0]))
        stop++;

    if (stop >= end)
        return 0;

    /* Reset errno so a stale value cannot be mistaken for an overflow. */
    errno = 0;
    (void)strtoul(tok, &stop, 10);
    if (errno == ERANGE)
        return 0;

    *cursor = stop;
    return 1;
}

/*
 * Test whether the text at `begin` is a PDF indirect reference of the form
 * "<number> <number> R" (leading whitespace allowed).
 *
 * begin   - first character to examine.
 * endchar - in: one past the last readable character;
 *           out (only when 1 is returned): one past the 'R'.
 *
 * Returns 1 when a reference was recognised, 0 otherwise (including when
 * `begin`, `endchar` or `*endchar` is NULL).
 */
static int is_object_reference(char *begin, char **endchar)
{
    char *end;
    char *cur = begin;

    if (!begin || !endchar || !*endchar)
        return 0;

    end = *endchar;

    /* Object number, then generation number. */
    if (!objref_take_number(&cur, end))
        return 0;

    if (!objref_take_number(&cur, end))
        return 0;

    while (cur < end && isspace((unsigned char)cur[0]))
        cur++;

    if (cur >= end)
        return 0;

    if (cur[0] != 'R')
        return 0;

    *endchar = cur + 1;
    return 1;
}
2963
+
2964
+static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar)
2903 2965
 {
2904 2966
     const char *q = objstart;
2905 2967
     char *p1, *p2;
... ...
@@ -2946,7 +3010,8 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2946 2946
 
2947 2947
     /* We should be at the start of the string, minus 1 */
2948 2948
 
2949
-    if (isdigit(p1[0])) {
2949
+    p2 = q + objsize;
2950
+    if (is_object_reference(p1, &p2)) {
2950 2951
         unsigned long objnum, revnum;
2951 2952
         struct pdf_obj *newobj;
2952 2953
         char *end, *begin;
... ...
@@ -2974,6 +3039,16 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2974 2974
 
2975 2975
         revnum = strtoul(p1, &end, 10);
2976 2976
 
2977
+        p1 = end+1;
2978
+        while (p1 - q < objsize && isspace(p1[0]))
2979
+            p1++;
2980
+
2981
+        if (p1 - q == objsize)
2982
+            return NULL;
2983
+
2984
+        if (p1[0] != 'R')
2985
+            return NULL;
2986
+
2977 2987
         newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff));
2978 2988
         if (!(newobj))
2979 2989
             return NULL;
... ...
@@ -3038,7 +3113,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3038 3038
             switch (begin[0]) {
3039 3039
                 case '(':
3040 3040
                 case '<':
3041
-                    res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL);
3041
+                    res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL, NULL);
3042 3042
                     free(begin);
3043 3043
                     break;
3044 3044
                 default:
... ...
@@ -3055,6 +3130,9 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3055 3055
         free(newobj->path);
3056 3056
         newobj->path = NULL;
3057 3057
 
3058
+        if (endchar)
3059
+            *endchar = p1;
3060
+
3058 3061
         return res;
3059 3062
     }
3060 3063
 
... ...
@@ -3076,6 +3154,8 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3076 3076
             return NULL;
3077 3077
 
3078 3078
         strncpy(res, p1, (p2 - p1) + 1);
3079
+        if (endchar)
3080
+            *endchar = p2;
3079 3081
 
3080 3082
         return res;
3081 3083
     }
... ...
@@ -3130,20 +3210,227 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3130 3130
 
3131 3131
         memcpy(res, p1, len);
3132 3132
         res[len] = '\0';
3133
+        if (endchar)
3134
+            *endchar = p2;
3135
+
3133 3136
         return res;
3134 3137
     }
3135 3138
 
3136
-        res = pdf_convert_utf(p1, len);
3139
+    res = pdf_convert_utf(p1, len);
3140
+
3141
+    if (res && endchar)
3142
+        *endchar = p2;
3143
+
3144
+    return res;
3145
+}
3146
+
3147
+static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
3148
+{
3149
+    struct pdf_dict *res=NULL;
3150
+    struct pdf_dict_node *node=NULL;
3151
+    const char *objstart = obj->start + pdf->map;
3152
+    char *end;
3153
+    unsigned int in_string=0, ninner=0;
3154
+
3155
+    /* Sanity checking */
3156
+    if (!(pdf) || !(obj) || !(begin))
3157
+        return NULL;
3158
+
3159
+    if (begin < objstart || begin - objstart >= objsz - 2)
3160
+        return NULL;
3161
+
3162
+    if (begin[0] != '<' || begin[1] != '<')
3163
+        return NULL;
3164
+
3165
+    /* Find the end of the dictionary */
3166
+    end = begin;
3167
+    while (end - objstart < objsz) {
3168
+        if (in_string) {
3169
+            if (*end == ')')
3170
+                in_string = 0;
3171
+
3172
+            end++;
3173
+            continue;
3174
+        }
3175
+
3176
+        switch (*end) {
3177
+            case '(':
3178
+                in_string=1;
3179
+                break;
3180
+            case '<':
3181
+                if (end - objstart <= objsz - 2 && end[1] == '<')
3182
+                    ninner++;
3183
+                break;
3184
+            case '>':
3185
+                if (end - objstart <= objsz - 2 && end[1] == '>')
3186
+                    ninner--;
3187
+                break;
3188
+            case '\\':
3189
+                end += 2;
3190
+                if (end - objstart >= objsz)
3191
+                    return NULL;
3192
+        }
3193
+
3194
+        if (end - objstart <= objsz - 2)
3195
+            if (end[0] == '>' && end[1] == '>' && ninner == 0)
3196
+                break;
3197
+
3198
+        end++;
3199
+    }
3200
+
3201
+    /* More sanity checking */
3202
+    if (end - objstart >= objsz - 1)
3203
+        return NULL;
3204
+
3205
+    if (end[0] != '>' || end[1] != '>')
3206
+        return NULL;
3207
+
3208
+    res = cli_calloc(1, sizeof(struct pdf_dict));
3209
+    if (!(res))
3210
+        return NULL;
3211
+
3212
+    /* Loop through each element of the dictionary */
3213
+    begin += 2;
3214
+    while (begin < end) {
3215
+        char *val=NULL, *key=NULL, *p1;
3216
+        struct pdf_dict *dict=NULL;
3217
+        struct pdf_array *arr=NULL;
3218
+
3219
+        /* Skip any whitespaces */
3220
+        while (begin < end && isspace(begin[0]))
3221
+            begin++;
3222
+
3223
+        if (begin == end)
3224
+            break;
3225
+
3226
+        /* Get the key */
3227
+        p1 = begin+1;
3228
+        while (p1 < end && isalpha(p1[0]))
3229
+            p1++;
3230
+
3231
+        if (p1 == end)
3232
+            break;
3233
+
3234
+        key = cli_calloc((p1 - begin) + 2, 1);
3235
+        if (!(key))
3236
+            break;
3237
+
3238
+        strncpy(key, begin, p1 - begin);
3239
+        key[p1 - begin] = '\0';
3240
+
3241
+        /* Now for the value */
3242
+        begin = p1;
3243
+
3244
+        /* Skip any whitespaces */
3245
+        while (begin < end && isspace(begin[0]))
3246
+            begin++;
3247
+
3248
+        if (begin == end) {
3249
+            free(key);
3250
+            break;
3251
+        }
3252
+
3253
+        switch (begin[0]) {
3254
+            case '(':
3255
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
3256
+                begin = p1+2;
3257
+                break;
3258
+            case '[':
3259
+                arr = pdf_parse_array(pdf, obj, objsz, begin, &p1);
3260
+                begin = p1+1;
3261
+                break;
3262
+            case '<':
3263
+                if (begin - objstart < objsz - 2) {
3264
+                    if (begin[1] == '<') {
3265
+                        dict = pdf_parse_dict(pdf, obj, objsz, begin, &p1);
3266
+                        begin = p1+2;
3267
+                        break;
3268
+                    }
3269
+                }
3270
+
3271
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &p1);
3272
+                begin = p1+2;
3273
+                break;
3274
+            default:
3275
+                p1 = (begin[0] == '/') ? begin+1 : begin;
3276
+                while (p1 < end) {
3277
+                    int shouldbreak = 0;
3278
+                    switch (p1[0]) {
3279
+                        case '>':
3280
+                        case '/':
3281
+                            shouldbreak=1;
3282
+                            break;
3283
+                    }
3284
+
3285
+                    if (shouldbreak)
3286
+                        break;
3287
+
3288
+                    p1++;
3289
+                }
3290
+
3291
+                is_object_reference(begin, &p1);
3292
+
3293
+                val = cli_calloc((p1 - begin) + 2, 1);
3294
+                if (!(val))
3295
+                    break;
3296
+
3297
+                strncpy(val, begin, p1 - begin);
3298
+                val[p1 - begin] = '\0';
3299
+
3300
+                if (p1[0] != '/')
3301
+                    begin = p1+1;
3302
+                else
3303
+                    begin = p1;
3304
+
3305
+                break;
3306
+        }
3307
+
3308
+        if (!(val) && !(dict) && !(arr))
3309
+            break;
3310
+
3311
+        if (!(res->nodes)) {
3312
+            res->nodes = res->tail = node = cli_calloc(1, sizeof(struct pdf_dict_node));
3313
+            if (!(node))
3314
+                break;
3315
+        } else {
3316
+            node = calloc(1, sizeof(struct pdf_dict_node));
3317
+            if (!(node))
3318
+                break;
3319
+
3320
+            node->prev = res->tail;
3321
+            if (res->tail)
3322
+                res->tail->next = node;
3323
+            res->tail = node;
3324
+        }
3325
+
3326
+        node->key = key;
3327
+        if ((val)) {
3328
+            node->value = val;
3329
+            node->valuesz = strlen(val);
3330
+            node->type = PDF_DICT_STRING;
3331
+        } else if ((arr)) {
3332
+            node->value = arr;
3333
+            node->valuesz = sizeof(struct pdf_array);
3334
+            node->type = PDF_DICT_ARRAY;
3335
+        } else if ((dict)) {
3336
+            node->value = dict;
3337
+            node->valuesz = sizeof(struct pdf_dict);
3338
+            node->type = PDF_DICT_DICT;
3339
+        }
3340
+    }
3341
+
3342
+    if (endchar)
3343
+        *endchar = end;
3137 3344
 
3138 3345
     return res;
3139 3346
 }
3140 3347
 
3141
-static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin)
3348
+static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar)
3142 3349
 {
3143 3350
     struct pdf_array *res=NULL;
3144 3351
     struct pdf_array_node *node=NULL;
3145 3352
     const char *objstart = obj->start + pdf->map;
3146
-    char *end;
3353
+    char *end, *tempend;
3147 3354
     int in_string=0, ninner=0;
3148 3355
 
3149 3356
     /* Sanity checking */
... ...
@@ -3204,6 +3491,7 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
3204 3204
     while (begin < end) {
3205 3205
         char *val=NULL, *p1;
3206 3206
         struct pdf_array *arr=NULL;
3207
+        struct pdf_dict *dict=NULL;
3207 3208
 
3208 3209
         while (begin < end && isspace(begin[0]))
3209 3210
             begin++;
... ...
@@ -3214,23 +3502,26 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
3214 3214
         switch (begin[0]) {
3215 3215
             case '<':
3216 3216
                 if (begin - objstart < objsz - 2 && begin[1] == '<') {
3217
-                    /* Handle dictionaries later */
3217
+                    dict = pdf_parse_dict(pdf, obj, objsz, begin, &begin);
3218 3218
                     break;
3219 3219
                 }
3220 3220
 
3221 3221
                 /* Not a dictionary. Intentionally fall through. */
3222 3222
             case '(':
3223
-                val = pdf_parse_string(pdf, obj, begin, objsz, NULL);
3223
+                val = pdf_parse_string(pdf, obj, begin, objsz, NULL, &begin);
3224 3224
                 break;
3225 3225
             case '[':
3226 3226
                 /* XXX We should have a recursion counter here */
3227
-                arr = pdf_parse_array(pdf, obj, objsz, begin);
3227
+                arr = pdf_parse_array(pdf, obj, objsz, begin, &begin);
3228 3228
                 break;
3229 3229
             default:
3230 3230
                 /* This should just be a number or the letter R */
3231
-                p1 = begin+1;
3232
-                while (p1 < end && !isspace(p1[0]))
3233
-                    p1++;
3231
+                p1 = end;
3232
+                if (!is_object_reference(begin, &p1)) {
3233
+                    p1 = begin+1;
3234
+                    while (p1 < end && !isspace(p1[0]))
3235
+                        p1++;
3236
+                }
3234 3237
 
3235 3238
                 val = cli_calloc((p1 - begin) + 2, 1);
3236 3239
                 if (!(val))
... ...
@@ -3238,12 +3529,13 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
3238 3238
 
3239 3239
                 strncpy(val, begin, p1 - begin);
3240 3240
                 val[p1 - begin] = '\0';
3241
-                
3241
+
3242
+                begin = p1;
3242 3243
                 break;
3243 3244
         }
3244 3245
 
3245 3246
         /* Parse error, just return what we could */
3246
-        if (!(val) && !(arr))
3247
+        if (!(val) && !(arr) && !(dict))
3247 3248
             break;
3248 3249
 
3249 3250
         if (!(node)) {
... ...
@@ -3265,21 +3557,48 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
3265 3265
             node->type = PDF_ARR_STRING;
3266 3266
             node->data = val;
3267 3267
             node->datasz = strlen(val);
3268
+        } else if (dict != NULL) {
3269
+            node->type = PDF_ARR_DICT;
3270
+            node->data = dict;
3271
+            node->datasz = sizeof(struct pdf_dict);
3268 3272
         } else {
3269 3273
             node->type = PDF_ARR_ARRAY;
3270 3274
             node->data = arr;
3271 3275
             node->datasz = sizeof(struct pdf_array);
3272 3276
         }
3273
-
3274
-        begin++;
3275 3277
     }
3276 3278
 
3279
+    if (endchar)
3280
+        *endchar = end;
3281
+
3277 3282
     return res;
3278 3283
 }
3279 3284
 
3285
+static void pdf_free_dict(struct pdf_dict *dict)
3286
+{
3287
+    struct pdf_dict_node *node, *next;
3288
+
3289
+    node = dict->nodes;
3290
+    while (node != NULL) {
3291
+        free(node->key);
3292
+
3293
+        if (node->type == PDF_DICT_STRING)
3294
+            free(node->value);
3295
+        else if (node->type == PDF_DICT_ARRAY)
3296
+            pdf_free_array((struct pdf_array *)(node->value));
3297
+        else if (node->type == PDF_DICT_DICT)
3298
+            pdf_free_dict((struct pdf_dict *)(node->value));
3299
+
3300
+        next = node->next;
3301
+        free(node);
3302
+        node = next;
3303
+    }
3304
+
3305
+    free(dict);
3306
+}
3307
+
3280 3308
 static void pdf_free_array(struct pdf_array *array)
3281 3309
 {
3282
-    struct pdf_array *arr;
3283 3310
     struct pdf_array_node *node, *next;
3284 3311
 
3285 3312
     if (!(array))
... ...
@@ -3289,6 +3608,8 @@ static void pdf_free_array(struct pdf_array *array)
3289 3289
     while (node != NULL) {
3290 3290
         if (node->type == PDF_ARR_ARRAY)
3291 3291
             pdf_free_array((struct pdf_array *)(node->data));
3292
+        else if (node->type == PDF_ARR_DICT)
3293
+            pdf_free_dict((struct pdf_dict *)(node->data));
3292 3294
         else
3293 3295
             free(node->data);
3294 3296
 
... ...
@@ -3313,6 +3634,20 @@ static void pdf_print_array(struct pdf_array *array, unsigned long depth)
3313 3313
     }
3314 3314
 }
3315 3315
 
3316
+static void pdf_print_dict(struct pdf_dict *dict, unsigned long depth)
3317
+{
3318
+    struct pdf_dict_node *node;
3319
+
3320
+    for (node = dict->nodes; node != NULL; node = node->next) {
3321
+        if (node->type == PDF_DICT_STRING)
3322
+            cli_errmsg("dict[%lu][%s]: %s\n", depth, node->key, (char *)(node->value));
3323
+        else if (node->type == PDF_DICT_ARRAY)
3324
+            pdf_print_array((struct pdf_array *)(node->value), depth);
3325
+        else if (node->type == PDF_DICT_DICT)
3326
+            pdf_print_dict((struct pdf_dict *)(node->value), depth+1);
3327
+    }
3328
+}
3329
+
3316 3330
 /* PDF statistics */
3317 3331
 static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_action *act)
3318 3332
 {
... ...
@@ -3499,7 +3834,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_ac
3499 3499
         return;
3500 3500
 
3501 3501
     if (!(pdf->stats.author))
3502
-        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author");
3502
+        pdf->stats.author = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL);
3503 3503
 #endif
3504 3504
 }
3505 3505
 
... ...
@@ -3510,7 +3845,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_a
3510 3510
         return;
3511 3511
 
3512 3512
     if (!(pdf->stats.creator))
3513
-        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator");
3513
+        pdf->stats.creator = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL);
3514 3514
 #endif
3515 3515
 }
3516 3516
 
... ...
@@ -3521,7 +3856,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str
3521 3521
         return;
3522 3522
 
3523 3523
     if (!(pdf->stats.modificationdate))
3524
-        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate");
3524
+        pdf->stats.modificationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL);
3525 3525
 #endif
3526 3526
 }
3527 3527
 
... ...
@@ -3532,7 +3867,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct
3532 3532
         return;
3533 3533
 
3534 3534
     if (!(pdf->stats.creationdate))
3535
-        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate");
3535
+        pdf->stats.creationdate = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL);
3536 3536
 #endif
3537 3537
 }
3538 3538
 
... ...
@@ -3543,7 +3878,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_
3543 3543
         return;
3544 3544
 
3545 3545
     if (!(pdf->stats.producer))
3546
-        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer");
3546
+        pdf->stats.producer = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL);
3547 3547
 #endif
3548 3548
 }
3549 3549
 
... ...
@@ -3554,7 +3889,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act
3554 3554
         return;
3555 3555
 
3556 3556
     if (!(pdf->stats.title))
3557
-        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title");
3557
+        pdf->stats.title = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL);
3558 3558
 #endif
3559 3559
 }
3560 3560
 
... ...
@@ -3565,7 +3900,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_
3565 3565
         return;
3566 3566
 
3567 3567
     if (!(pdf->stats.keywords))
3568
-        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords");
3568
+        pdf->stats.keywords = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL);
3569 3569
 #endif
3570 3570
 }
3571 3571
 
... ...
@@ -3576,7 +3911,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_a
3576 3576
         return;
3577 3577
 
3578 3578
     if (!(pdf->stats.subject))
3579
-        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject");
3579
+        pdf->stats.subject = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL);
3580 3580
 #endif
3581 3581
 }
3582 3582
 
... ...
@@ -3589,6 +3924,7 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act
3589 3589
     unsigned int objsz = obj_size(pdf, obj, 1);
3590 3590
     unsigned long npages=0, count;
3591 3591
     struct pdf_array_node *node;
3592
+    struct pdf_dict *dict;
3592 3593
     json_object *pdfobj;
3593 3594
 
3594 3595
     if (!(pdf) || !(pdf->ctx->wrkproperty))
... ...
@@ -3598,21 +3934,29 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act
3598 3598
     if (!(pdfobj))
3599 3599
         return;
3600 3600
 
3601
+    begin = cli_memstr(objstart, objsz, "<<", 2);
3602
+    if (!(begin))
3603
+        return;
3604
+
3605
+    dict = pdf_parse_dict(pdf, obj, objsz, begin, NULL);
3606
+    if (dict)
3607
+        pdf_free_dict(dict);
3608
+
3601 3609
     begin = cli_memstr(objstart, objsz, "/Kids", 5);
3602 3610
     if (!(begin))
3603 3611
         return;
3604 3612
 
3605 3613
     begin += 5;
3606 3614
 
3607
-    array = pdf_parse_array(pdf, obj, objsz, begin);
3608
-    if (!(array))
3615
+    array = pdf_parse_array(pdf, obj, objsz, begin, NULL);
3616
+    if (!(array)) {
3617
+        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
3609 3618
         return;
3610
-
3611
-    pdf_print_array(array, 0);
3619
+    }
3612 3620
 
3613 3621
     for (node = array->nodes; node != NULL; node = node->next)
3614 3622
         if (node->datasz)
3615
-            if (((char *)(node->data))[0] == 'R')
3623
+            if (strchr((char *)(node->data), 'R'))
3616 3624
                 npages++;
3617 3625
 
3618 3626
     begin = cli_memstr(obj->start + pdf->map, objsz, "/Count", 6);
... ...
@@ -30,7 +30,8 @@ struct pdf_obj {
30 30
     char *path;
31 31
 };
32 32
 
33
-enum pdf_array_type { PDF_ARR_UNKNOWN=0, PDF_ARR_STRING, PDF_ARR_ARRAY };
33
+enum pdf_array_type { PDF_ARR_UNKNOWN=0, PDF_ARR_STRING, PDF_ARR_ARRAY, PDF_ARR_DICT }; /* Kind of payload an array node carries: string, nested array or nested dictionary */
34
+enum pdf_dict_type { PDF_DICT_UNKNOWN=0, PDF_DICT_STRING, PDF_DICT_ARRAY, PDF_DICT_DICT }; /* Kind of payload a dictionary node's value carries: string, array or nested dictionary */
34 35
 
35 36
 struct pdf_array_node {
36 37
     void *data;
... ...
@@ -46,6 +47,21 @@ struct pdf_array {
46 46
     struct pdf_array_node *tail;
47 47
 };
48 48
 
49
+struct pdf_dict_node { /* One key/value entry of a parsed PDF dictionary; entries form a doubly linked list */
50
+    char *key; /* Heap-allocated, NUL-terminated copy of the key text; freed by pdf_free_dict() */
51
+    void *value; /* char *, struct pdf_array * or struct pdf_dict *, selected by `type` */
52
+    size_t valuesz; /* strlen() of a string value, otherwise sizeof the array/dict container struct */
53
+    enum pdf_dict_type type; /* Tells how `value` must be interpreted and freed */
54
+
55
+    struct pdf_dict_node *prev; /* Previous entry, NULL for the first one */
56
+    struct pdf_dict_node *next; /* Next entry, NULL for the last one */
57
+};
58
+
59
+struct pdf_dict { /* A parsed PDF dictionary: list of entries in the order they were parsed */
60
+    struct pdf_dict_node *nodes; /* First entry, NULL when the dictionary is empty */
61
+    struct pdf_dict_node *tail; /* Last entry, used to append new entries */
62
+};
63
+
49 64
 #define OBJ_FLAG_PDFNAME_NONE 0x0
50 65
 #define OBJ_FLAG_PDFNAME_DONE 0x1
51 66