Browse code

Make some of the new PDF code a little more efficient

Shawn Webb authored on 2014/06/25 23:58:55
Showing 1 changed file
... ...
@@ -83,6 +83,7 @@ static	const	char	*pdf_nextobject(const char *ptr, size_t len);
83 83
 static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *objstart, size_t objsize, const char *str, char **endchar);
84 84
 static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
85 85
 static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *obj, size_t objsz, char *begin, char **endchar);
86
+static int is_object_reference(char *begin, char **endchar, uint32_t *id);
86 87
 static void pdf_free_dict(struct pdf_dict *dict);
87 88
 static void pdf_free_array(struct pdf_array *array);
88 89
 static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags);
... ...
@@ -2901,12 +2902,23 @@ static char *pdf_convert_utf(char *begin, size_t sz)
2901 2901
 #endif
2902 2902
 }
2903 2903
 
2904
-static int is_object_reference(char *begin, char **endchar)
2904
+static int is_object_reference(char *begin, char **endchar, uint32_t *id)
2905 2905
 {
2906 2906
     char *end = *endchar;
2907 2907
     char *p1=begin, *p2;
2908 2908
     unsigned long n;
2909
+    uint32_t t=0;
2909 2910
 
2911
+    /*
2912
+     * Object references are always this format:
2913
+     * XXXX YYYY R
2914
+     * Where XXXX is the object ID and YYYY is the revision ID of the object.
2915
+     * The letter R signifies that this is a reference.
2916
+     *
2917
+     * In between each item can be an arbitrary amount of whitespace.
2918
+     */
2919
+
2920
+    /* Skip whitespace */
2910 2921
     while (p1 < end && isspace(p1[0]))
2911 2922
         p1++;
2912 2923
 
... ...
@@ -2916,6 +2928,7 @@ static int is_object_reference(char *begin, char **endchar)
2916 2916
     if (!isnumber(p1[0]))
2917 2917
         return 0;
2918 2918
 
2919
+    /* Ensure strtoul() isn't going to go past our buffer */
2919 2920
     p2 = p1+1;
2920 2921
     while (p2 < end && !isspace(p2[0]))
2921 2922
         p2++;
... ...
@@ -2927,6 +2940,9 @@ static int is_object_reference(char *begin, char **endchar)
2927 2927
     if (n == ULONG_MAX && errno)
2928 2928
         return 0;
2929 2929
 
2930
+    t = n<<8;
2931
+
2932
+    /* Skip more whitespace */
2930 2933
     p1 = p2;
2931 2934
     while (p1 < end && isspace(p1[0]))
2932 2935
         p1++;
... ...
@@ -2937,6 +2953,7 @@ static int is_object_reference(char *begin, char **endchar)
2937 2937
     if (!isnumber(p1[0]))
2938 2938
         return 0;
2939 2939
 
2940
+    /* Ensure strtoul() isn't going to go past our buffer */
2940 2941
     p2 = p1+1;
2941 2942
     while (p2 < end && !isspace(p2[0]))
2942 2943
         p2++;
... ...
@@ -2948,6 +2965,8 @@ static int is_object_reference(char *begin, char **endchar)
2948 2948
     if (n == ULONG_MAX && errno)
2949 2949
         return 0;
2950 2950
 
2951
+    t |= (n&0xff);
2952
+
2951 2953
     p1 = p2;
2952 2954
     while (p1 < end && isspace(p1[0]))
2953 2955
         p1++;
... ...
@@ -2957,6 +2976,9 @@ static int is_object_reference(char *begin, char **endchar)
2957 2957
 
2958 2958
     if (p1[0] == 'R') {
2959 2959
         *endchar = p1+1;
2960
+        if (id)
2961
+            *id = t;
2962
+
2960 2963
         return 1;
2961 2964
     }
2962 2965
 
... ...
@@ -2971,6 +2993,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
2971 2971
     char *buf, *outbuf, *res;
2972 2972
     int likelyutf = 0;
2973 2973
     unsigned int i;
2974
+    uint32_t objid;
2974 2975
 
2975 2976
     /*
2976 2977
      * Yes, all of this is required to find the start and end of a potentially UTF-* string
... ...
@@ -3011,45 +3034,14 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3011 3011
     /* We should be at the start of the string, minus 1 */
3012 3012
 
3013 3013
     p2 = q + objsize;
3014
-    if (is_object_reference(p1, &p2)) {
3015
-        unsigned long objnum, revnum;
3014
+    if (is_object_reference(p1, &p2, &objid)) {
3016 3015
         struct pdf_obj *newobj;
3017 3016
         char *end, *begin;
3018 3017
         STATBUF sb;
3019 3018
         uint32_t objflags;
3020 3019
         int fd;
3021 3020
 
3022
-        /*
3023
-         * This is kind of sketchy... This string says it points to another object.
3024
-         * Try to get/parse the object and return the decoded value as an ASCII/UTF-8 string.
3025
-         */
3026
-
3027
-        /* Get the object number */
3028
-        objnum = strtoul(p1, &end, 10);
3029
-        if ((end - p1) == 0)
3030
-            return NULL;
3031
-
3032
-        /* Skip whitespace and get the revision number */
3033
-        p1 = end+1;
3034
-        while (p1 - q < objsize && isspace(p1[0]))
3035
-            p1++;
3036
-
3037
-        if (p1 - q == objsize)
3038
-            return NULL;
3039
-
3040
-        revnum = strtoul(p1, &end, 10);
3041
-
3042
-        p1 = end+1;
3043
-        while (p1 - q < objsize && isspace(p1[0]))
3044
-            p1++;
3045
-
3046
-        if (p1 - q == objsize)
3047
-            return NULL;
3048
-
3049
-        if (p1[0] != 'R')
3050
-            return NULL;
3051
-
3052
-        newobj = find_obj(pdf, obj, (objnum<<8) | (revnum & 0xff));
3021
+        newobj = find_obj(pdf, obj, objid);
3053 3022
         if (!(newobj))
3054 3023
             return NULL;
3055 3024
 
... ...
@@ -3131,7 +3123,7 @@ static char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const
3131 3131
         newobj->path = NULL;
3132 3132
 
3133 3133
         if (endchar)
3134
-            *endchar = p1;
3134
+            *endchar = p2;
3135 3135
 
3136 3136
         return res;
3137 3137
     }
... ...
@@ -3368,7 +3360,7 @@ static struct pdf_dict *pdf_parse_dict(struct pdf_struct *pdf, struct pdf_obj *o
3368 3368
                     p1++;
3369 3369
                 }
3370 3370
 
3371
-                is_object_reference(begin, &p1);
3371
+                is_object_reference(begin, &p1, NULL);
3372 3372
 
3373 3373
                 val = cli_calloc((p1 - begin) + 2, 1);
3374 3374
                 if (!(val))
... ...
@@ -3517,7 +3509,7 @@ static struct pdf_array *pdf_parse_array(struct pdf_struct *pdf, struct pdf_obj
3517 3517
             default:
3518 3518
                 /* This should just be a number or the letter R */
3519 3519
                 p1 = end;
3520
-                if (!is_object_reference(begin, &p1)) {
3520
+                if (!is_object_reference(begin, &p1, NULL)) {
3521 3521
                     p1 = begin+1;
3522 3522
                     while (p1 < end && !isspace(p1[0]))
3523 3523
                         p1++;
... ...
@@ -3639,12 +3631,14 @@ static void pdf_print_dict(struct pdf_dict *dict, unsigned long depth)
3639 3639
     struct pdf_dict_node *node;
3640 3640
 
3641 3641
     for (node = dict->nodes; node != NULL; node = node->next) {
3642
-        if (node->type == PDF_DICT_STRING)
3642
+        if (node->type == PDF_DICT_STRING) {
3643 3643
             cli_errmsg("dict[%lu][%s]: %s\n", depth, node->key, (char *)(node->value));
3644
-        else if (node->type == PDF_DICT_ARRAY)
3644
+        } else if (node->type == PDF_DICT_ARRAY) {
3645
+            cli_errmsg("dict[%lu][%s]: Array =>\n", depth, node->key);
3645 3646
             pdf_print_array((struct pdf_array *)(node->value), depth);
3646
-        else if (node->type == PDF_DICT_DICT)
3647
+        } else if (node->type == PDF_DICT_DICT) {
3647 3648
             pdf_print_dict((struct pdf_dict *)(node->value), depth+1);
3649
+        }
3648 3650
     }
3649 3651
 }
3650 3652
 
... ...
@@ -3939,8 +3933,11 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdf_act
3939 3939
         return;
3940 3940
 
3941 3941
     dict = pdf_parse_dict(pdf, obj, objsz, begin, NULL);
3942
-    if (dict)
3942
+    if (dict) {
3943
+        cli_errmsg("==== ==== ==== ====\n");
3944
+        pdf_print_dict(dict, 0);
3943 3945
         pdf_free_dict(dict);
3946
+    }
3944 3947
 
3945 3948
     begin = cli_memstr(objstart, objsz, "/Kids", 5);
3946 3949
     if (!(begin))