Browse code

bb#11238 - added missing PDF preclass operations > added whitespace fix for indirect reference strings > added PDF escape sequence handling (including octal)

Kevin Lin authored on 2015/01/13 06:45:36
Showing 1 changed file
... ...
@@ -76,7 +76,7 @@ char *pdf_convert_utf(char *begin, size_t sz)
76 76
     char *res=NULL;
77 77
 #if HAVE_ICONV
78 78
     char *buf, *outbuf, *p1, *p2;
79
-    size_t inlen, outlen, i;
79
+    size_t sz2, inlen, outlen, i;
80 80
     char *encodings[] = {
81 81
         "UTF-16",
82 82
         NULL
... ...
@@ -87,10 +87,63 @@ char *pdf_convert_utf(char *begin, size_t sz)
87 87
     if (!(buf))
88 88
         return NULL;
89 89
 
90
-    memcpy(buf, begin, sz);
90
+    /* convert PDF specific escape sequences, like octal sequences */
91
+    sz2 = 0;
92
+    for (i = 0; i < sz; ++i) {
93
+        if ((i+1 < sz) && begin[i] == '\\') {
94
+            if ((i+3 < sz) &&
95
+                (isdigit(begin[i+1]) && isdigit(begin[i+2]) && isdigit(begin[i+3]))) {
96
+                /* octal sequence */
97
+                char octal[4], *check;
98
+                unsigned long value;
99
+
100
+                memcpy(octal, &begin[i+1], 3);
101
+                octal[3] = '\0';
102
+
103
+                value = (char)strtoul(octal, &check, 8);
104
+                /* check if all characters were converted */
105
+                if (check == &octal[3])
106
+                    buf[sz2++] = value;
107
+                i += 3;
108
+            } else {
109
+                /* other sequences */
110
+                switch(begin[i+1]) {
111
+                case 'n':
112
+                    buf[sz2++] = 0x0a;
113
+                    break;
114
+                case 'r':
115
+                    buf[sz2++] = 0x0d;
116
+                    break;
117
+                case 't':
118
+                    buf[sz2++] = 0x09;
119
+                    break;
120
+                case 'b':
121
+                    buf[sz2++] = 0x08;
122
+                    break;
123
+                case 'f':
124
+                    buf[sz2++] = 0x0c;
125
+                    break;
126
+                case '(':
127
+                    buf[sz2++] = 0x28;
128
+                    break;
129
+                case ')':
130
+                    buf[sz2++] = 0x29;
131
+                    break;
132
+                case '\\':
133
+                    buf[sz2++] = 0x5c;
134
+                    break;
135
+                default:
136
+                    /* Ignore the REVERSE SOLIDUS when the following character is not a recognised escape - ISO 32000-1:2008 (PDF 1.7), section 7.3.4.2 */
137
+                    break;
138
+                }
139
+            }
140
+        } else
141
+            buf[sz2++] = begin[i]; 
142
+    }
143
+    //memcpy(buf, begin, sz);
91 144
     p1 = buf;
92 145
 
93
-    p2 = outbuf = cli_calloc(1, sz+1);
146
+    p2 = outbuf = cli_calloc(1, sz2+1);
94 147
     if (!(outbuf)) {
95 148
         free(buf);
96 149
         return NULL;
... ...
@@ -99,7 +152,7 @@ char *pdf_convert_utf(char *begin, size_t sz)
99 99
     for (i=0; encodings[i] != NULL; i++) {
100 100
         p1 = buf;
101 101
         p2 = outbuf;
102
-        inlen = outlen = sz;
102
+        inlen = outlen = sz2;
103 103
 
104 104
         cd = iconv_open("UTF-8", encodings[i]);
105 105
         if (cd == (iconv_t)(-1)) {
... ...
@@ -109,13 +162,13 @@ char *pdf_convert_utf(char *begin, size_t sz)
109 109
 
110 110
         iconv(cd, (char **)(&p1), &inlen, &p2, &outlen);
111 111
 
112
-        if (outlen == sz) {
112
+        if (outlen == sz2) {
113 113
             /* Decoding unsuccessful right from the start */
114 114
             iconv_close(cd);
115 115
             continue;
116 116
         }
117 117
 
118
-        outbuf[sz - outlen] = '\0';
118
+        outbuf[sz2 - outlen] = '\0';
119 119
 
120 120
         res = strdup(outbuf);
121 121
         iconv_close(cd);
... ...
@@ -277,10 +330,11 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
277 277
     p2 = (char *)(q + objsize);
278 278
     if (is_object_reference(p1, &p2, &objid)) {
279 279
         struct pdf_obj *newobj;
280
-        char *begin;
280
+        char *begin, *p3;
281 281
         STATBUF sb;
282 282
         uint32_t objflags;
283 283
         int fd;
284
+        size_t objsize2;
284 285
 
285 286
         newobj = find_obj(pdf, obj, objid);
286 287
         if (!(newobj))
... ...
@@ -343,25 +397,32 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char *
343 343
                 return NULL;
344 344
             }
345 345
 
346
-            switch (begin[0]) {
346
+            p3 = begin;
347
+            objsize2 = sb.st_size;
348
+            while ((size_t)(p3 - begin) < objsize2 && isspace(p3[0])) {
349
+                p3++;
350
+                objsize2--;
351
+            }
352
+
353
+            switch (*p3) {
347 354
                 case '(':
348 355
                 case '<':
349
-                    res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL, NULL);
356
+                    res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL);
350 357
                     free(begin);
351 358
                     break;
352 359
                 default:
353
-                    for (i=0; i < sb.st_size; i++) {
354
-                        if (begin[i] >= 0x7f) {
360
+                    for (i=0; i < objsize2; i++) {
361
+                        if (p3[i] >= 0x7f) {
355 362
                             likelyutf=1;
356 363
                             break;
357 364
                         }
358 365
                     }
359 366
 
360
-                    res = likelyutf ? pdf_convert_utf(begin, sb.st_size) : NULL;
367
+                    res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL;
361 368
 
362 369
                     if (!(res)) {
363 370
                         res = begin;
364
-                        res[sb.st_size] = '\0';
371
+                        res[objsize2] = '\0';
365 372
                     } else {
366 373
                         free(begin);
367 374
                     }