... | ... |
@@ -76,7 +76,7 @@ char *pdf_convert_utf(char *begin, size_t sz) |
76 | 76 |
char *res=NULL; |
77 | 77 |
#if HAVE_ICONV |
78 | 78 |
char *buf, *outbuf, *p1, *p2; |
79 |
- size_t inlen, outlen, i; |
|
79 |
+ size_t sz2, inlen, outlen, i; |
|
80 | 80 |
char *encodings[] = { |
81 | 81 |
"UTF-16", |
82 | 82 |
NULL |
... | ... |
@@ -87,10 +87,63 @@ char *pdf_convert_utf(char *begin, size_t sz) |
87 | 87 |
if (!(buf)) |
88 | 88 |
return NULL; |
89 | 89 |
|
90 |
- memcpy(buf, begin, sz); |
|
90 |
+ /* convert PDF specific escape sequences, like octal sequences */ |
|
91 |
+ sz2 = 0; |
|
92 |
+ for (i = 0; i < sz; ++i) { |
|
93 |
+ if ((i+1 < sz) && begin[i] == '\\') { |
|
94 |
+ if ((i+3 < sz) && |
|
95 |
+ (isdigit(begin[i+1]) && isdigit(begin[i+2]) && isdigit(begin[i+3]))) { |
|
96 |
+ /* octal sequence */ |
|
97 |
+ char octal[4], *check; |
|
98 |
+ unsigned long value; |
|
99 |
+ |
|
100 |
+ memcpy(octal, &begin[i+1], 3); |
|
101 |
+ octal[3] = '\0'; |
|
102 |
+ |
|
103 |
+ value = (char)strtoul(octal, &check, 8); |
|
104 |
+ /* check if all characters were converted */ |
|
105 |
+ if (check == &octal[3]) |
|
106 |
+ buf[sz2++] = value; |
|
107 |
+ i += 3; |
|
108 |
+ } else { |
|
109 |
+ /* other sequences */ |
|
110 |
+ switch(begin[i+1]) { |
|
111 |
+ case 'n': |
|
112 |
+ buf[sz2++] = 0x0a; |
|
113 |
+ break; |
|
114 |
+ case 'r': |
|
115 |
+ buf[sz2++] = 0x0d; |
|
116 |
+ break; |
|
117 |
+ case 't': |
|
118 |
+ buf[sz2++] = 0x09; |
|
119 |
+ break; |
|
120 |
+ case 'b': |
|
121 |
+ buf[sz2++] = 0x08; |
|
122 |
+ break; |
|
123 |
+ case 'f': |
|
124 |
+ buf[sz2++] = 0x0c; |
|
125 |
+ break; |
|
126 |
+ case '(': |
|
127 |
+ buf[sz2++] = 0x28; |
|
128 |
+ break; |
|
129 |
+ case ')': |
|
130 |
+ buf[sz2++] = 0x29; |
|
131 |
+ break; |
|
132 |
+ case '\\': |
|
133 |
+ buf[sz2++] = 0x5c; |
|
134 |
+ break; |
|
135 |
+ default: |
|
136 |
+ /* IGNORE THE REVERSE SOLIDUS - PDF 32000-1:2008, 7.3.4.2 */ |
|
137 |
+ break; |
|
138 |
+ } |
|
139 |
+ } |
|
140 |
+ } else |
|
141 |
+ buf[sz2++] = begin[i]; |
|
142 |
+ } |
|
143 |
+ //memcpy(buf, begin, sz); |
|
91 | 144 |
p1 = buf; |
92 | 145 |
|
93 |
- p2 = outbuf = cli_calloc(1, sz+1); |
|
146 |
+ p2 = outbuf = cli_calloc(1, sz2+1); |
|
94 | 147 |
if (!(outbuf)) { |
95 | 148 |
free(buf); |
96 | 149 |
return NULL; |
... | ... |
@@ -99,7 +152,7 @@ char *pdf_convert_utf(char *begin, size_t sz) |
99 | 99 |
for (i=0; encodings[i] != NULL; i++) { |
100 | 100 |
p1 = buf; |
101 | 101 |
p2 = outbuf; |
102 |
- inlen = outlen = sz; |
|
102 |
+ inlen = outlen = sz2; |
|
103 | 103 |
|
104 | 104 |
cd = iconv_open("UTF-8", encodings[i]); |
105 | 105 |
if (cd == (iconv_t)(-1)) { |
... | ... |
@@ -109,13 +162,13 @@ char *pdf_convert_utf(char *begin, size_t sz) |
109 | 109 |
|
110 | 110 |
iconv(cd, (char **)(&p1), &inlen, &p2, &outlen); |
111 | 111 |
|
112 |
- if (outlen == sz) { |
|
112 |
+ if (outlen == sz2) { |
|
113 | 113 |
/* Decoding unsuccessful right from the start */ |
114 | 114 |
iconv_close(cd); |
115 | 115 |
continue; |
116 | 116 |
} |
117 | 117 |
|
118 |
- outbuf[sz - outlen] = '\0'; |
|
118 |
+ outbuf[sz2 - outlen] = '\0'; |
|
119 | 119 |
|
120 | 120 |
res = strdup(outbuf); |
121 | 121 |
iconv_close(cd); |
... | ... |
@@ -277,10 +330,11 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
277 | 277 |
p2 = (char *)(q + objsize); |
278 | 278 |
if (is_object_reference(p1, &p2, &objid)) { |
279 | 279 |
struct pdf_obj *newobj; |
280 |
- char *begin; |
|
280 |
+ char *begin, *p3; |
|
281 | 281 |
STATBUF sb; |
282 | 282 |
uint32_t objflags; |
283 | 283 |
int fd; |
284 |
+ size_t objsize2; |
|
284 | 285 |
|
285 | 286 |
newobj = find_obj(pdf, obj, objid); |
286 | 287 |
if (!(newobj)) |
... | ... |
@@ -343,25 +397,32 @@ char *pdf_parse_string(struct pdf_struct *pdf, struct pdf_obj *obj, const char * |
343 | 343 |
return NULL; |
344 | 344 |
} |
345 | 345 |
|
346 |
- switch (begin[0]) { |
|
346 |
+ p3 = begin; |
|
347 |
+ objsize2 = sb.st_size; |
|
348 |
+ while ((size_t)(p3 - begin) < objsize2 && isspace(p3[0])) { |
|
349 |
+ p3++; |
|
350 |
+ objsize2--; |
|
351 |
+ } |
|
352 |
+ |
|
353 |
+ switch (*p3) { |
|
347 | 354 |
case '(': |
348 | 355 |
case '<': |
349 |
- res = pdf_parse_string(pdf, obj, begin, sb.st_size, NULL, NULL); |
|
356 |
+ res = pdf_parse_string(pdf, obj, p3, objsize2, NULL, NULL); |
|
350 | 357 |
free(begin); |
351 | 358 |
break; |
352 | 359 |
default: |
353 |
- for (i=0; i < sb.st_size; i++) { |
|
354 |
- if (begin[i] >= 0x7f) { |
|
360 |
+ for (i=0; i < objsize2; i++) { |
|
361 |
+ if (p3[i] >= 0x7f) { |
|
355 | 362 |
likelyutf=1; |
356 | 363 |
break; |
357 | 364 |
} |
358 | 365 |
} |
359 | 366 |
|
360 |
- res = likelyutf ? pdf_convert_utf(begin, sb.st_size) : NULL; |
|
367 |
+ res = likelyutf ? pdf_convert_utf(p3, objsize2) : NULL; |
|
361 | 368 |
|
362 | 369 |
if (!(res)) { |
363 | 370 |
res = begin; |
364 |
- res[sb.st_size] = '\0'; |
|
371 |
+ res[objsize2] = '\0'; |
|
365 | 372 |
} else { |
366 | 373 |
free(begin); |
367 | 374 |
} |