... | ... |
@@ -68,7 +68,7 @@ |
68 | 68 |
#include "json_api.h" |
69 | 69 |
|
70 | 70 |
#ifdef CL_DEBUG |
71 |
-/*#define SAVE_TMP |
|
71 |
+/*#define SAVE_TMP |
|
72 | 72 |
*Save the file being worked on in tmp */ |
73 | 73 |
#endif |
74 | 74 |
|
... | ... |
@@ -134,14 +134,14 @@ static int xrefCheck(const char *xref, const char *eof) |
134 | 134 |
if (xref + 4 >= eof) |
135 | 135 |
return -1; |
136 | 136 |
|
137 |
- if (!memcmp(xref, "xref", 4)) { |
|
137 |
+ if (!memcmp(xref, "xref", strlen("xref"))) { |
|
138 | 138 |
cli_dbgmsg("cli_pdf: found xref\n"); |
139 | 139 |
return 0; |
140 | 140 |
} |
141 | 141 |
|
142 | 142 |
/* could be xref stream */ |
143 | 143 |
for (q = xref; q + 5 < eof; q++) { |
144 |
- if (!memcmp(q, "/XRef", 4)) { |
|
144 |
+ if (!memcmp(q, "/XRef", strlen("/XRef"))) { |
|
145 | 145 |
cli_dbgmsg("cli_pdf: found /XRef\n"); |
146 | 146 |
return 0; |
147 | 147 |
} |
... | ... |
@@ -163,10 +163,10 @@ static int xrefCheck(const char *xref, const char *eof) |
163 | 163 |
|
164 | 164 |
/** |
165 | 165 |
* @brief Searching BACKwards, find the next character that is not a whitespace. |
166 |
- * |
|
166 |
+ * |
|
167 | 167 |
* @param q Index to start from (at the end of the search space) |
168 |
- * @param start Beginning of the search space. |
|
169 |
- * |
|
168 |
+ * @param start Beginning of the search space. |
|
169 |
+ * |
|
170 | 170 |
* @return const char* Address of the final non-whitespace character OR the same address as the start. |
171 | 171 |
*/ |
172 | 172 |
static const char *findNextNonWSBack(const char *q, const char *start) |
... | ... |
@@ -179,10 +179,10 @@ static const char *findNextNonWSBack(const char *q, const char *start) |
179 | 179 |
|
180 | 180 |
/** |
181 | 181 |
* @brief Searching FORwards, find the next character that is not a whitespace. |
182 |
- * |
|
182 |
+ * |
|
183 | 183 |
* @param q Index to start from (at the beginning of the search space) |
184 |
- * @param start Beginning of the search space. |
|
185 |
- * |
|
184 |
+ * @param end End of the search space.
|
185 |
+ * |
|
186 | 186 |
* @return const char* Address of the final non-whitespace character OR the same address as the start. |
187 | 187 |
*/ |
188 | 188 |
static const char *findNextNonWS(const char *q, const char *end) |
... | ... |
@@ -195,100 +195,116 @@ static const char *findNextNonWS(const char *q, const char *end) |
195 | 195 |
|
196 | 196 |
/** |
197 | 197 |
* @brief Find bounds of stream. |
198 |
- * |
|
198 |
+ * |
|
199 | 199 |
* PDF streams are prefixed with "stream" and suffixed with "endstream". |
200 | 200 |
* Return value indicates success or failure. |
201 |
- * |
|
201 |
+ * |
|
202 | 202 |
* @param start start address of search space. |
203 |
- * @param bytesleft size of search space for "stream" |
|
204 |
- * @param bytesleft2 size of search space for "endstream" |
|
203 |
+ * @param size size of search space |
|
205 | 204 |
* @param[out] stream output param, address of start of stream data |
206 |
- * @param[out] endstream output param, address of end of stream data |
|
205 |
+ * @param[out] stream_size output param, size of stream data |
|
207 | 206 |
* @param newline_hack hack to support newlines that are \r\n, and not just \n or just \r. |
208 |
- * |
|
209 |
- * @return int 1 if stream bounds were found. |
|
210 |
- * @return int 0 if stream bounds could not be found. |
|
207 |
+ * |
|
208 |
+ * @return cl_error_t CL_SUCCESS if stream bounds were found. |
|
209 |
+ * @return cl_error_t CL_BREAK if stream bounds could not be found. |
|
210 |
+ * @return cl_error_t CL_EFORMAT if stream start was found, but not end. (truncated) |
|
211 |
+ * @return cl_error_t CL_EARG if invalid args were provided. |
|
211 | 212 |
*/ |
212 |
-static int find_stream_bounds( |
|
213 |
+static cl_error_t find_stream_bounds( |
|
213 | 214 |
const char *start, |
214 |
- off_t bytesleft, |
|
215 |
- off_t bytesleft2, |
|
216 |
- off_t *stream, |
|
217 |
- off_t *endstream, |
|
215 |
+ size_t size, |
|
216 |
+ const char **stream, |
|
217 |
+ size_t *stream_size, |
|
218 | 218 |
int newline_hack) |
219 | 219 |
{ |
220 |
- const char *q2, *q; |
|
220 |
+ cl_error_t status = CL_BREAK; |
|
221 |
+ |
|
222 |
+ const char *idx; |
|
223 |
+ const char *stream_begin; |
|
224 |
+ const char *endstream_begin; |
|
225 |
+ size_t bytesleft = size; |
|
226 |
+ |
|
227 |
+ if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) { |
|
228 |
+ status = CL_EARG; |
|
229 |
+ return status; |
|
230 |
+ } |
|
231 |
+ |
|
232 |
+ *stream = NULL; |
|
233 |
+ *stream_size = 0; |
|
221 | 234 |
|
222 | 235 |
/* Begin by finding the "stream" string that prefixes stream data. */ |
223 |
- if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) { |
|
224 |
- q2 += 6; |
|
225 |
- bytesleft -= q2 - start; |
|
236 |
+ if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) { |
|
237 |
+ idx = stream_begin + strlen("stream"); |
|
238 |
+ bytesleft -= idx - start; |
|
226 | 239 |
if (bytesleft < 0) |
227 |
- return 0; |
|
240 |
+ goto done; |
|
228 | 241 |
|
229 | 242 |
/* Skip any new line characters. */ |
230 |
- if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') { |
|
231 |
- q2 += 2; |
|
232 |
- if (newline_hack && (bytesleft > 2) && q2[0] == '\xa') |
|
233 |
- q2++; |
|
234 |
- } else if (bytesleft && q2[0] == '\xa') { |
|
235 |
- q2++; |
|
243 |
+ if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') { |
|
244 |
+ idx += 2; |
|
245 |
+ if (newline_hack && (bytesleft > 2) && idx[0] == '\xa') |
|
246 |
+ idx++; |
|
247 |
+ } else if (bytesleft && idx[0] == '\xa') { |
|
248 |
+ idx++; |
|
236 | 249 |
} |
237 | 250 |
|
238 |
- *stream = q2 - start; |
|
251 |
+ /* Pass back start of the stream data. */ |
|
252 |
+ *stream = idx; |
|
239 | 253 |
|
240 |
- bytesleft2 -= q2 - start; |
|
241 |
- if (bytesleft2 <= 0) |
|
242 |
- return 0; |
|
254 |
+ bytesleft = size - (idx - start); |
|
255 |
+ if (bytesleft <= 0) |
|
256 |
+ goto done; |
|
243 | 257 |
|
244 |
- /* Now find the "endstream" string that suffixes stream data */ |
|
245 |
- q = q2; |
|
246 |
- q2 = cli_memstr(q, bytesleft2, "endstream", 9); |
|
247 |
- if (!q2) { |
|
248 |
- /* Couldn't find "endstream" */ |
|
249 |
- return 0; |
|
258 |
+ /* Now find the "endstream" string that suffixes stream data. */ |
|
259 |
+ endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream")); |
|
260 |
+ if (!endstream_begin) { |
|
261 |
+ /* Couldn't find "endstream", but that's ok -- |
|
262 |
+ * -- we'll just count the rest of the provided buffer. */ |
|
263 |
+ cli_dbgmsg("find_stream_bounds: Truncated stream found!\n"); |
|
264 |
+ endstream_begin = start + size; |
|
265 |
+ status = CL_EFORMAT; |
|
250 | 266 |
} |
251 | 267 |
|
252 |
- *endstream = q2 - start; |
|
268 |
+ /* Pass back the size of the stream data: the distance from the start of the stream data to "endstream" (or to the end of the buffer if truncated). */
|
269 |
+ *stream_size = endstream_begin - *stream; |
|
253 | 270 |
|
254 |
- /* Double-check that endstream >= stream */ |
|
255 |
- if (*endstream < *stream) |
|
256 |
- *endstream = *stream; |
|
257 |
- |
|
258 |
- return 1; |
|
271 |
+ if (CL_EFORMAT != status) |
|
272 |
+ status = CL_SUCCESS; |
|
259 | 273 |
} |
260 | 274 |
|
261 |
- return 0; |
|
275 |
+done: |
|
276 |
+ |
|
277 |
+ return status; |
|
262 | 278 |
} |
263 | 279 |
|
264 | 280 |
/** |
265 |
- * @brief Find the next *indirect* object in an object stream, adds it to our list of |
|
281 |
+ * @brief Find the next *indirect* object in an object stream, adds it to our list of |
|
266 | 282 |
* objects, and increments nobj. |
267 |
- * |
|
283 |
+ * |
|
268 | 284 |
* Indirect objects in a stream DON'T begin with "obj" and end with "endobj". |
269 | 285 |
* Instead, they have an obj ID and an offset from the first object to point you |
270 | 286 |
* right at them. |
271 |
- * |
|
287 |
+ * |
|
272 | 288 |
* If found, objstm->current will be updated to the next obj id. |
273 |
- * |
|
274 |
- * All objects in an object stream are indirect and thus do not begin or start |
|
275 |
- * with "obj" or "endobj". Instead, the object stream takes the following |
|
289 |
+ * |
|
290 |
+ * All objects in an object stream are indirect and thus do not begin or end
|
291 |
+ * with "obj" or "endobj". Instead, the object stream takes the following |
|
276 | 292 |
* format. |
277 |
- * |
|
293 |
+ * |
|
278 | 294 |
* <dictionary describing stream> objstm content endobjstm |
279 |
- * |
|
295 |
+ * |
|
280 | 296 |
* where content looks something like the following: |
281 |
- * |
|
297 |
+ * |
|
282 | 298 |
* 15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>> |
283 |
- * |
|
284 |
- * In the above example, the literal string (ab) is indirect object # 15, and |
|
285 |
- * begins at offset 0 of the set of objects. The next object, # 16 begis at |
|
286 |
- * offset 3 is a dictionary. The final object is also a dictionary, beginning |
|
299 |
+ * |
|
300 |
+ * In the above example, the literal string (ab) is indirect object # 15, and |
|
301 |
+ * begins at offset 0 of the set of objects. The next object, # 16, which begins at
|
302 |
+ * offset 3 is a dictionary. The final object is also a dictionary, beginning |
|
287 | 303 |
* at offset 46. |
288 |
- * |
|
289 |
- * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
304 |
+ * |
|
305 |
+ * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
290 | 306 |
* @param objstm |
291 |
- * |
|
307 |
+ * |
|
292 | 308 |
* @return CL_SUCCESS if success |
293 | 309 |
* @return CL_EPARSE if parsing error |
294 | 310 |
* @return CL_EMEM if error allocating memory |
... | ... |
@@ -298,7 +314,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
298 | 298 |
{ |
299 | 299 |
cl_error_t status = CL_EPARSE; |
300 | 300 |
struct pdf_obj *obj = NULL; |
301 |
- unsigned long objid = 0, objsize = 0, objoff = 0; |
|
301 |
+ unsigned long objid = 0, objoff = 0; |
|
302 | 302 |
long temp_long = 0; |
303 | 303 |
const char *index = NULL; |
304 | 304 |
size_t bytes_remaining = 0; |
... | ... |
@@ -382,10 +398,10 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
382 | 382 |
(index < objstm->streambuf + objstm->streambuf_len)) { |
383 | 383 |
unsigned long next_objid = 0, next_objoff = 0; |
384 | 384 |
|
385 |
- /* |
|
386 |
- * While we're at it, |
|
385 |
+ /* |
|
386 |
+ * While we're at it, |
|
387 | 387 |
* let's record the size as running up to the next object offset. |
388 |
- * |
|
388 |
+ * |
|
389 | 389 |
* To do so, we will need to parse the next obj pair. |
390 | 390 |
*/ |
391 | 391 |
/* objstm->current_pair points directly to the obj id */ |
... | ... |
@@ -440,9 +456,9 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
440 | 440 |
} else { |
441 | 441 |
/* |
442 | 442 |
* Should be no more objects. We should verify. |
443 |
- * |
|
443 |
+ * |
|
444 | 444 |
* Either way... |
445 |
- * obj->size should be the rest of the buffer. |
|
445 |
+ * obj->size should be the rest of the buffer. |
|
446 | 446 |
*/ |
447 | 447 |
if (objstm->nobjs_found < objstm->n) { |
448 | 448 |
cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n"); |
... | ... |
@@ -476,17 +492,25 @@ done: |
476 | 476 |
|
477 | 477 |
/** |
478 | 478 |
* @brief Find the next *indirect* object. |
479 |
- * |
|
480 |
- * Indirect objects begin with "obj" and end with "endobj". |
|
481 |
- * Identify objects that contain streams. |
|
482 |
- * Identify truncated objects. |
|
483 |
- * |
|
479 |
+ * |
|
480 |
+ * Indirect objects located outside of an object stream are prefaced with: |
|
481 |
+ * <objid> <genid> obj |
|
482 |
+ * |
|
483 |
+ * Each of the above are separated by whitespace of some sort. |
|
484 |
+ * |
|
485 |
+ * Indirect objects are suffixed with:
|
486 |
+ * endobj |
|
487 |
+ * |
|
488 |
+ * The specification does not say if whitespace is required before or after "endobj". |
|
489 |
+ * |
|
490 |
+ * Identify truncated objects. |
|
491 |
+ * |
|
484 | 492 |
* If found, pdf->offset will be updated to just after the "endobj". |
485 | 493 |
* If truncated, pdf->offset will == pdf->size. |
486 | 494 |
* If not found, pdf->offset will not be updated. |
487 |
- * |
|
488 |
- * @param pdf Pdf context struct that keeps track of all information found in the PDF. |
|
489 |
- * |
|
495 |
+ * |
|
496 |
+ * @param pdf Pdf context struct that keeps track of all information found in the PDF. |
|
497 |
+ * |
|
490 | 498 |
* @return CL_SUCCESS if success |
491 | 499 |
* @return CL_BREAK if no more objects |
492 | 500 |
* @return CL_EPARSE if parsing error |
... | ... |
@@ -495,9 +519,13 @@ done: |
495 | 495 |
cl_error_t pdf_findobj(struct pdf_struct *pdf) |
496 | 496 |
{ |
497 | 497 |
cl_error_t status = CL_EPARSE; |
498 |
- const char *start, *q, *q2, *q3, *eof; |
|
498 |
+ const char *start, *idx, *genid_search_index, *objid_search_index; |
|
499 |
+ |
|
500 |
+ const char *obj_begin = NULL, *obj_end = NULL; |
|
501 |
+ const char *endobj_begin = NULL, *endobj_end = NULL; |
|
502 |
+ |
|
499 | 503 |
struct pdf_obj *obj = NULL; |
500 |
- off_t bytesleft; |
|
504 |
+ size_t bytesleft; |
|
501 | 505 |
unsigned long genid, objid; |
502 | 506 |
long temp_long; |
503 | 507 |
|
... | ... |
@@ -520,100 +548,111 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf) |
520 | 520 |
start = pdf->map + pdf->offset; |
521 | 521 |
bytesleft = pdf->size - pdf->offset; |
522 | 522 |
|
523 |
- /* Indirect objects located outside of an object stream are prefaced with "obj" |
|
524 |
- * and suffixed with "endobj". Find the "obj" preface. */ |
|
525 |
- while (bytesleft > 0) { |
|
526 |
- q2 = cli_memstr(start, bytesleft, "obj", 3); |
|
527 |
- if (!q2) { |
|
528 |
- status = CL_BREAK; /* no more objs */ |
|
529 |
- goto done; |
|
523 |
+ /* |
|
524 |
+ * Start by searching for "obj" |
|
525 |
+ */ |
|
526 |
+ idx = start + 1; |
|
527 |
+ while (bytesleft > 1 + strlen("obj")) { |
|
528 |
+ /* `- 1` accounts for size of white space before obj */ |
|
529 |
+ idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj")); |
|
530 |
+ if (NULL == idx) { |
|
531 |
+ status = CL_BREAK; |
|
532 |
+ goto done; /* No more objs. */ |
|
530 | 533 |
} |
531 | 534 |
|
532 |
- /* verify that "obj" has a whitespace before it, and is not the end of |
|
533 |
- * a previous string like... "globj" */ |
|
534 |
- q2--; |
|
535 |
- bytesleft -= q2 - start; |
|
535 |
+ /* verify that the word has a whitespace before it, and is not the end of |
|
536 |
+ * a previous word */ |
|
537 |
+ idx--; |
|
538 |
+ bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start); |
|
536 | 539 |
|
537 |
- if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) { |
|
538 |
- /* This instance of the "obj" string appears to be part of another string. |
|
540 |
+ if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) { |
|
541 |
+ /* This instance of "obj" appears to be part of a longer string. |
|
539 | 542 |
* Skip it, and keep searching for an object. */ |
540 |
- start = q2 + 4; |
|
541 |
- bytesleft -= 4; |
|
543 |
+ idx += 1 + strlen("obj"); |
|
544 |
+ bytesleft -= 1 + strlen("obj"); |
|
542 | 545 |
continue; |
543 | 546 |
} |
544 | 547 |
|
545 |
- break; /* Found it. q2 should point to the whitespace before the "obj" string */ |
|
546 |
- } |
|
548 |
+ /* Found the beginning of the word */ |
|
549 |
+ obj_begin = idx; |
|
550 |
+ obj_end = idx + 1 + strlen("obj"); |
|
547 | 551 |
|
548 |
- if (bytesleft <= 0) { |
|
549 |
- status = CL_BREAK; /* No "obj" found. */ |
|
550 |
- goto done; |
|
552 |
+ break; |
|
551 | 553 |
} |
552 | 554 |
|
553 |
- /* "obj" found! */ |
|
555 |
+ if ((NULL == obj_begin) || (NULL == obj_end)) { |
|
556 |
+ status = CL_BREAK; |
|
557 |
+ goto done; /* No more objs. */ |
|
558 |
+ } |
|
554 | 559 |
|
555 | 560 |
/* Find the generation id (genid) that appears before the "obj" */ |
556 |
- q = findNextNonWSBack(q2 - 1, start); |
|
557 |
- while (q > start && isdigit(*q)) |
|
558 |
- q--; |
|
561 |
+ genid_search_index = findNextNonWSBack(obj_begin - 1, start); |
|
562 |
+ while (genid_search_index > start && isdigit(*genid_search_index)) |
|
563 |
+ genid_search_index--; |
|
559 | 564 |
|
560 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
565 |
+ if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) { |
|
561 | 566 |
cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs); |
562 | 567 |
/* Failed to parse, probably not a real object. Skip past the "obj" thing, and continue. */ |
563 |
- pdf->offset = q2 + 4 - pdf->map; |
|
568 |
+ pdf->offset = obj_end - pdf->map; |
|
564 | 569 |
status = CL_EPARSE; |
565 | 570 |
goto done; |
566 | 571 |
} else if (temp_long < 0) { |
567 | 572 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long); |
568 |
- pdf->offset = q2 + 4 - pdf->map; |
|
573 |
+ pdf->offset = obj_end - pdf->map; |
|
569 | 574 |
status = CL_EPARSE; |
570 | 575 |
goto done; |
571 | 576 |
} |
572 | 577 |
genid = (unsigned long)temp_long; |
573 | 578 |
|
574 |
- /* Find the object id (objid) that appers before the genid */ |
|
575 |
- q = findNextNonWSBack(q - 1, start); |
|
576 |
- while (q > start && isdigit(*q)) |
|
577 |
- q--; |
|
579 |
+ /* Find the object id (objid) that appears before the genid */ |
|
580 |
+ objid_search_index = findNextNonWSBack(genid_search_index - 1, start); |
|
581 |
+ while (objid_search_index > start && isdigit(*objid_search_index)) |
|
582 |
+ objid_search_index--; |
|
578 | 583 |
|
579 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
584 |
+ if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) { |
|
580 | 585 |
/* |
581 |
- * PDFs with multiple revisions will have %%EOF before the end of the file, |
|
582 |
- * followed by the next revision of the PDF. If this is the case, we can |
|
583 |
- * detect it and continue parsing after the %%EOF. |
|
586 |
+ * Edge case: |
|
587 |
+ * |
|
588 |
+ * PDFs with multiple revisions will have %%EOF before the end of the file, |
|
589 |
+ * followed by the next revision of the PDF, which will probably be an immediate objid. |
|
590 |
+ * |
|
591 |
+ * Example: |
|
592 |
+ * %%EOF1 1 obj <blah> endobj |
|
593 |
+ * |
|
594 |
+ * If this is the case, we can detect it and continue parsing after the %%EOF. |
|
584 | 595 |
*/ |
585 |
- if (q - 4 > start) { |
|
586 |
- const char *lastfile = q - 4; |
|
596 |
+ if (objid_search_index - strlen("\%\%EO") > start) { |
|
597 |
+ const char *lastfile = objid_search_index - strlen("\%\%EO"); |
|
587 | 598 |
if (0 != strncmp(lastfile, "\%\%EOF", 5)) { |
588 | 599 |
/* Nope, wasn't %%EOF */ |
589 | 600 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
590 | 601 |
/* Skip past the "obj" thing, and continue. */ |
591 |
- pdf->offset = q2 + 4 - pdf->map; |
|
602 |
+ pdf->offset = obj_end - pdf->map; |
|
592 | 603 |
status = CL_EPARSE; |
593 | 604 |
goto done; |
594 | 605 |
} |
595 |
- /* Yup, Looks, like the file continues after %%EOF. |
|
606 |
+ /* Yup, looks like the file continues after %%EOF.
|
596 | 607 |
* Probably another revision. Keep parsing... */ |
597 |
- q++; |
|
598 |
- cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at %zu\n", (size_t)q); |
|
608 |
+ objid_search_index++; |
|
609 |
+ cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map)); |
|
599 | 610 |
} else { |
600 | 611 |
/* Failed parsing at the very beginning */ |
601 | 612 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
602 | 613 |
/* Probably not a real object. Skip past the "obj" thing, and continue. */ |
603 |
- pdf->offset = q2 + 4 - pdf->map; |
|
614 |
+ pdf->offset = obj_end - pdf->map; |
|
604 | 615 |
status = CL_EPARSE; |
605 | 616 |
goto done; |
606 | 617 |
} |
607 | 618 |
/* Try again, with offset slightly adjusted */ |
608 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
619 |
+ if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) { |
|
609 | 620 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
610 | 621 |
/* Still failed... Probably not a real object. Skip past the "obj" thing, and continue. */ |
611 |
- pdf->offset = q2 + 4 - pdf->map; |
|
622 |
+ pdf->offset = obj_end - pdf->map; |
|
612 | 623 |
status = CL_EPARSE; |
613 | 624 |
goto done; |
614 | 625 |
} else if (temp_long < 0) { |
615 | 626 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
616 |
- pdf->offset = q2 + 4 - pdf->map; |
|
627 |
+ pdf->offset = obj_end - pdf->map; |
|
617 | 628 |
status = CL_EPARSE; |
618 | 629 |
goto done; |
619 | 630 |
} |
... | ... |
@@ -621,82 +660,52 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf) |
621 | 621 |
cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n"); |
622 | 622 |
} else if (temp_long < 0) { |
623 | 623 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
624 |
- pdf->offset = q2 + 4 - pdf->map; |
|
624 |
+ pdf->offset = obj_end - pdf->map; |
|
625 | 625 |
status = CL_EPARSE; |
626 | 626 |
goto done; |
627 | 627 |
} |
628 | 628 |
objid = (unsigned long)temp_long; |
629 | 629 |
|
630 |
- /* |
|
631 |
- * Ok so we have the objid, genid, and "obj" string. |
|
632 |
- * Time to store that information and then ... |
|
633 |
- * ... investigate what kind of object this is. |
|
634 |
- */ |
|
635 | 630 |
obj->id = (objid << 8) | (genid & 0xff); |
636 |
- obj->start = q2 + 4 - pdf->map; /* obj start begins just after the "obj" string */ |
|
631 |
+ obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */ |
|
637 | 632 |
obj->flags = 0; |
638 | 633 |
|
639 |
- bytesleft -= 4; |
|
640 |
- eof = pdf->map + pdf->size; |
|
641 |
- q = pdf->map + obj->start; |
|
642 |
- |
|
643 |
- while (q < eof && bytesleft > 0) { |
|
644 |
- off_t p_stream, p_endstream; |
|
645 |
- q2 = pdf_nextobject(q, bytesleft); |
|
646 |
- if (!q2) |
|
647 |
- q2 = pdf->map + pdf->size; /* No interesting objects found, fast-forward to eof */ |
|
648 |
- |
|
649 |
- bytesleft -= q2 - q; |
|
650 |
- if (find_stream_bounds(q - 1, q2 - q, bytesleft + (q2 - q), &p_stream, &p_endstream, 1)) { |
|
651 |
- /* |
|
652 |
- * Found obj that contains a stream. |
|
653 |
- */ |
|
654 |
- obj->flags |= 1 << OBJ_STREAM; |
|
655 |
- q2 = q - 1 + p_endstream + 9; |
|
656 |
- bytesleft -= q2 - q + 1; |
|
657 |
- |
|
658 |
- if (bytesleft < 0) { |
|
659 |
- /* ... and the stream is truncated. Hmm... */ |
|
660 |
- obj->flags |= 1 << OBJ_TRUNCATED; |
|
661 |
- pdf->offset = pdf->size; |
|
662 |
- |
|
663 |
- status = CL_SUCCESS; |
|
664 |
- goto done; /* Truncated file, no end to obj/stream. |
|
665 |
- * The next call to pdf_findobj() will return no more objects. */ |
|
666 |
- } |
|
667 |
- } else if ((q3 = cli_memstr(q - 1, q2 - q + 1, "endobj", 6))) { |
|
668 |
- /* |
|
669 |
- * obj found and offset positioned. ideal return case |
|
670 |
- */ |
|
671 |
- q2 = q3 + 6; |
|
672 |
- pdf->offset = q2 - pdf->map; /* update the offset to just after the endobj */ |
|
673 |
- |
|
674 |
- status = CL_SUCCESS; |
|
675 |
- goto done; |
|
676 |
- } else { |
|
677 |
- q2++; |
|
678 |
- bytesleft--; |
|
679 |
- } |
|
680 |
- |
|
681 |
- q = q2; |
|
634 |
+ /* |
|
635 |
+ * We now have the objid, genid, and object start. |
|
636 |
+ * Find the object end ("endobj"). |
|
637 |
+ */ |
|
638 |
+ /* Search for "endobj" from just after the "obj" keyword through the end of the PDF map. */
|
639 |
+ endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj")); |
|
640 |
+ if (NULL == endobj_begin) { |
|
641 |
+ /* No end to object. |
|
642 |
+ * PDF appears to be malformed or truncated. |
|
643 |
+ * Will record the object size as going to the end of the file.
|
644 |
+ * Will record that the object is truncated. |
|
645 |
+ * Will position the pdf offset to the end of the PDF. |
|
646 |
+ * The next iteration of this function will find no more objects. */ |
|
647 |
+ obj->flags |= 1 << OBJ_TRUNCATED; |
|
648 |
+ obj->size = (pdf->map + pdf->size) - obj_end; |
|
649 |
+ pdf->offset = pdf->size; |
|
650 |
+ |
|
651 |
+ /* Truncated "object" found! */ |
|
652 |
+ status = CL_SUCCESS; |
|
653 |
+ goto done; |
|
682 | 654 |
} |
655 |
+ endobj_end = endobj_begin + strlen("endobj"); |
|
683 | 656 |
|
684 |
- obj->flags |= 1 << OBJ_TRUNCATED; |
|
685 |
- pdf->offset = pdf->size; |
|
657 |
+ /* Size of the object goes from "obj" <-> "endobj". */
|
658 |
+ obj->size = endobj_begin - obj_end; |
|
659 |
+ pdf->offset = endobj_end - pdf->map; |
|
686 | 660 |
|
661 |
+ /* |
|
662 |
+ * Object found! |
|
663 |
+ */ |
|
687 | 664 |
status = CL_SUCCESS; /* complete object: both "obj" and "endobj" were found. */ |
688 | 665 |
|
689 | 666 |
done: |
690 | 667 |
if (status == CL_SUCCESS) { |
691 |
- cli_dbgmsg("pdf_findobj: found %d %d obj @%lld\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff)); |
|
668 |
+ cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size); |
|
692 | 669 |
} else { |
693 |
- if (status == CL_BREAK) { |
|
694 |
- cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs); |
|
695 |
- } else if (status == CL_EMEM) { |
|
696 |
- cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs); |
|
697 |
- } else { |
|
698 |
- cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status); |
|
699 |
- } |
|
700 | 670 |
/* Remove the unused obj reference from our list of objects found */ |
701 | 671 |
/* No need to realloc pdf->objs back down. It won't leak. */ |
702 | 672 |
pdf->objs[pdf->nobjs - 1] = NULL; |
... | ... |
@@ -705,6 +714,14 @@ done: |
705 | 705 |
/* Free up the obj struct. */ |
706 | 706 |
if (NULL != obj) |
707 | 707 |
free(obj); |
708 |
+ |
|
709 |
+ if (status == CL_BREAK) { |
|
710 |
+ cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs); |
|
711 |
+ } else if (status == CL_EMEM) { |
|
712 |
+ cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs); |
|
713 |
+ } else { |
|
714 |
+ cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status); |
|
715 |
+ } |
|
708 | 716 |
} |
709 | 717 |
|
710 | 718 |
return status; |
... | ... |
@@ -828,14 +845,14 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
828 | 828 |
|
829 | 829 |
/** |
830 | 830 |
* @brief Find and interpret the "/Length" dictionary key value. |
831 |
- * |
|
831 |
+ * |
|
832 | 832 |
* The value may be: |
833 |
- * - a direct object (i.e. just a number) |
|
833 |
+ * - a direct object (i.e. just a number) |
|
834 | 834 |
* - an indirect object, where the value is somewhere else in the document and we have to look it up. |
835 | 835 |
* indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'. |
836 |
- * |
|
836 |
+ * |
|
837 | 837 |
* Example dictionary with a single key "/Length" that relies on a direct object for the value. |
838 |
- * |
|
838 |
+ * |
|
839 | 839 |
* 1 0 obj |
840 | 840 |
* << /Length 534 |
841 | 841 |
* /Filter [ /ASCII85Decode /LZWDecode ] |
... | ... |
@@ -849,9 +866,9 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
849 | 849 |
* JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~> |
850 | 850 |
* endstream |
851 | 851 |
* endobj |
852 |
- * |
|
852 |
+ * |
|
853 | 853 |
* Example dictionary with a single key "/Length" that relies on an indirect object for the value. |
854 |
- * |
|
854 |
+ * |
|
855 | 855 |
* 7 0 obj |
856 | 856 |
* << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0. |
857 | 857 |
* stream |
... | ... |
@@ -862,11 +879,11 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
862 | 862 |
* ET |
863 | 863 |
* endstream |
864 | 864 |
* endobj |
865 |
- * |
|
865 |
+ * |
|
866 | 866 |
* 8 0 obj |
867 | 867 |
* 77 % The length of the preceding stream |
868 | 868 |
* endobj |
869 |
- * |
|
869 |
+ * |
|
870 | 870 |
* @param pdf Pdf context structure. |
871 | 871 |
* @param obj Pdf object context structure. |
872 | 872 |
* @param start Pointer start of the dictionary string. |
... | ... |
@@ -906,7 +923,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
906 | 906 |
if (!obj_start) |
907 | 907 |
return 0; |
908 | 908 |
|
909 |
- if (bytes_remaining < obj_start - index) { |
|
909 |
+ if (bytes_remaining < (size_t)(obj_start - index)) { |
|
910 | 910 |
return 0; |
911 | 911 |
} |
912 | 912 |
bytes_remaining -= obj_start - index; |
... | ... |
@@ -923,10 +940,10 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
923 | 923 |
} |
924 | 924 |
length = (size_t)temp_long; /* length or maybe object id */ |
925 | 925 |
|
926 |
- /* |
|
927 |
- * Keep parsing, skipping past the first integer that might have been what we wanted. |
|
928 |
- * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' |
|
929 |
- * I.e. something like " 0 R" |
|
926 |
+ /* |
|
927 |
+ * Keep parsing, skipping past the first integer that might have been what we wanted. |
|
928 |
+ * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' |
|
929 |
+ * I.e. something like " 0 R" |
|
930 | 930 |
*/ |
931 | 931 |
while ((bytes_remaining > 0) && isdigit(*index)) { |
932 | 932 |
index++; |
... | ... |
@@ -958,8 +975,8 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
958 | 958 |
} |
959 | 959 |
|
960 | 960 |
if (index[0] == ' ' && index[1] == 'R') { |
961 |
- /* |
|
962 |
- * Ok so we found a genid and that 'R'. Which means that first value |
|
961 |
+ /* |
|
962 |
+ * Ok so we found a genid and that 'R'. Which means that first value |
|
963 | 963 |
* was actually the objid. |
964 | 964 |
* We can look up the indirect object using this information. |
965 | 965 |
*/ |
... | ... |
@@ -984,7 +1001,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
984 | 984 |
return 0; |
985 | 985 |
} |
986 | 986 |
|
987 |
- if (bytes_remaining < index - indirect_obj_start) { |
|
987 |
+ if (bytes_remaining < (size_t)(index - indirect_obj_start)) { |
|
988 | 988 |
return 0; |
989 | 989 |
} |
990 | 990 |
bytes_remaining -= index - indirect_obj_start; |
... | ... |
@@ -1002,7 +1019,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
1002 | 1002 |
} |
1003 | 1003 |
|
1004 | 1004 |
/* limit length */ |
1005 |
- if (obj_start - pdf->map + length + 5 > pdf->size) |
|
1005 |
+ if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size) |
|
1006 | 1006 |
length = pdf->size - (obj_start - pdf->map) - 5; |
1007 | 1007 |
|
1008 | 1008 |
return length; |
... | ... |
@@ -1010,101 +1027,6 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
1010 | 1010 |
|
1011 | 1011 |
#define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION)) |
1012 | 1012 |
|
1013 |
-static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary) |
|
1014 |
-{ |
|
1015 |
- if (0 == obj->size) { |
|
1016 |
- /* |
|
1017 |
- * Programmatically determine size if not already known. |
|
1018 |
- */ |
|
1019 |
- unsigned i = 0; |
|
1020 |
- |
|
1021 |
- /* Find the index of the current object */ |
|
1022 |
- for (i = 0; i < pdf->nobjs; i++) { |
|
1023 |
- if (pdf->objs[i] == obj) |
|
1024 |
- break; |
|
1025 |
- } |
|
1026 |
- |
|
1027 |
- /* Find the next object that exists in the same buffer (pdf fmap, or object stream) */ |
|
1028 |
- if (i < pdf->nobjs) { |
|
1029 |
- i++; |
|
1030 |
- } |
|
1031 |
- |
|
1032 |
- if (obj->objstm == NULL) { |
|
1033 |
- /* Current object isn't in an object stream, we want to find |
|
1034 |
- * the next object that also isn't in an object stream. */ |
|
1035 |
- for (; i < pdf->nobjs; i++) { |
|
1036 |
- if (pdf->objs[i]->objstm == NULL) |
|
1037 |
- break; |
|
1038 |
- } |
|
1039 |
- } else { |
|
1040 |
- /* Current object is in an object stream, we want to find |
|
1041 |
- * the next object that is in the same object stream. |
|
1042 |
- * |
|
1043 |
- * This really shouldn't happen, so throw a warning and |
|
1044 |
- * then see if we can solve it anyhow */ |
|
1045 |
- cli_warnmsg("obj_size: Encountered pdf object in an object stream that has an unknown size!!\n"); |
|
1046 |
- |
|
1047 |
- for (; i < pdf->nobjs; i++) { |
|
1048 |
- if (pdf->objs[i]->objstm == obj->objstm) |
|
1049 |
- break; |
|
1050 |
- } |
|
1051 |
- } |
|
1052 |
- |
|
1053 |
- /* Step backwards from the "next" object to find the end of the current object */ |
|
1054 |
- if (i < pdf->nobjs) { |
|
1055 |
- int s = pdf->objs[i]->start - obj->start - 4; |
|
1056 |
- if (s > 0) { |
|
1057 |
- if (!binary) { |
|
1058 |
- const char *p = NULL; |
|
1059 |
- const char *q = NULL; |
|
1060 |
- |
|
1061 |
- if (obj->objstm == NULL) { |
|
1062 |
- p = pdf->map + obj->start; |
|
1063 |
- } else { |
|
1064 |
- p = obj->objstm->streambuf + obj->start; |
|
1065 |
- } |
|
1066 |
- q = p + s; |
|
1067 |
- |
|
1068 |
- while (q > p && (isspace(*q) || isdigit(*q))) |
|
1069 |
- q--; |
|
1070 |
- |
|
1071 |
- if (q > p + 5 && !memcmp(q - 5, "endobj", 6)) |
|
1072 |
- q -= 6; |
|
1073 |
- |
|
1074 |
- q = findNextNonWSBack(q, p); |
|
1075 |
- q++; |
|
1076 |
- |
|
1077 |
- obj->size = q - p; |
|
1078 |
- goto done; |
|
1079 |
- } |
|
1080 |
- |
|
1081 |
- obj->size = s; |
|
1082 |
- goto done; |
|
1083 |
- } |
|
1084 |
- } |
|
1085 |
- |
|
1086 |
- /* If we've gotten this far, we didn't find a "next" object... so our |
|
1087 |
- * current object must be at the end of the pdf fmap or the end of the |
|
1088 |
- * object stream. */ |
|
1089 |
- if (obj->objstm == NULL) { |
|
1090 |
- /* Current object isn't in an object stream, so we can determine object |
|
1091 |
- * size based on the remaining size of the file (in theory). */ |
|
1092 |
- if (binary) |
|
1093 |
- obj->size = pdf->size - obj->start; |
|
1094 |
- else |
|
1095 |
- obj->size = pdf->offset - obj->start - 6; /* This hack I think assumes that we reached the end of the file when finding objects. */ |
|
1096 |
- } else { |
|
1097 |
- /* Current object is in an object stream, we want to find |
|
1098 |
- * the next object that is in the same object stream. */ |
|
1099 |
- obj->size = obj->objstm->streambuf_len - obj->start; |
|
1100 |
- } |
|
1101 |
- } |
|
1102 |
- |
|
1103 |
-done: |
|
1104 |
- |
|
1105 |
- return obj->size; |
|
1106 |
-} |
|
1107 |
- |
|
1108 | 1013 |
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid) |
1109 | 1014 |
{ |
1110 | 1015 |
int ret; |
... | ... |
@@ -1472,319 +1394,317 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) |
1472 | 1472 |
if (!(flags & PDF_EXTRACT_OBJ_SCAN)) |
1473 | 1473 |
obj->path = strdup(fullname); |
1474 | 1474 |
|
1475 |
- do { |
|
1476 |
- if (obj->flags & (1 << OBJ_STREAM)) { |
|
1477 |
- const char *start = pdf->map + obj->start; |
|
1478 |
- off_t p_stream = 0, p_endstream = 0; |
|
1479 |
- off_t length; |
|
1475 |
+ if ((NULL == obj->objstm) && |
|
1476 |
+ (obj->flags & (1 << OBJ_STREAM))) { |
|
1477 |
+ /* |
|
1478 |
+ * Object contains a stream. Parse this now. |
|
1479 |
+ */ |
|
1480 |
+ cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
1480 | 1481 |
|
1481 |
- if (NULL != obj->objstm) { |
|
1482 |
- cli_warnmsg("pdf_extract_obj: Object found in object stream claims to be an object stream! Skipping.\n"); |
|
1483 |
- break; |
|
1484 |
- } |
|
1482 |
+ const char *start = pdf->map + obj->start; |
|
1485 | 1483 |
|
1486 |
- find_stream_bounds(start, pdf->size - obj->start, |
|
1487 |
- pdf->size - obj->start, |
|
1488 |
- &p_stream, &p_endstream, |
|
1489 |
- pdf->enc_method_stream <= ENC_IDENTITY && |
|
1490 |
- pdf->enc_method_embeddedfile <= ENC_IDENTITY); |
|
1491 |
- |
|
1492 |
- if (p_stream && p_endstream) { |
|
1493 |
- size_t size = p_endstream - p_stream; |
|
1494 |
- off_t orig_length; |
|
1495 |
- int len = p_stream; |
|
1496 |
- const char *pstr; |
|
1497 |
- struct pdf_dict *dparams = NULL; |
|
1498 |
- struct objstm_struct *objstm = NULL; |
|
1499 |
- int xref = 0; |
|
1500 |
- |
|
1501 |
- length = find_length(pdf, obj, start, p_stream); |
|
1502 |
- if (length < 0) |
|
1503 |
- length = 0; |
|
1504 |
- |
|
1505 |
- orig_length = length; |
|
1506 |
- if (length > pdf->size || obj->start + p_stream + length > pdf->size) { |
|
1507 |
- cli_dbgmsg("cli_pdf: length out of file: %lld + %lld > %lld\n", |
|
1508 |
- (long long)p_stream, (long long)length, (long long)pdf->size); |
|
1509 |
- noisy_warnmsg("length out of file, truncated: %lld + %lld > %lld\n", |
|
1510 |
- (long long)p_stream, (long long)length, (long long)pdf->size); |
|
1511 |
- length = pdf->size - (obj->start + p_stream); |
|
1512 |
- } |
|
1484 |
+ size_t length; |
|
1485 |
+ size_t orig_length; |
|
1486 |
+ int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */ |
|
1513 | 1487 |
|
1514 |
- if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) { |
|
1515 |
- const char *q = start + p_endstream; |
|
1516 |
- length = size; |
|
1517 |
- q--; |
|
1488 |
+ const char *pstr; |
|
1489 |
+ struct pdf_dict *dparams = NULL; |
|
1490 |
+ struct objstm_struct *objstm = NULL; |
|
1491 |
+ int xref = 0; |
|
1518 | 1492 |
|
1519 |
- if (*q == '\n') { |
|
1520 |
- q--; |
|
1521 |
- length--; |
|
1493 |
+ /* Find and interpret the length dictionary value */ |
|
1494 |
+ length = find_length(pdf, obj, start, dict_len); |
|
1495 |
+ if (length < 0) |
|
1496 |
+ length = 0; |
|
1522 | 1497 |
|
1523 |
- if (*q == '\r') |
|
1524 |
- length--; |
|
1525 |
- } else if (*q == '\r') { |
|
1526 |
- length--; |
|
1527 |
- } |
|
1498 |
+ orig_length = length; |
|
1528 | 1499 |
|
1529 |
- if (length < 0) |
|
1530 |
- length = 0; |
|
1500 |
+ if (length > obj->stream_size) { |
|
1501 |
+ cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
|
1502 |
+ noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
|
1531 | 1503 |
|
1532 |
- cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length); |
|
1533 |
- } else { |
|
1534 |
- if (size > (size_t)length + 2) { |
|
1535 |
- cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n", |
|
1536 |
- (size_t)length, size); |
|
1537 |
- length = size; |
|
1538 |
- } |
|
1539 |
- } |
|
1504 |
+ length = obj->stream_size; |
|
1505 |
+ } |
|
1540 | 1506 |
|
1541 |
- if (orig_length && size > (size_t)orig_length + 20) { |
|
1542 |
- cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n", |
|
1543 |
- (long long)orig_length, (long long)length, size); |
|
1544 |
- pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
|
1545 |
- } |
|
1507 |
+ if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length <= 0)) { |
|
1508 |
+ /* |
|
1509 |
+ * If the length is unknown and this doesn't contain a FLATE encoded filter... |
|
1510 |
+ * Calculate the length using the stream size, and trimming |
|
1511 |
+ * off any newline/carriage returns from the end of the stream. |
|
1512 |
+ */ |
|
1513 |
+ const char *q = start + obj->stream_size; |
|
1514 |
+ length = obj->stream_size; |
|
1515 |
+ q--; |
|
1516 |
+ |
|
1517 |
+ if (*q == '\n') { |
|
1518 |
+ q--; |
|
1519 |
+ length--; |
|
1520 |
+ |
|
1521 |
+ if (*q == '\r') |
|
1522 |
+ length--; |
|
1523 |
+ } else if (*q == '\r') { |
|
1524 |
+ length--; |
|
1525 |
+ } |
|
1546 | 1526 |
|
1547 |
- if (!length) { |
|
1548 |
- length = size; |
|
1549 |
- if (!length) { |
|
1550 |
- cli_dbgmsg("pdf_extract_obj: length and size both 0\n"); |
|
1551 |
- break; /* Empty stream, nothing to scan */ |
|
1552 |
- } |
|
1553 |
- } |
|
1527 |
+ if (length < 0) |
|
1528 |
+ length = 0; |
|
1529 |
+ |
|
1530 |
+ cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length); |
|
1531 |
+ } else { |
|
1532 |
+ if (obj->stream_size > (size_t)length + 2) { |
|
1533 |
+ cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n", |
|
1534 |
+ (size_t)length, obj->stream_size); |
|
1535 |
+ length = obj->stream_size; |
|
1536 |
+ } |
|
1537 |
+ } |
|
1554 | 1538 |
|
1555 |
- if (cli_memstr(start, p_stream, "/XRef", 5)) |
|
1556 |
- xref = 1; |
|
1539 |
+ if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) { |
|
1540 |
+ cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n", |
|
1541 |
+ (long long)orig_length, (long long)length, obj->stream_size); |
|
1542 |
+ pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
|
1543 |
+ } |
|
1557 | 1544 |
|
1558 |
- cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1545 |
+ if (0 == length) { |
|
1546 |
+ length = obj->stream_size; |
|
1547 |
+ if (0 == length) { |
|
1548 |
+ cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n"); |
|
1549 |
+ goto done; /* Empty stream, nothing to scan */ |
|
1550 |
+ } |
|
1551 |
+ } |
|
1559 | 1552 |
|
1560 |
- /* |
|
1561 |
- * Identify the DecodeParms, if available. |
|
1562 |
- */ |
|
1563 |
- if (NULL != (pstr = pdf_getdict(start, &len, "/DecodeParms"))) { |
|
1564 |
- cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n"); |
|
1565 |
- } else if (NULL != (pstr = pdf_getdict(start, &len, "/DP"))) { |
|
1566 |
- cli_dbgmsg("pdf_extract_obj: Found /DP\n"); |
|
1567 |
- } |
|
1553 |
+ /* Check if XRef is enabled */ |
|
1554 |
+ if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) { |
|
1555 |
+ xref = 1; |
|
1556 |
+ } |
|
1568 | 1557 |
|
1569 |
- if (pstr) { |
|
1570 |
- unsigned int objsize = obj_size(pdf, obj, 1); |
|
1558 |
+ cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1571 | 1559 |
|
1572 |
- /* shift pstr left to "<<" for pdf_parse_dict */ |
|
1573 |
- while ((*pstr == '<') && (pstr > start)) { |
|
1574 |
- pstr--; |
|
1575 |
- len++; |
|
1576 |
- } |
|
1560 |
+ /* |
|
1561 |
+ * Identify the DecodeParms, if available. |
|
1562 |
+ */ |
|
1563 |
+ if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) { |
|
1564 |
+ cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n"); |
|
1565 |
+ } else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) { |
|
1566 |
+ cli_dbgmsg("pdf_extract_obj: Found /DP\n"); |
|
1567 |
+ } |
|
1577 | 1568 |
|
1578 |
- /* shift pstr right to "<<" for pdf_parse_dict */ |
|
1579 |
- while ((*pstr != '<') && (len > 0)) { |
|
1580 |
- pstr++; |
|
1581 |
- len--; |
|
1582 |
- } |
|
1569 |
+ if (pstr) { |
|
1570 |
+ /* shift pstr left to "<<" for pdf_parse_dict */ |
|
1571 |
+ while ((*pstr == '<') && (pstr > start)) { |
|
1572 |
+ pstr--; |
|
1573 |
+ dict_len++; |
|
1574 |
+ } |
|
1583 | 1575 |
|
1584 |
- if (len > 4) |
|
1585 |
- dparams = pdf_parse_dict(pdf, obj, objsize, (char *)pstr, NULL); |
|
1586 |
- else |
|
1587 |
- cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n"); |
|
1576 |
+ /* shift pstr right to "<<" for pdf_parse_dict */ |
|
1577 |
+ while ((*pstr != '<') && (dict_len > 0)) { |
|
1578 |
+ pstr++; |
|
1579 |
+ dict_len--; |
|
1580 |
+ } |
|
1581 |
+ |
|
1582 |
+ if (dict_len > 4) |
|
1583 |
+ dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL); |
|
1584 |
+ else |
|
1585 |
+ cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n"); |
|
1586 |
+ } |
|
1587 |
+ |
|
1588 |
+ /* |
|
1589 |
+ * Go back to the start of the dictionary and check to see if the stream |
|
1590 |
+ * is an object stream. If so, collect the relevant info. |
|
1591 |
+ */ |
|
1592 |
+ dict_len = obj->stream - start; |
|
1593 |
+ if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) { |
|
1594 |
+ int32_t objstm_first = -1; |
|
1595 |
+ int32_t objstm_length = -1; |
|
1596 |
+ int32_t objstm_n = -1; |
|
1597 |
+ |
|
1598 |
+ cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); |
|
1599 |
+ |
|
1600 |
+ dict_len = obj->stream - start; |
|
1601 |
+ if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) { |
|
1602 |
+ cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n"); |
|
1603 |
+ } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) { |
|
1604 |
+ cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n"); |
|
1605 |
+ } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) { |
|
1606 |
+ cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); |
|
1607 |
+ } else { |
|
1608 |
+ /* Add objstm to pdf struct, so it can be freed eventually */ |
|
1609 |
+ pdf->nobjstms++; |
|
1610 |
+ pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
|
1611 |
+ if (!pdf->objstms) { |
|
1612 |
+ cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1613 |
+ pdf_free_dict(dparams); |
|
1614 |
+ return CL_EMEM; |
|
1588 | 1615 |
} |
1589 | 1616 |
|
1590 |
- /* |
|
1591 |
- * Identify if the stream is an object stream. If so, collect the relevant info. |
|
1592 |
- */ |
|
1593 |
- len = p_stream; |
|
1594 |
- if (NULL != (pstr = pdf_getdict(start, &len, "/Type/ObjStm"))) { |
|
1595 |
- int32_t objstm_first = -1; |
|
1596 |
- int32_t objstm_length = -1; |
|
1597 |
- int32_t objstm_n = -1; |
|
1598 |
- |
|
1599 |
- cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); |
|
1600 |
- |
|
1601 |
- len = p_stream; |
|
1602 |
- if ((-1 == (objstm_first = pdf_readint(start, len, "/First")))) { |
|
1603 |
- cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n"); |
|
1604 |
- } else if ((-1 == (objstm_length = pdf_readint(start, len, "/Length")))) { |
|
1605 |
- cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n"); |
|
1606 |
- } else if ((-1 == (objstm_n = pdf_readint(start, len, "/N")))) { |
|
1607 |
- cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); |
|
1608 |
- } else { |
|
1609 |
- /* Add objstm to pdf struct, so it can be freed eventually */ |
|
1610 |
- pdf->nobjstms++; |
|
1611 |
- pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
|
1612 |
- if (!pdf->objstms) { |
|
1613 |
- cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1614 |
- pdf_free_dict(dparams); |
|
1615 |
- return CL_EMEM; |
|
1616 |
- } |
|
1617 |
+ objstm = malloc(sizeof(struct objstm_struct)); |
|
1618 |
+ if (!objstm) { |
|
1619 |
+ cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1620 |
+ pdf_free_dict(dparams); |
|
1621 |
+ return CL_EMEM; |
|
1622 |
+ } |
|
1623 |
+ pdf->objstms[pdf->nobjstms - 1] = objstm; |
|
1617 | 1624 |
|
1618 |
- objstm = malloc(sizeof(struct objstm_struct)); |
|
1619 |
- if (!objstm) { |
|
1620 |
- cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1621 |
- pdf_free_dict(dparams); |
|
1622 |
- return CL_EMEM; |
|
1623 |
- } |
|
1624 |
- pdf->objstms[pdf->nobjstms - 1] = objstm; |
|
1625 |
+ memset(objstm, 0, sizeof(*objstm)); |
|
1625 | 1626 |
|
1626 |
- memset(objstm, 0, sizeof(*objstm)); |
|
1627 |
+ objstm->first = (uint32_t)objstm_first; |
|
1628 |
+ objstm->current = (uint32_t)objstm_first; |
|
1629 |
+ objstm->current_pair = 0; |
|
1630 |
+ objstm->length = (uint32_t)objstm_length; |
|
1631 |
+ objstm->n = (uint32_t)objstm_n; |
|
1627 | 1632 |
|
1628 |
- objstm->first = (uint32_t)objstm_first; |
|
1629 |
- objstm->current = (uint32_t)objstm_first; |
|
1630 |
- objstm->current_pair = 0; |
|
1631 |
- objstm->length = (uint32_t)objstm_length; |
|
1632 |
- objstm->n = (uint32_t)objstm_n; |
|
1633 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first); |
|
1634 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length); |
|
1635 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n); |
|
1636 |
+ } |
|
1637 |
+ } |
|
1638 |
+ |
|
1639 |
+ sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm); |
|
1640 |
+ if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { |
|
1641 |
+ cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); |
|
1642 |
+ |
|
1643 |
+ /* It's ok if we couldn't decode the stream, |
|
1644 |
+ * make a best effort to keep parsing. */ |
|
1645 |
+ if (CL_EPARSE == rc) |
|
1646 |
+ rc = CL_SUCCESS; |
|
1633 | 1647 |
|
1634 |
- cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first); |
|
1635 |
- cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length); |
|
1636 |
- cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n); |
|
1648 |
+ if (NULL != objstm) { |
|
1649 |
+ /* |
|
1650 |
+ * If we were expecting an objstm and there was a failure... |
|
1651 |
+ * discard the memory for last object stream. |
|
1652 |
+ */ |
|
1653 |
+ if (NULL != pdf->objstms) { |
|
1654 |
+ if (NULL != pdf->objstms[pdf->nobjstms - 1]) { |
|
1655 |
+ if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) { |
|
1656 |
+ free(pdf->objstms[pdf->nobjstms - 1]->streambuf); |
|
1657 |
+ pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL; |
|
1658 |
+ } |
|
1659 |
+ free(pdf->objstms[pdf->nobjstms - 1]); |
|
1660 |
+ pdf->objstms[pdf->nobjstms - 1] = NULL; |
|
1637 | 1661 |
} |
1638 |
- } |
|
1639 | 1662 |
|
1640 |
- sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc, objstm); |
|
1641 |
- if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { |
|
1642 |
- cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); |
|
1643 |
- |
|
1644 |
- /* It's ok if we couldn't decode the stream, |
|
1645 |
- * make a best effort to keep parsing. */ |
|
1646 |
- if (CL_EPARSE == rc) |
|
1647 |
- rc = CL_SUCCESS; |
|
1648 |
- |
|
1649 |
- if (NULL != objstm) { |
|
1650 |
- /* |
|
1651 |
- * If we were expecting an objstm and there was a failure... |
|
1652 |
- * discard the memory for last object stream. |
|
1653 |
- */ |
|
1654 |
- if (NULL != pdf->objstms) { |
|
1655 |
- if (NULL != pdf->objstms[pdf->nobjstms - 1]) { |
|
1656 |
- if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) { |
|
1657 |
- free(pdf->objstms[pdf->nobjstms - 1]->streambuf); |
|
1658 |
- pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL; |
|
1659 |
- } |
|
1660 |
- free(pdf->objstms[pdf->nobjstms - 1]); |
|
1661 |
- pdf->objstms[pdf->nobjstms - 1] = NULL; |
|
1662 |
- } |
|
1663 |
+ /* Pop the objstm off the end of the pdf->objstms array. */ |
|
1664 |
+ if (pdf->nobjstms > 0) { |
|
1665 |
+ pdf->nobjstms--; |
|
1666 |
+ if (0 == pdf->nobjstms) { |
|
1667 |
+ free(pdf->objstms); |
|
1668 |
+ pdf->objstms = NULL; |
|
1669 |
+ } else { |
|
1670 |
+ pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
|
1663 | 1671 |
|
1664 |
- /* Pop the objstm off the end of the pdf->objstms array. */ |
|
1665 |
- if (pdf->nobjstms > 0) { |
|
1666 |
- pdf->nobjstms--; |
|
1667 |
- if (0 == pdf->nobjstms) { |
|
1668 |
- free(pdf->objstms); |
|
1669 |
- pdf->objstms = NULL; |
|
1670 |
- } else { |
|
1671 |
- pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
|
1672 |
- |
|
1673 |
- if (!pdf->objstms) { |
|
1674 |
- cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); |
|
1675 |
- return CL_EMEM; |
|
1676 |
- } |
|
1677 |
- } |
|
1678 |
- } else { |
|
1679 |
- /* hm.. this shouldn't happen */ |
|
1680 |
- cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n"); |
|
1672 |
+ if (!pdf->objstms) { |
|
1673 |
+ cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); |
|
1674 |
+ return CL_EMEM; |
|
1681 | 1675 |
} |
1682 | 1676 |
} |
1677 |
+ } else { |
|
1678 |
+ /* hm.. this shouldn't happen */ |
|
1679 |
+ cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n"); |
|
1683 | 1680 |
} |
1684 | 1681 |
} |
1682 |
+ } |
|
1683 |
+ } |
|
1685 | 1684 |
|
1686 |
- if (dparams) |
|
1687 |
- pdf_free_dict(dparams); |
|
1688 |
- |
|
1689 |
- if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) { |
|
1690 |
- sum = 0; /* prevents post-filter scan */ |
|
1691 |
- break; |
|
1692 |
- } |
|
1685 |
+ if (dparams) |
|
1686 |
+ pdf_free_dict(dparams); |
|
1693 | 1687 |
|
1694 |
- cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1695 |
- } else { |
|
1696 |
- noisy_warnmsg("pdf_extract_obj: cannot find stream bounds for obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
1697 |
- } |
|
1688 |
+ if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) { |
|
1689 |
+ sum = 0; /* prevents post-filter scan */ |
|
1690 |
+ goto done; |
|
1691 |
+ } |
|
1698 | 1692 |
|
1699 |
- } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
|
1700 |
- const char *q2; |
|
1701 |
- const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
|
1702 |
- : (const char *)(obj->start + pdf->map); |
|
1693 |
+ cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1703 | 1694 |
|
1704 |
- /* TODO: get obj-endobj size */ |
|
1705 |
- off_t bytesleft = obj_size(pdf, obj, 0); |
|
1695 |
+ } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
|
1696 |
+ const char *q2; |
|
1697 |
+ const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
|
1698 |
+ : (const char *)(obj->start + pdf->map); |
|
1706 | 1699 |
|
1707 |
- if (bytesleft < 0) |
|
1708 |
- break; |
|
1700 |
+ /* TODO: get obj-endobj size */ |
|
1701 |
+ off_t bytesleft = obj->size; |
|
1709 | 1702 |
|
1710 |
- do { |
|
1711 |
- char *js = NULL; |
|
1712 |
- size_t js_len = 0; |
|
1713 |
- const char *q3; |
|
1703 |
+ if (bytesleft < 0) { |
|
1704 |
+ goto done; |
|
1705 |
+ } |
|
1714 | 1706 |
|
1715 |
- q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); |
|
1716 |
- if (!q2) |
|
1717 |
- break; |
|
1707 |
+ do { |
|
1708 |
+ char *js = NULL; |
|
1709 |
+ size_t js_len = 0; |
|
1710 |
+ const char *q3; |
|
1718 | 1711 |
|
1719 |
- bytesleft -= q2 - q + 11; |
|
1720 |
- q = q2 + 11; |
|
1712 |
+ q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); |
|
1713 |
+ if (!q2) |
|
1714 |
+ break; |
|
1721 | 1715 |
|
1722 |
- js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF))); |
|
1723 |
- bytesleft -= q2 - q; |
|
1724 |
- q = q2; |
|
1716 |
+ bytesleft -= q2 - q + 11; |
|
1717 |
+ q = q2 + 11; |
|
1725 | 1718 |
|
1726 |
- if (js) { |
|
1727 |
- char *decrypted = NULL; |
|
1728 |
- const char *out = js; |
|
1729 |
- js_len = strlen(js); |
|
1730 |
- if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
|
1731 |
- cli_dbgmsg("pdf_extract_obj: encrypted string\n"); |
|
1732 |
- decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); |
|
1719 |
+ js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF))); |
|
1720 |
+ bytesleft -= q2 - q; |
|
1721 |
+ q = q2; |
|
1733 | 1722 |
|
1734 |
- if (decrypted) { |
|
1735 |
- noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
1736 |
- out = decrypted; |
|
1737 |
- } |
|
1738 |
- } |
|
1723 |
+ if (js) { |
|
1724 |
+ char *decrypted = NULL; |
|
1725 |
+ const char *out = js; |
|
1726 |
+ js_len = strlen(js); |
|
1727 |
+ if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
|
1728 |
+ cli_dbgmsg("pdf_extract_obj: encrypted string\n"); |
|
1729 |
+ decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); |
|
1739 | 1730 |
|
1740 |
- if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) { |
|
1741 |
- rc = CL_EWRITE; |
|
1742 |
- free(js); |
|
1743 |
- break; |
|
1731 |
+ if (decrypted) { |
|
1732 |
+ noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
1733 |
+ out = decrypted; |
|
1744 | 1734 |
} |
1735 |
+ } |
|
1745 | 1736 |
|
1746 |
- free(decrypted); |
|
1737 |
+ if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) { |
|
1738 |
+ rc = CL_EWRITE; |
|
1747 | 1739 |
free(js); |
1748 |
- cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft); |
|
1740 |
+ break; |
|
1741 |
+ } |
|
1749 | 1742 |
|
1750 |
- if (bytesleft > 0) { |
|
1751 |
- q2 = pdf_nextobject(q, bytesleft); |
|
1752 |
- if (!q2) |
|
1753 |
- q2 = q + bytesleft - 1; |
|
1743 |
+ free(decrypted); |
|
1744 |
+ free(js); |
|
1745 |
+ cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft); |
|
1754 | 1746 |
|
1755 |
- /* non-conforming PDFs that don't escape ) properly */ |
|
1756 |
- q3 = memchr(q, ')', bytesleft); |
|
1757 |
- if (q3 && q3 < q2) |
|
1758 |
- q2 = q3; |
|
1747 |
+ if (bytesleft > 0) { |
|
1748 |
+ q2 = pdf_nextobject(q, bytesleft); |
|
1749 |
+ if (!q2) |
|
1750 |
+ q2 = q + bytesleft - 1; |
|
1759 | 1751 |
|
1760 |
- while (q2 > q && q2[-1] == ' ') |
|
1761 |
- q2--; |
|
1752 |
+ /* non-conforming PDFs that don't escape ) properly */ |
|
1753 |
+ q3 = memchr(q, ')', bytesleft); |
|
1754 |
+ if (q3 && q3 < q2) |
|
1755 |
+ q2 = q3; |
|
1762 | 1756 |
|
1763 |
- if (q2 > q) { |
|
1764 |
- q--; |
|
1765 |
- filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum); |
|
1766 |
- q++; |
|
1767 |
- } |
|
1757 |
+ while (q2 > q && q2[-1] == ' ') |
|
1758 |
+ q2--; |
|
1759 |
+ |
|
1760 |
+ if (q2 > q) { |
|
1761 |
+ q--; |
|
1762 |
+ filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum); |
|
1763 |
+ q++; |
|
1768 | 1764 |
} |
1769 | 1765 |
} |
1766 |
+ } |
|
1770 | 1767 |
|
1771 |
- } while (bytesleft > 0); |
|
1772 |
- } else { |
|
1773 |
- off_t bytesleft = obj_size(pdf, obj, 0); |
|
1774 |
- |
|
1775 |
- if (bytesleft < 0) |
|
1776 |
- rc = CL_EFORMAT; |
|
1777 |
- else { |
|
1778 |
- if (obj->objstm) { |
|
1779 |
- if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
|
1780 |
- rc = CL_EWRITE; |
|
1781 |
- } else { |
|
1782 |
- if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
|
1783 |
- rc = CL_EWRITE; |
|
1784 |
- } |
|
1768 |
+ } while (bytesleft > 0); |
|
1769 |
+ } else { |
|
1770 |
+ off_t bytesleft = obj->size; |
|
1771 |
+ |
|
1772 |
+ if (bytesleft < 0) |
|
1773 |
+ rc = CL_EFORMAT; |
|
1774 |
+ else { |
|
1775 |
+ if (obj->objstm) { |
|
1776 |
+ if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
|
1777 |
+ rc = CL_EWRITE; |
|
1778 |
+ } else { |
|
1779 |
+ if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
|
1780 |
+ rc = CL_EWRITE; |
|
1785 | 1781 |
} |
1786 | 1782 |
} |
1787 |
- } while (0); |
|
1783 |
+ } |
|
1784 |
+ |
|
1785 |
+done: |
|
1788 | 1786 |
|
1789 | 1787 |
cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff); |
1790 | 1788 |
cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname); |
... | ... |
@@ -2093,7 +2013,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2093 | 2093 |
const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL; |
2094 | 2094 |
const char *q = NULL; |
2095 | 2095 |
const char *dict = NULL, *enddict = NULL, *start = NULL; |
2096 |
- off_t dict_length = 0, full_dict_length = 0, objsize = 0, bytesleft = 0; |
|
2096 |
+ off_t dict_length = 0, full_dict_length = 0, bytesleft = 0; |
|
2097 | 2097 |
size_t i = 0; |
2098 | 2098 |
unsigned filters = 0, blockopens = 0; |
2099 | 2099 |
enum objstate objstate = STATE_NONE; |
... | ... |
@@ -2106,6 +2026,8 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2106 | 2106 |
return; |
2107 | 2107 |
} |
2108 | 2108 |
|
2109 |
+ cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
2110 |
+ |
|
2109 | 2111 |
if (obj->objstm) { |
2110 | 2112 |
if ((size_t)obj->start > obj->objstm->streambuf_len) { |
2111 | 2113 |
cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n", |
... | ... |
@@ -2123,14 +2045,38 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2123 | 2123 |
} |
2124 | 2124 |
start = q; |
2125 | 2125 |
|
2126 |
- objsize = obj_size(pdf, obj, 1); |
|
2127 |
- if (objsize < 0) |
|
2126 |
+ if (obj->size <= 0) |
|
2128 | 2127 |
return; |
2129 | 2128 |
|
2130 | 2129 |
if (obj->objstm) { |
2131 |
- bytesleft = MIN(objsize, obj->objstm->streambuf_len - obj->start); |
|
2130 |
+ bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start); |
|
2132 | 2131 |
} else { |
2133 |
- bytesleft = MIN(objsize, pdf->size - obj->start); |
|
2132 |
+ bytesleft = MIN(obj->size, pdf->size - obj->start); |
|
2133 |
+ } |
|
2134 |
+ |
|
2135 |
+ /* For objects that aren't already in an object stream^, check if they contain a stream. |
|
2136 |
+ * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */ |
|
2137 |
+ if (NULL == obj->objstm) { |
|
2138 |
+ /* Check if object contains stream */ |
|
2139 |
+ cl_error_t has_stream; |
|
2140 |
+ const char *stream = NULL; |
|
2141 |
+ size_t stream_size = 0; |
|
2142 |
+ |
|
2143 |
+ has_stream = find_stream_bounds( |
|
2144 |
+ start, |
|
2145 |
+ obj->size, |
|
2146 |
+ &stream, |
|
2147 |
+ &stream_size, |
|
2148 |
+ (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY)); |
|
2149 |
+ |
|
2150 |
+ if ((CL_SUCCESS == has_stream) || |
|
2151 |
+ (CL_EFORMAT == has_stream)) { |
|
2152 |
+ /* Stream found. Store this fact and the stream bounds. */ |
|
2153 |
+ cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size); |
|
2154 |
+ obj->flags |= (1 << OBJ_STREAM); |
|
2155 |
+ obj->stream = stream; |
|
2156 |
+ obj->stream_size = stream_size; |
|
2157 |
+ } |
|
2134 | 2158 |
} |
2135 | 2159 |
|
2136 | 2160 |
/* find start of dictionary */ |
... | ... |
@@ -2181,7 +2127,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2181 | 2181 |
dict = q3 + 2; |
2182 | 2182 |
q = dict; |
2183 | 2183 |
blockopens++; |
2184 |
- bytesleft = objsize - (q - start); |
|
2184 |
+ bytesleft = obj->size - (q - start); |
|
2185 | 2185 |
enddict = q + bytesleft - 1; |
2186 | 2186 |
|
2187 | 2187 |
/* find end of dictionary block */ |
... | ... |
@@ -2329,7 +2275,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2329 | 2329 |
pdfobj_flag(pdf, obj, LINEARIZED_PDF); |
2330 | 2330 |
objstate = STATE_NONE; |
2331 | 2331 |
trailer_end = pdf_readint(dict, full_dict_length, "/H"); |
2332 |
- if (trailer_end > 0 && trailer_end < pdf->size) { |
|
2332 |
+ if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) { |
|
2333 | 2333 |
trailer = trailer_end - 1024; |
2334 | 2334 |
if (trailer < 0) |
2335 | 2335 |
trailer = 0; |
... | ... |
@@ -2939,7 +2885,7 @@ void pdf_handle_enc(struct pdf_struct *pdf) |
2939 | 2939 |
return; |
2940 | 2940 |
} |
2941 | 2941 |
|
2942 |
- len = obj_size(pdf, obj, 1); |
|
2942 |
+ len = obj->size; |
|
2943 | 2943 |
q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
2944 | 2944 |
: (const char *)(obj->start + pdf->map); |
2945 | 2945 |
|
... | ... |
@@ -3095,20 +3041,20 @@ void pdf_handle_enc(struct pdf_struct *pdf) |
3095 | 3095 |
} |
3096 | 3096 |
|
3097 | 3097 |
/** |
3098 |
- * @brief Search pdf buffer for objects. Parse each. |
|
3099 |
- * |
|
3098 |
+ * @brief Search pdf buffer for objects. Parse each. |
|
3099 |
+ * |
|
3100 | 3100 |
* Newly found objects will be extracted after completion when the extraction for loop continues. |
3101 |
- * |
|
3102 |
- * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
3101 |
+ * |
|
3102 |
+ * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
3103 | 3103 |
* @param objstm Pointer to an object stream to parse. |
3104 |
- * |
|
3104 |
+ * |
|
3105 | 3105 |
* @return cl_error_t Error code. |
3106 | 3106 |
*/ |
3107 | 3107 |
cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm) |
3108 | 3108 |
{ |
3109 |
- cl_error_t status = CL_EFORMAT; |
|
3110 |
- cl_error_t retval = CL_EPARSE; |
|
3111 |
- int32_t foundobj = 0, alerts = 0; |
|
3109 |
+ cl_error_t status = CL_EFORMAT; |
|
3110 |
+ cl_error_t retval = CL_EPARSE; |
|
3111 |
+ int32_t alerts = 0; |
|
3112 | 3112 |
uint32_t badobjects = 0; |
3113 | 3113 |
size_t i = 0; |
3114 | 3114 |
|
... | ... |
@@ -3119,9 +3065,6 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs |
3119 | 3119 |
goto done; |
3120 | 3120 |
} |
3121 | 3121 |
|
3122 |
- char *current_pair = objstm->streambuf; |
|
3123 |
- char *current_obj = objstm->streambuf + objstm->first; |
|
3124 |
- |
|
3125 | 3122 |
if ((0 == objstm->first) || |
3126 | 3123 |
(0 == objstm->streambuf_len) || |
3127 | 3124 |
(0 == objstm->n)) { |
... | ... |
@@ -3183,18 +3126,17 @@ done: |
3183 | 3183 |
|
3184 | 3184 |
/** |
3185 | 3185 |
* @brief Search pdf buffer for objects. Parse each and then extract each. |
3186 |
- * |
|
3186 |
+ * |
|
3187 | 3187 |
* @param pdf Pdf struct that keeps track of all information found in the PDF. |
3188 | 3188 |
* @param alerts[in/out] The number of alerts, relevant in ALLMATCH mode. |
3189 |
- * |
|
3189 |
+ * |
|
3190 | 3190 |
* @return cl_error_t Error code. |
3191 | 3191 |
*/ |
3192 | 3192 |
cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts) |
3193 | 3193 |
{ |
3194 |
- cl_error_t status = CL_SUCCESS; |
|
3195 |
- int32_t rv = 0; |
|
3196 |
- int foundobj = 0; |
|
3197 |
- unsigned int i = 0, j = 0; |
|
3194 |
+ cl_error_t status = CL_SUCCESS; |
|
3195 |
+ int32_t rv = 0; |
|
3196 |
+ unsigned int i = 0; |
|
3198 | 3197 |
uint32_t badobjects = 0; |
3199 | 3198 |
cli_ctx *ctx = pdf->ctx; |
3200 | 3199 |
|
... | ... |
@@ -3299,11 +3241,11 @@ done: |
3299 | 3299 |
|
3300 | 3300 |
/** |
3301 | 3301 |
* @brief Primary function for parsing and scanning a PDF. |
3302 |
- * |
|
3302 |
+ * |
|
3303 | 3303 |
* @param dir Filepath for temp file. |
3304 |
- * @param ctx clam scan context structure. |
|
3304 |
+ * @param ctx clam scan context structure. |
|
3305 | 3305 |
* @param offset offset of pdf in ctx->fmap |
3306 |
- * |
|
3306 |
+ * |
|
3307 | 3307 |
* @return int Returns cl_error_t status value. |
3308 | 3308 |
*/ |
3309 | 3309 |
int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
... | ... |
@@ -3505,7 +3447,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
3505 | 3505 |
} |
3506 | 3506 |
|
3507 | 3507 |
/* |
3508 |
- * Find and extract all objects in the PDF. |
|
3508 |
+ * Find and extract all objects in the PDF. |
|
3509 | 3509 |
* New experimental recursive methodology that adds objects from object streams. |
3510 | 3510 |
*/ |
3511 | 3511 |
objs_found = pdf.nobjs; |
... | ... |
@@ -3605,10 +3547,10 @@ done: |
3605 | 3605 |
|
3606 | 3606 |
/** |
3607 | 3607 |
* @brief Skip the rest of the current line, and find the start of the next line. |
3608 |
- * |
|
3608 |
+ * |
|
3609 | 3609 |
* @param ptr Current offset into buffer. |
3610 |
- * @param len Remaining bytes in buffer. |
|
3611 |
- * |
|
3610 |
+ * @param len Remaining bytes in buffer. |
|
3611 |
+ * |
|
3612 | 3612 |
* @return const char* Address of next line, or NULL if no next line in buffer. |
3613 | 3613 |
*/ |
3614 | 3614 |
static const char * |
... | ... |
@@ -3638,13 +3580,13 @@ pdf_nextlinestart(const char *ptr, size_t len) |
3638 | 3638 |
|
3639 | 3639 |
/** |
3640 | 3640 |
* @brief Return the start of the next PDF object. |
3641 |
- * |
|
3641 |
+ * |
|
3642 | 3642 |
* This assumes that we're not in a stream. |
3643 |
- * |
|
3643 |
+ * |
|
3644 | 3644 |
* @param ptr Current offset into buffer. |
3645 |
- * @param len Remaining bytes in buffer. |
|
3646 |
- * |
|
3647 |
- * @return const char* Address of next object in the buffer, or NULL if there is none in the buffer. |
|
3645 |
+ * @param len Remaining bytes in buffer. |
|
3646 |
+ * |
|
3647 |
+ * @return const char* Address of next object in the buffer, or NULL if there is none in the buffer. |
|
3648 | 3648 |
*/ |
3649 | 3649 |
static const char * |
3650 | 3650 |
pdf_nextobject(const char *ptr, size_t len) |
... | ... |
@@ -3987,7 +3929,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
3987 | 3987 |
pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
3988 | 3988 |
if (!(pdf->stats.author)) |
3989 | 3989 |
return; |
3990 |
- pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta)); |
|
3990 |
+ pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta)); |
|
3991 | 3991 |
} |
3992 | 3992 |
} |
3993 | 3993 |
#endif |
... | ... |
@@ -4012,7 +3954,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna |
4012 | 4012 |
pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4013 | 4013 |
if (!(pdf->stats.creator)) |
4014 | 4014 |
return; |
4015 |
- pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta)); |
|
4015 |
+ pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta)); |
|
4016 | 4016 |
} |
4017 | 4017 |
} |
4018 | 4018 |
#endif |
... | ... |
@@ -4037,7 +3979,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str |
4037 | 4037 |
pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4038 | 4038 |
if (!(pdf->stats.modificationdate)) |
4039 | 4039 |
return; |
4040 |
- pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); |
|
4040 |
+ pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); |
|
4041 | 4041 |
} |
4042 | 4042 |
} |
4043 | 4043 |
#endif |
... | ... |
@@ -4062,7 +4004,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct |
4062 | 4062 |
pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4063 | 4063 |
if (!(pdf->stats.creationdate)) |
4064 | 4064 |
return; |
4065 |
- pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); |
|
4065 |
+ pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); |
|
4066 | 4066 |
} |
4067 | 4067 |
} |
4068 | 4068 |
#endif |
... | ... |
@@ -4087,7 +4029,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn |
4087 | 4087 |
pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4088 | 4088 |
if (!(pdf->stats.producer)) |
4089 | 4089 |
return; |
4090 |
- pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta)); |
|
4090 |
+ pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta)); |
|
4091 | 4091 |
} |
4092 | 4092 |
} |
4093 | 4093 |
#endif |
... | ... |
@@ -4112,7 +4054,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4112 | 4112 |
pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4113 | 4113 |
if (!(pdf->stats.title)) |
4114 | 4114 |
return; |
4115 |
- pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta)); |
|
4115 |
+ pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta)); |
|
4116 | 4116 |
} |
4117 | 4117 |
} |
4118 | 4118 |
#endif |
... | ... |
@@ -4137,7 +4079,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn |
4137 | 4137 |
pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4138 | 4138 |
if (!(pdf->stats.keywords)) |
4139 | 4139 |
return; |
4140 |
- pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta)); |
|
4140 |
+ pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta)); |
|
4141 | 4141 |
} |
4142 | 4142 |
} |
4143 | 4143 |
#endif |
... | ... |
@@ -4162,7 +4104,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna |
4162 | 4162 |
pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4163 | 4163 |
if (!(pdf->stats.subject)) |
4164 | 4164 |
return; |
4165 |
- pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta)); |
|
4165 |
+ pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta)); |
|
4166 | 4166 |
} |
4167 | 4167 |
} |
4168 | 4168 |
#endif |
... | ... |
@@ -4214,7 +4156,6 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4214 | 4214 |
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4215 | 4215 |
: (const char *)(obj->start + pdf->map); |
4216 | 4216 |
const char *begin; |
4217 |
- unsigned int objsize; |
|
4218 | 4217 |
unsigned long npages = 0, count; |
4219 | 4218 |
long temp_long; |
4220 | 4219 |
struct pdf_array_node *node; |
... | ... |
@@ -4229,19 +4170,17 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4229 | 4229 |
if (!(SCAN_COLLECT_METADATA)) |
4230 | 4230 |
return; |
4231 | 4231 |
|
4232 |
- objsize = obj_size(pdf, obj, 1); |
|
4233 |
- |
|
4234 | 4232 |
pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4235 | 4233 |
if (!(pdfobj)) |
4236 | 4234 |
return; |
4237 | 4235 |
|
4238 |
- begin = cli_memstr(objstart, objsize, "/Kids", 5); |
|
4236 |
+ begin = cli_memstr(objstart, obj->size, "/Kids", 5); |
|
4239 | 4237 |
if (!(begin)) |
4240 | 4238 |
return; |
4241 | 4239 |
|
4242 | 4240 |
begin += 5; |
4243 | 4241 |
|
4244 |
- array = pdf_parse_array(pdf, obj, objsize, (char *)begin, NULL); |
|
4242 |
+ array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL); |
|
4245 | 4243 |
if (!(array)) { |
4246 | 4244 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4247 | 4245 |
return; |
... | ... |
@@ -4252,22 +4191,22 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4252 | 4252 |
if (strchr((char *)(node->data), 'R')) |
4253 | 4253 |
npages++; |
4254 | 4254 |
|
4255 |
- begin = cli_memstr(objstart, objsize, "/Count", 6); |
|
4255 |
+ begin = cli_memstr(objstart, obj->size, "/Count", 6); |
|
4256 | 4256 |
if (!(begin)) { |
4257 | 4257 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4258 | 4258 |
goto cleanup; |
4259 | 4259 |
} |
4260 | 4260 |
|
4261 | 4261 |
begin += 6; |
4262 |
- while (begin - objstart < objsize && isspace(begin[0])) |
|
4262 |
+ while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0])) |
|
4263 | 4263 |
begin++; |
4264 | 4264 |
|
4265 |
- if (begin - objstart >= objsize) { |
|
4265 |
+ if ((size_t)(begin - objstart) >= obj->size) { |
|
4266 | 4266 |
goto cleanup; |
4267 | 4267 |
} |
4268 | 4268 |
|
4269 |
- countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + objsize - begin) |
|
4270 |
- : (size_t)(obj->start + pdf->map + objsize - begin); |
|
4269 |
+ countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin) |
|
4270 |
+ : (size_t)(obj->start + pdf->map + obj->size - begin); |
|
4271 | 4271 |
|
4272 | 4272 |
if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) { |
4273 | 4273 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
... | ... |
@@ -4295,7 +4234,6 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
4295 | 4295 |
char *p1; |
4296 | 4296 |
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4297 | 4297 |
: (const char *)(obj->start + pdf->map); |
4298 |
- size_t objsize; |
|
4299 | 4298 |
|
4300 | 4299 |
UNUSEDPARAM(act); |
4301 | 4300 |
|
... | ... |
@@ -4305,25 +4243,23 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
4305 | 4305 |
if (!(SCAN_COLLECT_METADATA)) |
4306 | 4306 |
return; |
4307 | 4307 |
|
4308 |
- objsize = obj_size(pdf, obj, 1); |
|
4309 |
- |
|
4310 |
- p1 = (char *)cli_memstr(objstart, objsize, "/Colors", 7); |
|
4308 |
+ p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7); |
|
4311 | 4309 |
if (!(p1)) |
4312 | 4310 |
return; |
4313 | 4311 |
|
4314 | 4312 |
p1 += 7; |
4315 | 4313 |
|
4316 | 4314 |
/* Ensure that we have at least one whitespace character plus at least one number */ |
4317 |
- if (objsize - (p1 - objstart) < 2) |
|
4315 |
+ if (obj->size - (size_t)(p1 - objstart) < 2) |
|
4318 | 4316 |
return; |
4319 | 4317 |
|
4320 |
- while (p1 - objstart < objsize && isspace(p1[0])) |
|
4318 |
+ while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0])) |
|
4321 | 4319 |
p1++; |
4322 | 4320 |
|
4323 |
- if ((size_t)(p1 - objstart) == objsize) |
|
4321 |
+ if ((size_t)(p1 - objstart) == obj->size) |
|
4324 | 4322 |
return; |
4325 | 4323 |
|
4326 |
- if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - objsize), 0, 10, &temp_long)) { |
|
4324 |
+ if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) { |
|
4327 | 4325 |
return; |
4328 | 4326 |
} else if (temp_long < 0) { |
4329 | 4327 |
return; |
... | ... |
@@ -37,12 +37,14 @@ struct objstm_struct { |
37 | 37 |
|
38 | 38 |
struct pdf_obj { |
39 | 39 |
uint32_t start; |
40 |
- int32_t size; |
|
40 |
+ size_t size; |
|
41 | 41 |
uint32_t id; |
42 | 42 |
uint32_t flags; |
43 | 43 |
uint32_t statsflags; |
44 | 44 |
uint32_t numfilters; |
45 | 45 |
uint32_t filterlist[PDF_FILTERLIST_MAX]; |
46 |
+ const char *stream; // pointer to stream contained in object. |
|
47 |
+ size_t stream_size; // size of stream contained in object. |
|
46 | 48 |
struct objstm_struct *objstm; // Should be NULL unless the obj exists in an object stream (separate buffer) |
47 | 49 |
char *path; |
48 | 50 |
}; |
... | ... |
@@ -151,7 +153,7 @@ struct pdf_struct { |
151 | 151 |
const char *CF; |
152 | 152 |
long CF_n; |
153 | 153 |
const char *map; |
154 |
- off_t size; |
|
154 |
+ size_t size; |
|
155 | 155 |
off_t offset; |
156 | 156 |
off_t startoff; |
157 | 157 |
cli_ctx *ctx; |
... | ... |
@@ -403,10 +403,9 @@ char *cli_strtokbuf(const char *input, int fieldno, const char *delim, |
403 | 403 |
return output; |
404 | 404 |
} |
405 | 405 |
|
406 |
-const char *cli_memstr(const char *haystack, unsigned int hs, |
|
407 |
- const char *needle, unsigned int ns) |
|
406 |
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns) |
|
408 | 407 |
{ |
409 |
- unsigned int i, s1, s2; |
|
408 |
+ size_t i, s1, s2; |
|
410 | 409 |
|
411 | 410 |
if (!hs || !ns || hs < ns) |
412 | 411 |
return NULL; |
... | ... |
@@ -32,7 +32,7 @@ |
32 | 32 |
#include "clamav.h" |
33 | 33 |
#include "clamav.h" |
34 | 34 |
|
35 |
-#define SIZE_T_CHARLEN ((sizeof(size_t) * CHAR_BIT + 2) / 3 + 1) |
|
35 |
+#define SIZE_T_CHARLEN ( (sizeof(size_t) * CHAR_BIT + 2) / 3 + 1 ) |
|
36 | 36 |
|
37 | 37 |
#ifdef HAVE_STRCASESTR |
38 | 38 |
#define cli_strcasestr strcasestr |
... | ... |
@@ -61,25 +61,25 @@ int cli_chomp(char *string); |
61 | 61 |
char *cli_strtok(const char *line, int field, const char *delim); |
62 | 62 |
int cli_realhex2ui(const char *hex, uint16_t *ptr, unsigned int len); |
63 | 63 |
uint16_t *cli_hex2ui(const char *hex); |
64 |
-int cli_hex2str_to(const char *hex, char *ptr, size_t len); |
|
64 |
+int cli_hex2str_to(const char *hex, char *ptr, size_t len); |
|
65 | 65 |
char *cli_hex2str(const char *hex); |
66 | 66 |
int cli_hex2num(const char *hex); |
67 | 67 |
int cli_xtoi(const char *hex); |
68 | 68 |
char *cli_str2hex(const char *string, unsigned int len); |
69 | 69 |
char *cli_utf16toascii(const char *str, unsigned int length); |
70 | 70 |
char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output); |
71 |
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns); |
|
71 |
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns); |
|
72 | 72 |
char *cli_strrcpy(char *dest, const char *source); |
73 | 73 |
size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens); |
74 | 74 |
size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens, int token_skip); |
75 |
-long cli_strntol(const char *nptr, size_t n, char **endptr, register int base); |
|
76 |
-unsigned long cli_strntoul(const char *nptr, size_t n, char **endptr, register int base); |
|
75 |
+long cli_strntol(const char* nptr, size_t n, char** endptr, register int base); |
|
76 |
+unsigned long cli_strntoul(const char* nptr, size_t n, char** endptr, register int base); |
|
77 | 77 |
cl_error_t cli_strntol_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, long *result); |
78 | 78 |
cl_error_t cli_strntoul_wrap(const char *buf, size_t buf_size, int fail_at_nondigit, int base, unsigned long *result); |
79 | 79 |
int cli_isnumber(const char *str); |
80 | 80 |
char *cli_unescape(const char *str); |
81 | 81 |
struct text_buffer; |
82 |
-int cli_textbuffer_append_normalize(struct text_buffer *buf, const char *str, size_t len); |
|
82 |
+int cli_textbuffer_append_normalize(struct text_buffer *buf, const char *str, size_t len); |
|
83 | 83 |
int cli_hexnibbles(char *str, int len); |
84 | 84 |
|
85 | 85 |
typedef enum { |