... | ... |
@@ -134,14 +134,14 @@ static int xrefCheck(const char *xref, const char *eof) |
134 | 134 |
if (xref + 4 >= eof) |
135 | 135 |
return -1; |
136 | 136 |
|
137 |
- if (!memcmp(xref, "xref", 4)) { |
|
137 |
+ if (!memcmp(xref, "xref", strlen("xref"))) { |
|
138 | 138 |
cli_dbgmsg("cli_pdf: found xref\n"); |
139 | 139 |
return 0; |
140 | 140 |
} |
141 | 141 |
|
142 | 142 |
/* could be xref stream */ |
143 | 143 |
for (q=xref; q+5 < eof; q++) { |
144 |
- if (!memcmp(q,"/XRef",4)) { |
|
144 |
+ if (!memcmp(q,"/XRef", strlen("/XRef"))) { |
|
145 | 145 |
cli_dbgmsg("cli_pdf: found /XRef\n"); |
146 | 146 |
return 0; |
147 | 147 |
} |
... | ... |
@@ -163,10 +163,10 @@ static int xrefCheck(const char *xref, const char *eof) |
163 | 163 |
|
164 | 164 |
/** |
165 | 165 |
* @brief Searching BACKwards, find the next character that is not a whitespace. |
166 |
- * |
|
166 |
+ * |
|
167 | 167 |
* @param q Index to start from (at the end of the search space) |
168 |
- * @param start Beginning of the search space. |
|
169 |
- * |
|
168 |
+ * @param start Beginning of the search space. |
|
169 |
+ * |
|
170 | 170 |
* @return const char* Address of the final non-whitespace character OR the same address as the start. |
171 | 171 |
*/ |
172 | 172 |
static const char *findNextNonWSBack(const char *q, const char *start) |
... | ... |
@@ -179,10 +179,10 @@ static const char *findNextNonWSBack(const char *q, const char *start) |
179 | 179 |
|
180 | 180 |
/** |
181 | 181 |
* @brief Searching FORwards, find the next character that is not a whitespace. |
182 |
- * |
|
182 |
+ * |
|
183 | 183 |
* @param q Index to start from (at the end of the search space) |
184 |
- * @param start Beginning of the search space. |
|
185 |
- * |
|
184 |
+ * @param end End of the search space. |
|
185 |
+ * |
|
186 | 186 |
* @return const char* Address of the final non-whitespace character OR the same address as the start. |
187 | 187 |
*/ |
188 | 188 |
static const char *findNextNonWS(const char *q, const char *end) |
... | ... |
@@ -195,100 +195,116 @@ static const char *findNextNonWS(const char *q, const char *end) |
195 | 195 |
|
196 | 196 |
/** |
197 | 197 |
* @brief Find bounds of stream. |
198 |
- * |
|
198 |
+ * |
|
199 | 199 |
* PDF streams are prefixed with "stream" and suffixed with "endstream". |
200 | 200 |
* Return value indicates success or failure. |
201 |
- * |
|
201 |
+ * |
|
202 | 202 |
* @param start start address of search space. |
203 |
- * @param bytesleft size of search space for "stream" |
|
204 |
- * @param bytesleft2 size of search space for "endstream" |
|
203 |
+ * @param size size of search space |
|
205 | 204 |
* @param[out] stream output param, address of start of stream data |
206 |
- * @param[out] endstream output param, address of end of stream data |
|
205 |
+ * @param[out] stream_size output param, size of stream data |
|
207 | 206 |
* @param newline_hack hack to support newlines that are \r\n, and not just \n or just \r. |
208 |
- * |
|
209 |
- * @return int 1 if stream bounds were found. |
|
210 |
- * @return int 0 if stream bounds could not be found. |
|
207 |
+ * |
|
208 |
+ * @return cl_error_t CL_SUCCESS if stream bounds were found. |
|
209 |
+ * @return cl_error_t CL_BREAK if stream bounds could not be found. |
|
210 |
+ * @return cl_error_t CL_EFORMAT if stream start was found, but not end. (truncated) |
|
211 |
+ * @return cl_error_t CL_EARG if invalid args were provided. |
|
211 | 212 |
*/ |
212 |
-static int find_stream_bounds( |
|
213 |
- const char *start, |
|
214 |
- off_t bytesleft, |
|
215 |
- off_t bytesleft2, |
|
216 |
- off_t *stream, |
|
217 |
- off_t *endstream, |
|
213 |
+static cl_error_t find_stream_bounds( |
|
214 |
+ const char *start, |
|
215 |
+ size_t size, |
|
216 |
+ const char **stream, |
|
217 |
+ size_t *stream_size, |
|
218 | 218 |
int newline_hack) |
219 | 219 |
{ |
220 |
- const char *q2, *q; |
|
220 |
+ cl_error_t status = CL_BREAK; |
|
221 |
+ |
|
222 |
+ const char *idx; |
|
223 |
+ const char *stream_begin; |
|
224 |
+ const char *endstream_begin; |
|
225 |
+ size_t bytesleft = size; |
|
226 |
+ |
|
227 |
+ if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) { |
|
228 |
+ status = CL_EARG; |
|
229 |
+ return status; |
|
230 |
+ } |
|
231 |
+ |
|
232 |
+ *stream = NULL; |
|
233 |
+ *stream_size = 0; |
|
221 | 234 |
|
222 | 235 |
/* Begin by finding the "stream" string that prefixes stream data. */ |
223 |
- if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) { |
|
224 |
- q2 += 6; |
|
225 |
- bytesleft -= q2 - start; |
|
236 |
+ if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) { |
|
237 |
+ idx = stream_begin + strlen("stream"); |
|
238 |
+ bytesleft -= idx - start; |
|
226 | 239 |
if (bytesleft < 0) |
227 |
- return 0; |
|
240 |
+ goto done; |
|
228 | 241 |
|
229 | 242 |
/* Skip any new line characters. */ |
230 |
- if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') { |
|
231 |
- q2 += 2; |
|
232 |
- if (newline_hack && (bytesleft > 2) && q2[0] == '\xa') |
|
233 |
- q2++; |
|
234 |
- } else if (bytesleft && q2[0] == '\xa') { |
|
235 |
- q2++; |
|
243 |
+ if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') { |
|
244 |
+ idx += 2; |
|
245 |
+ if (newline_hack && (bytesleft > 2) && idx[0] == '\xa') |
|
246 |
+ idx++; |
|
247 |
+ } else if (bytesleft && idx[0] == '\xa') { |
|
248 |
+ idx++; |
|
236 | 249 |
} |
237 | 250 |
|
238 |
- *stream = q2 - start; |
|
251 |
+ /* Pass back start of the stream data. */ |
|
252 |
+ *stream = idx; |
|
239 | 253 |
|
240 |
- bytesleft2 -= q2 - start; |
|
241 |
- if (bytesleft2 <= 0) |
|
242 |
- return 0; |
|
254 |
+ bytesleft = size - (idx - start); |
|
255 |
+ if (bytesleft <= 0) |
|
256 |
+ goto done; |
|
243 | 257 |
|
244 |
- /* Now find the "endstream" string that suffixes stream data */ |
|
245 |
- q = q2; |
|
246 |
- q2 = cli_memstr(q, bytesleft2, "endstream", 9); |
|
247 |
- if (!q2) { |
|
248 |
- /* Couldn't find "endstream" */ |
|
249 |
- return 0; |
|
258 |
+ /* Now find the "endstream" string that suffixes stream data. */ |
|
259 |
+ endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream")); |
|
260 |
+ if (!endstream_begin) { |
|
261 |
+ /* Couldn't find "endstream", but that's ok -- |
|
262 |
+ * -- we'll just count the rest of the provided buffer. */ |
|
263 |
+ cli_dbgmsg("find_stream_bounds: Truncated stream found!\n"); |
|
264 |
+ endstream_begin = start + size; |
|
265 |
+ status = CL_EFORMAT; |
|
250 | 266 |
} |
251 | 267 |
|
252 |
- *endstream = q2 - start; |
|
253 |
- |
|
254 |
- /* Double-check that endstream >= stream */ |
|
255 |
- if (*endstream < *stream) |
|
256 |
- *endstream = *stream; |
|
268 |
+ /* Pass back the size of the stream data: from its first byte up to "endstream" (or to the end of the buffer if truncated). */ |
|
269 |
+ *stream_size = endstream_begin - *stream; |
|
257 | 270 |
|
258 |
- return 1; |
|
271 |
+ if (CL_EFORMAT != status) |
|
272 |
+ status = CL_SUCCESS; |
|
259 | 273 |
} |
260 | 274 |
|
261 |
- return 0; |
|
275 |
+done: |
|
276 |
+ |
|
277 |
+ return status; |
|
262 | 278 |
} |
263 | 279 |
|
264 | 280 |
/** |
265 |
- * @brief Find the next *indirect* object in an object stream, adds it to our list of |
|
281 |
+ * @brief Find the next *indirect* object in an object stream, adds it to our list of |
|
266 | 282 |
* objects, and increments nobj. |
267 |
- * |
|
283 |
+ * |
|
268 | 284 |
* Indirect objects in a stream DON'T begin with "obj" and end with "endobj". |
269 | 285 |
* Instead, they have an obj ID and an offset from the first object to point you |
270 | 286 |
* right at them. |
271 |
- * |
|
287 |
+ * |
|
272 | 288 |
* If found, objstm->current will be updated to the next obj id. |
273 |
- * |
|
274 |
- * All objects in an object stream are indirect and thus do not begin or start |
|
275 |
- * with "obj" or "endobj". Instead, the object stream takes the following |
|
289 |
+ * |
|
290 |
+ * All objects in an object stream are indirect and thus do not begin or start |
|
291 |
+ * with "obj" or "endobj". Instead, the object stream takes the following |
|
276 | 292 |
* format. |
277 |
- * |
|
293 |
+ * |
|
278 | 294 |
* <dictionary describing stream> objstm content endobjstm |
279 |
- * |
|
295 |
+ * |
|
280 | 296 |
* where content looks something like the following: |
281 |
- * |
|
297 |
+ * |
|
282 | 298 |
* 15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>> |
283 |
- * |
|
284 |
- * In the above example, the literal string (ab) is indirect object # 15, and |
|
285 |
- * begins at offset 0 of the set of objects. The next object, # 16 begis at |
|
286 |
- * offset 3 is a dictionary. The final object is also a dictionary, beginning |
|
299 |
+ * |
|
300 |
+ * In the above example, the literal string (ab) is indirect object # 15, and |
|
301 |
+ * begins at offset 0 of the set of objects. The next object, # 16 begins at |
|
302 |
+ * offset 3 and is a dictionary. The final object is also a dictionary, beginning |
|
287 | 303 |
* at offset 46. |
288 |
- * |
|
289 |
- * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
304 |
+ * |
|
305 |
+ * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
290 | 306 |
* @param objstm |
291 |
- * |
|
307 |
+ * |
|
292 | 308 |
* @return CL_SUCCESS if success |
293 | 309 |
* @return CL_EPARSE if parsing error |
294 | 310 |
* @return CL_EMEM if error allocating memory |
... | ... |
@@ -298,7 +314,7 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
298 | 298 |
{ |
299 | 299 |
cl_error_t status = CL_EPARSE; |
300 | 300 |
struct pdf_obj *obj = NULL; |
301 |
- unsigned long objid = 0, objsize = 0, objoff = 0; |
|
301 |
+ unsigned long objid = 0, objoff = 0; |
|
302 | 302 |
long temp_long = 0; |
303 | 303 |
const char *index = NULL; |
304 | 304 |
size_t bytes_remaining = 0; |
... | ... |
@@ -383,10 +399,10 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
383 | 383 |
{ |
384 | 384 |
unsigned long next_objid = 0, next_objoff = 0; |
385 | 385 |
|
386 |
- /* |
|
387 |
- * While we're at it, |
|
386 |
+ /* |
|
387 |
+ * While we're at it, |
|
388 | 388 |
* lets record the size as running up to the next object offset. |
389 |
- * |
|
389 |
+ * |
|
390 | 390 |
* To do so, we will need to parse the next obj pair. |
391 | 391 |
*/ |
392 | 392 |
/* objstm->current_pair points directly to the obj id */ |
... | ... |
@@ -439,14 +455,14 @@ int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, |
439 | 439 |
} |
440 | 440 |
|
441 | 441 |
obj->size = next_objoff - objoff; |
442 |
- } |
|
443 |
- else |
|
442 |
+ } |
|
443 |
+ else |
|
444 | 444 |
{ |
445 | 445 |
/* |
446 | 446 |
* Should be no more objects. We should verify. |
447 |
- * |
|
447 |
+ * |
|
448 | 448 |
* Either way... |
449 |
- * obj->size should be the rest of the buffer. |
|
449 |
+ * obj->size should be the rest of the buffer. |
|
450 | 450 |
*/ |
451 | 451 |
if (objstm->nobjs_found < objstm->n) { |
452 | 452 |
cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n"); |
... | ... |
@@ -480,17 +496,25 @@ done: |
480 | 480 |
|
481 | 481 |
/** |
482 | 482 |
* @brief Find the next *indirect* object. |
483 |
- * |
|
484 |
- * Indirect objects begin with "obj" and end with "endobj". |
|
485 |
- * Identify objects that contain streams. |
|
486 |
- * Identify truncated objects. |
|
487 |
- * |
|
483 |
+ * |
|
484 |
+ * Indirect objects located outside of an object stream are prefaced with: |
|
485 |
+ * <objid> <genid> obj |
|
486 |
+ * |
|
487 |
+ * Each of the above are separated by whitespace of some sort. |
|
488 |
+ * |
|
489 |
+ * Indirect objects are suffixed with: |
|
490 |
+ * endobj |
|
491 |
+ * |
|
492 |
+ * The specification does not say if whitespace is required before or after "endobj". |
|
493 |
+ * |
|
494 |
+ * Identify truncated objects. |
|
495 |
+ * |
|
488 | 496 |
* If found, pdf->offset will be updated to just after the "endobj". |
489 | 497 |
* If truncated, pdf->offset will == pdf->size. |
490 | 498 |
* If not found, pdf->offset will not be updated. |
491 |
- * |
|
492 |
- * @param pdf Pdf context struct that keeps track of all information found in the PDF. |
|
493 |
- * |
|
499 |
+ * |
|
500 |
+ * @param pdf Pdf context struct that keeps track of all information found in the PDF. |
|
501 |
+ * |
|
494 | 502 |
* @return CL_SUCCESS if success |
495 | 503 |
* @return CL_BREAK if no more objects |
496 | 504 |
* @return CL_EPARSE if parsing error |
... | ... |
@@ -499,9 +523,13 @@ done: |
499 | 499 |
cl_error_t pdf_findobj(struct pdf_struct *pdf) |
500 | 500 |
{ |
501 | 501 |
cl_error_t status = CL_EPARSE; |
502 |
- const char *start, *q, *q2, *q3, *eof; |
|
502 |
+ const char *start, *idx, *genid_search_index, *objid_search_index; |
|
503 |
+ |
|
504 |
+ const char *obj_begin = NULL, *obj_end = NULL; |
|
505 |
+ const char *endobj_begin = NULL, *endobj_end = NULL; |
|
506 |
+ |
|
503 | 507 |
struct pdf_obj *obj = NULL; |
504 |
- off_t bytesleft; |
|
508 |
+ size_t bytesleft; |
|
505 | 509 |
unsigned long genid, objid; |
506 | 510 |
long temp_long; |
507 | 511 |
|
... | ... |
@@ -524,101 +552,111 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf) |
524 | 524 |
start = pdf->map + pdf->offset; |
525 | 525 |
bytesleft = pdf->size - pdf->offset; |
526 | 526 |
|
527 |
- /* Indirect objects located outside of an object stream are prefaced with "obj" |
|
528 |
- * and suffixed with "endobj". Find the "obj" preface. */ |
|
529 |
- while (bytesleft > 0) |
|
530 |
- { |
|
531 |
- q2 = cli_memstr(start, bytesleft, "obj", 3); |
|
532 |
- if (!q2) { |
|
533 |
- status = CL_BREAK; /* no more objs */ |
|
534 |
- goto done; |
|
527 |
+ /* |
|
528 |
+ * Start by searching for "obj" |
|
529 |
+ */ |
|
530 |
+ idx = start + 1; |
|
531 |
+ while (bytesleft > 1 + strlen("obj")) { |
|
532 |
+ /* `- 1` accounts for size of white space before obj */ |
|
533 |
+ idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj")); |
|
534 |
+ if (NULL == idx) { |
|
535 |
+ status = CL_BREAK; |
|
536 |
+ goto done; /* No more objs. */ |
|
535 | 537 |
} |
536 | 538 |
|
537 |
- /* verify that "obj" has a whitespace before it, and is not the end of |
|
538 |
- * a previous string like... "globj" */ |
|
539 |
- q2--; |
|
540 |
- bytesleft -= q2 - start; |
|
539 |
+ /* verify that the word has a whitespace before it, and is not the end of |
|
540 |
+ * a previous word */ |
|
541 |
+ idx--; |
|
542 |
+ bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start); |
|
541 | 543 |
|
542 |
- if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) { |
|
543 |
- /* This instance of the "obj" string appears to be part of another string. |
|
544 |
+ if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) { |
|
545 |
+ /* This instance of "obj" appears to be part of a longer string. |
|
544 | 546 |
* Skip it, and keep searching for an object. */ |
545 |
- start = q2+4; |
|
546 |
- bytesleft -= 4; |
|
547 |
+ idx += 1 + strlen("obj"); |
|
548 |
+ bytesleft -= 1 + strlen("obj"); |
|
547 | 549 |
continue; |
548 | 550 |
} |
549 | 551 |
|
550 |
- break; /* Found it. q2 should point to the whitespace before the "obj" string */ |
|
551 |
- } |
|
552 |
+ /* Found the beginning of the word */ |
|
553 |
+ obj_begin = idx; |
|
554 |
+ obj_end = idx + 1 + strlen("obj"); |
|
552 | 555 |
|
553 |
- if (bytesleft <= 0) { |
|
554 |
- status = CL_BREAK; /* No "obj" found. */ |
|
555 |
- goto done; |
|
556 |
+ break; |
|
556 | 557 |
} |
557 | 558 |
|
558 |
- /* "obj" found! */ |
|
559 |
+ if ((NULL == obj_begin) || (NULL == obj_end)) { |
|
560 |
+ status = CL_BREAK; |
|
561 |
+ goto done; /* No more objs. */ |
|
562 |
+ } |
|
559 | 563 |
|
560 | 564 |
/* Find the generation id (genid) that appears before the "obj" */ |
561 |
- q = findNextNonWSBack(q2-1, start); |
|
562 |
- while (q > start && isdigit(*q)) |
|
563 |
- q--; |
|
565 |
+ genid_search_index = findNextNonWSBack(obj_begin - 1, start); |
|
566 |
+ while (genid_search_index > start && isdigit(*genid_search_index)) |
|
567 |
+ genid_search_index--; |
|
564 | 568 |
|
565 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
569 |
+ if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin) - genid_search_index), 0, 10, &temp_long)) { |
|
566 | 570 |
cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs); |
567 | 571 |
/* Failed to parse, probably not a real object. Skip past the "obj" thing, and continue. */ |
568 |
- pdf->offset = q2 + 4 - pdf->map; |
|
572 |
+ pdf->offset = obj_end - pdf->map; |
|
569 | 573 |
status = CL_EPARSE; |
570 | 574 |
goto done; |
571 | 575 |
} else if (temp_long < 0) { |
572 | 576 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long); |
573 |
- pdf->offset = q2 + 4 - pdf->map; |
|
577 |
+ pdf->offset = obj_end - pdf->map; |
|
574 | 578 |
status = CL_EPARSE; |
575 | 579 |
goto done; |
576 | 580 |
} |
577 | 581 |
genid = (unsigned long)temp_long; |
578 | 582 |
|
579 |
- /* Find the object id (objid) that appers before the genid */ |
|
580 |
- q = findNextNonWSBack(q-1,start); |
|
581 |
- while (q > start && isdigit(*q)) |
|
582 |
- q--; |
|
583 |
+ /* Find the object id (objid) that appears before the genid */ |
|
584 |
+ objid_search_index = findNextNonWSBack(genid_search_index - 1, start); |
|
585 |
+ while (objid_search_index > start && isdigit(*objid_search_index)) |
|
586 |
+ objid_search_index--; |
|
583 | 587 |
|
584 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
588 |
+ if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index) - objid_search_index), 0, 10, &temp_long)) { |
|
585 | 589 |
/* |
586 |
- * PDFs with multiple revisions will have %%EOF before the end of the file, |
|
587 |
- * followed by the next revision of the PDF. If this is the case, we can |
|
588 |
- * detect it and continue parsing after the %%EOF. |
|
590 |
+ * Edge case: |
|
591 |
+ * |
|
592 |
+ * PDFs with multiple revisions will have %%EOF before the end of the file, |
|
593 |
+ * followed by the next revision of the PDF, which will probably be an immediate objid. |
|
594 |
+ * |
|
595 |
+ * Example: |
|
596 |
+ * %%EOF1 1 obj <blah> endobj |
|
597 |
+ * |
|
598 |
+ * If this is the case, we can detect it and continue parsing after the %%EOF. |
|
589 | 599 |
*/ |
590 |
- if (q - 4 > start) { |
|
591 |
- const char* lastfile = q - 4; |
|
600 |
+ if (objid_search_index - strlen("\%\%EO") > start) { |
|
601 |
+ const char* lastfile = objid_search_index - strlen("\%\%EO"); |
|
592 | 602 |
if (0 != strncmp(lastfile, "\%\%EOF", 5)) { |
593 | 603 |
/* Nope, wasn't %%EOF */ |
594 | 604 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
595 | 605 |
/* Skip past the "obj" thing, and continue. */ |
596 |
- pdf->offset = q2 + 4 - pdf->map; |
|
597 |
- status = CL_EPARSE; |
|
606 |
+ pdf->offset = obj_end - pdf->map; |
|
607 |
+ status = CL_EPARSE; |
|
598 | 608 |
goto done; |
599 | 609 |
} |
600 |
- /* Yup, Looks, like the file continues after %%EOF. |
|
610 |
+ /* Yup, looks like the file continues after %%EOF. |
|
601 | 611 |
* Probably another revision. Keep parsing... */ |
602 |
- q++; |
|
603 |
- cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at %zu\n", (size_t)q); |
|
612 |
+ objid_search_index++; |
|
613 |
+ cli_dbgmsg("pdf_findobj: \%\%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map)); |
|
604 | 614 |
} else { |
605 | 615 |
/* Failed parsing at the very beginning */ |
606 | 616 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
607 | 617 |
/* Probably not a real object. Skip past the "obj" thing, and continue. */ |
608 |
- pdf->offset = q2 + 4 - pdf->map; |
|
609 |
- status = CL_EPARSE; |
|
618 |
+ pdf->offset = obj_end - pdf->map; |
|
619 |
+ status = CL_EPARSE; |
|
610 | 620 |
goto done; |
611 | 621 |
} |
612 | 622 |
/* Try again, with offset slightly adjusted */ |
613 |
- if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)(bytesleft + (q2 - q)), 0, 10, &temp_long)) { |
|
623 |
+ if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) { |
|
614 | 624 |
cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
615 | 625 |
/* Still failed... Probably not a real object. Skip past the "obj" thing, and continue. */ |
616 |
- pdf->offset = q2 + 4 - pdf->map; |
|
617 |
- status = CL_EPARSE; |
|
626 |
+ pdf->offset = obj_end - pdf->map; |
|
627 |
+ status = CL_EPARSE; |
|
618 | 628 |
goto done; |
619 | 629 |
} else if (temp_long < 0) { |
620 | 630 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
621 |
- pdf->offset = q2 + 4 - pdf->map; |
|
631 |
+ pdf->offset = obj_end - pdf->map; |
|
622 | 632 |
status = CL_EPARSE; |
623 | 633 |
goto done; |
624 | 634 |
} |
... | ... |
@@ -626,85 +664,54 @@ cl_error_t pdf_findobj(struct pdf_struct *pdf) |
626 | 626 |
cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n"); |
627 | 627 |
} else if (temp_long < 0) { |
628 | 628 |
cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
629 |
- pdf->offset = q2 + 4 - pdf->map; |
|
629 |
+ pdf->offset = obj_end - pdf->map; |
|
630 | 630 |
status = CL_EPARSE; |
631 | 631 |
goto done; |
632 | 632 |
} |
633 | 633 |
objid = (unsigned long)temp_long; |
634 | 634 |
|
635 |
- /* |
|
636 |
- * Ok so we have the objid, genid, and "obj" string. |
|
637 |
- * Time to store that information and then ... |
|
638 |
- * ... investigate what kind of object this is. |
|
639 |
- */ |
|
640 | 635 |
obj->id = (objid << 8) | (genid & 0xff); |
641 |
- obj->start = q2+4 - pdf->map; /* obj start begins just after the "obj" string */ |
|
636 |
+ obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */ |
|
642 | 637 |
obj->flags = 0; |
643 | 638 |
|
644 |
- bytesleft -= 4; |
|
645 |
- eof = pdf->map + pdf->size; |
|
646 |
- q = pdf->map + obj->start; |
|
647 |
- |
|
648 |
- while (q < eof && bytesleft > 0) |
|
649 |
- { |
|
650 |
- off_t p_stream, p_endstream; |
|
651 |
- q2 = pdf_nextobject(q, bytesleft); |
|
652 |
- if (!q2) |
|
653 |
- q2 = pdf->map + pdf->size; /* No interesting objects found, fast-forward to eof */ |
|
654 |
- |
|
655 |
- bytesleft -= q2 - q; |
|
656 |
- if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream, 1)) { |
|
657 |
- /* |
|
658 |
- * Found obj that contains a stream. |
|
659 |
- */ |
|
660 |
- obj->flags |= 1 << OBJ_STREAM; |
|
661 |
- q2 = q-1 + p_endstream + 9; |
|
662 |
- bytesleft -= q2 - q + 1; |
|
663 |
- |
|
664 |
- if (bytesleft < 0) { |
|
665 |
- /* ... and the stream is truncated. Hmm... */ |
|
666 |
- obj->flags |= 1 << OBJ_TRUNCATED; |
|
667 |
- pdf->offset = pdf->size; |
|
668 |
- |
|
669 |
- status = CL_SUCCESS; |
|
670 |
- goto done; /* Truncated file, no end to obj/stream. |
|
671 |
- * The next call to pdf_findobj() will return no more objects. */ |
|
672 |
- } |
|
673 |
- } else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) { |
|
674 |
- /* |
|
675 |
- * obj found and offset positioned. ideal return case |
|
676 |
- */ |
|
677 |
- q2 = q3 + 6; |
|
678 |
- pdf->offset = q2 - pdf->map; /* update the offset to just after the endobj */ |
|
679 |
- |
|
680 |
- status = CL_SUCCESS; |
|
681 |
- goto done; |
|
682 |
- } else { |
|
683 |
- q2++; |
|
684 |
- bytesleft--; |
|
685 |
- } |
|
686 |
- |
|
687 |
- q = q2; |
|
639 |
+ /* |
|
640 |
+ * We now have the objid, genid, and object start. |
|
641 |
+ * Find the object end ("endobj"). |
|
642 |
+ */ |
|
643 |
+ /* Search from just past "obj" through the end of the PDF buffer (pdf->map + pdf->size). */ |
|
644 |
+ endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj")); |
|
645 |
+ if (NULL == endobj_begin) { |
|
646 |
+ /* No end to object. |
|
647 |
+ * PDF appears to be malformed or truncated. |
|
648 |
+ * Will record the object size as going to the end of the file. |
|
649 |
+ * Will record that the object is truncated. |
|
650 |
+ * Will position the pdf offset to the end of the PDF. |
|
651 |
+ * The next iteration of this function will find no more objects. */ |
|
652 |
+ obj->flags |= 1 << OBJ_TRUNCATED; |
|
653 |
+ obj->size = (pdf->map + pdf->size) - obj_end; |
|
654 |
+ pdf->offset = pdf->size; |
|
655 |
+ |
|
656 |
+ /* Truncated "object" found! */ |
|
657 |
+ status = CL_SUCCESS; |
|
658 |
+ goto done; |
|
688 | 659 |
} |
660 |
+ endobj_end = endobj_begin + strlen("endobj"); |
|
689 | 661 |
|
690 |
- obj->flags |= 1 << OBJ_TRUNCATED; |
|
691 |
- pdf->offset = pdf->size; |
|
662 |
+ /* Size of the object spans from just after "obj" to the start of "endobj". */ |
|
663 |
+ obj->size = endobj_begin - obj_end; |
|
664 |
+ pdf->offset = endobj_end - pdf->map; |
|
692 | 665 |
|
666 |
+ /* |
|
667 |
+ * Object found! |
|
668 |
+ */ |
|
693 | 669 |
status = CL_SUCCESS; /* truncated file, no end to obj. */ |
694 | 670 |
|
695 | 671 |
done: |
696 | 672 |
if (status == CL_SUCCESS) { |
697 |
- cli_dbgmsg("pdf_findobj: found %d %d obj @%lld\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + pdf->startoff)); |
|
673 |
+ cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id&0xff, (long long)(obj->start + pdf->startoff), obj->size); |
|
698 | 674 |
} |
699 | 675 |
else |
700 | 676 |
{ |
701 |
- if(status == CL_BREAK) { |
|
702 |
- cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs); |
|
703 |
- } else if(status == CL_EMEM) { |
|
704 |
- cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs); |
|
705 |
- } else { |
|
706 |
- cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status); |
|
707 |
- } |
|
708 | 677 |
/* Remove the unused obj reference from our list of objects found */ |
709 | 678 |
/* No need to realloc pdf->objs back down. It won't leak. */ |
710 | 679 |
pdf->objs[pdf->nobjs-1] = NULL; |
... | ... |
@@ -713,9 +720,17 @@ done: |
713 | 713 |
/* Free up the obj struct. */ |
714 | 714 |
if (NULL != obj) |
715 | 715 |
free(obj); |
716 |
+ |
|
717 |
+ if(status == CL_BREAK) { |
|
718 |
+ cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs); |
|
719 |
+ } else if(status == CL_EMEM) { |
|
720 |
+ cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs); |
|
721 |
+ } else { |
|
722 |
+ cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status); |
|
723 |
+ } |
|
716 | 724 |
} |
717 | 725 |
|
718 |
- return status; |
|
726 |
+ return status; |
|
719 | 727 |
} |
720 | 728 |
|
721 | 729 |
static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum) |
... | ... |
@@ -836,14 +851,14 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
836 | 836 |
|
837 | 837 |
/** |
838 | 838 |
* @brief Find and interpret the "/Length" dictionary key value. |
839 |
- * |
|
839 |
+ * |
|
840 | 840 |
* The value may be: |
841 |
- * - a direct object (i.e. just a number) |
|
841 |
+ * - a direct object (i.e. just a number) |
|
842 | 842 |
* - an indirect object, where the value is somewhere else in the document and we have to look it up. |
843 | 843 |
* indirect objects are referenced using an object id (objid), generation id (genid), and the letter 'R'. |
844 |
- * |
|
844 |
+ * |
|
845 | 845 |
* Example dictionary with a single key "/Length" that relies on a direct object for the value. |
846 |
- * |
|
846 |
+ * |
|
847 | 847 |
* 1 0 obj |
848 | 848 |
* << /Length 534 |
849 | 849 |
* /Filter [ /ASCII85Decode /LZWDecode ] |
... | ... |
@@ -857,9 +872,9 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
857 | 857 |
* JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~> |
858 | 858 |
* endstream |
859 | 859 |
* endobj |
860 |
- * |
|
860 |
+ * |
|
861 | 861 |
* Example dictionary with a single key "/Length" that relies on an indirect object for the value. |
862 |
- * |
|
862 |
+ * |
|
863 | 863 |
* 7 0 obj |
864 | 864 |
* << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0. |
865 | 865 |
* stream |
... | ... |
@@ -870,11 +885,11 @@ struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t o |
870 | 870 |
* ET |
871 | 871 |
* endstream |
872 | 872 |
* endobj |
873 |
- * |
|
873 |
+ * |
|
874 | 874 |
* 8 0 obj |
875 | 875 |
* 77 % The length of the preceding stream |
876 | 876 |
* endobj |
877 |
- * |
|
877 |
+ * |
|
878 | 878 |
* @param pdf Pdf context structure. |
879 | 879 |
* @param obj Pdf object context structure. |
880 | 880 |
* @param start Pointer start of the dictionary string. |
... | ... |
@@ -914,12 +929,12 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
914 | 914 |
if (!obj_start) |
915 | 915 |
return 0; |
916 | 916 |
|
917 |
- if (bytes_remaining < obj_start - index) { |
|
917 |
+ if (bytes_remaining < (size_t)(obj_start - index)) { |
|
918 | 918 |
return 0; |
919 | 919 |
} |
920 | 920 |
bytes_remaining -= obj_start - index; |
921 | 921 |
index = obj_start; |
922 |
- |
|
922 |
+ |
|
923 | 923 |
/* Read the value. This could either be the direct length value, |
924 | 924 |
or the object id of the indirect object that has the length */ |
925 | 925 |
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
... | ... |
@@ -931,10 +946,10 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
931 | 931 |
} |
932 | 932 |
length = (size_t)temp_long; /* length or maybe object id */ |
933 | 933 |
|
934 |
- /* |
|
935 |
- * Keep parsing, skipping past the first integer that might have been what we wanted. |
|
936 |
- * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' |
|
937 |
- * I.e. something like " 0 R" |
|
934 |
+ /* |
|
935 |
+ * Keep parsing, skipping past the first integer that might have been what we wanted. |
|
936 |
+ * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' |
|
937 |
+ * I.e. something like " 0 R" |
|
938 | 938 |
*/ |
939 | 939 |
while ((bytes_remaining > 0) && isdigit(*index)) { |
940 | 940 |
index++; |
... | ... |
@@ -966,14 +981,14 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
966 | 966 |
} |
967 | 967 |
|
968 | 968 |
if (index[0] == ' ' && index[1] == 'R') { |
969 |
- /* |
|
970 |
- * Ok so we found a genid and that 'R'. Which means that first value |
|
969 |
+ /* |
|
970 |
+ * Ok so we found a genid and that 'R'. Which means that first value |
|
971 | 971 |
* was actually the objid. |
972 | 972 |
* We can look up the indirect object using this information. |
973 | 973 |
*/ |
974 | 974 |
unsigned long objid = length; |
975 | 975 |
const char* indirect_obj_start = NULL; |
976 |
- |
|
976 |
+ |
|
977 | 977 |
cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid); |
978 | 978 |
|
979 | 979 |
obj = find_obj(pdf, obj, (length << 8) | (genid&0xff)); |
... | ... |
@@ -984,15 +999,15 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
984 | 984 |
|
985 | 985 |
indirect_obj_start = pdf->map + obj->start; |
986 | 986 |
bytes_remaining = pdf->size - obj->start; |
987 |
- |
|
987 |
+ |
|
988 | 988 |
/* Ok so we found the indirect object, lets read the value. */ |
989 | 989 |
index = pdf_nextobject(indirect_obj_start, bytes_remaining); |
990 | 990 |
if (!index) { |
991 | 991 |
cli_dbgmsg("find_length: next object not found\n"); |
992 | 992 |
return 0; |
993 | 993 |
} |
994 |
- |
|
995 |
- if (bytes_remaining < index - indirect_obj_start) { |
|
994 |
+ |
|
995 |
+ if (bytes_remaining < (size_t)(index - indirect_obj_start)) { |
|
996 | 996 |
return 0; |
997 | 997 |
} |
998 | 998 |
bytes_remaining -= index - indirect_obj_start; |
... | ... |
@@ -1010,7 +1025,7 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
1010 | 1010 |
} |
1011 | 1011 |
|
1012 | 1012 |
/* limit length */ |
1013 |
- if (obj_start - pdf->map + length + 5 > pdf->size) |
|
1013 |
+ if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size) |
|
1014 | 1014 |
length = pdf->size - (obj_start - pdf->map) - 5; |
1015 | 1015 |
|
1016 | 1016 |
return length; |
... | ... |
@@ -1018,102 +1033,6 @@ static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const cha |
1018 | 1018 |
|
1019 | 1019 |
#define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION)) |
1020 | 1020 |
|
1021 |
-static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary) |
|
1022 |
-{ |
|
1023 |
- if (0 == obj->size) |
|
1024 |
- { |
|
1025 |
- /* |
|
1026 |
- * Programmatically determine size if not already known. |
|
1027 |
- */ |
|
1028 |
- unsigned i = 0; |
|
1029 |
- |
|
1030 |
- /* Find the index of the current object */ |
|
1031 |
- for (i = 0; i < pdf->nobjs; i++) { |
|
1032 |
- if (pdf->objs[i] == obj) |
|
1033 |
- break; |
|
1034 |
- } |
|
1035 |
- |
|
1036 |
- /* Find the next object that exists in the same buffer (pdf fmap, or object stream) */ |
|
1037 |
- if (i < pdf->nobjs) { |
|
1038 |
- i++; |
|
1039 |
- } |
|
1040 |
- |
|
1041 |
- if (obj->objstm == NULL) { |
|
1042 |
- /* Current object isn't in an object stream, we want to find |
|
1043 |
- * the next object that also isn't in an object stream. */ |
|
1044 |
- for ( ; i < pdf->nobjs; i++) { |
|
1045 |
- if (pdf->objs[i]->objstm == NULL) |
|
1046 |
- break; |
|
1047 |
- } |
|
1048 |
- } else { |
|
1049 |
- /* Current object is in an object stream, we want to find |
|
1050 |
- * the next object that is in the same object stream. |
|
1051 |
- * |
|
1052 |
- * This really shouldn't happen, so throw a warning and |
|
1053 |
- * then see if we can solve it anyhow */ |
|
1054 |
- cli_warnmsg("obj_size: Encountered pdf object in an object stream that has an unknown size!!\n"); |
|
1055 |
- |
|
1056 |
- for ( ; i < pdf->nobjs; i++) { |
|
1057 |
- if (pdf->objs[i]->objstm == obj->objstm) |
|
1058 |
- break; |
|
1059 |
- } |
|
1060 |
- } |
|
1061 |
- |
|
1062 |
- /* Step backwards from the "next" object to find the end of the current object */ |
|
1063 |
- if (i < pdf->nobjs) { |
|
1064 |
- int s = pdf->objs[i]->start - obj->start - 4; |
|
1065 |
- if (s > 0) { |
|
1066 |
- if (!binary) { |
|
1067 |
- const char *p = NULL; |
|
1068 |
- const char *q = NULL; |
|
1069 |
- |
|
1070 |
- if (obj->objstm == NULL) { |
|
1071 |
- p = pdf->map + obj->start; |
|
1072 |
- } else { |
|
1073 |
- p = obj->objstm->streambuf + obj->start; |
|
1074 |
- } |
|
1075 |
- q = p + s; |
|
1076 |
- |
|
1077 |
- while (q > p && (isspace(*q) || isdigit(*q))) |
|
1078 |
- q--; |
|
1079 |
- |
|
1080 |
- if (q > p+5 && !memcmp(q-5,"endobj",6)) |
|
1081 |
- q -= 6; |
|
1082 |
- |
|
1083 |
- q = findNextNonWSBack(q, p); |
|
1084 |
- q++; |
|
1085 |
- |
|
1086 |
- obj->size = q - p; |
|
1087 |
- goto done; |
|
1088 |
- } |
|
1089 |
- |
|
1090 |
- obj->size = s; |
|
1091 |
- goto done; |
|
1092 |
- } |
|
1093 |
- } |
|
1094 |
- |
|
1095 |
- /* If we've gotten this far, we didn't find a "next" object... so our |
|
1096 |
- * current object must be at the end of the pdf fmap or the end of the |
|
1097 |
- * object stream. */ |
|
1098 |
- if (obj->objstm == NULL) { |
|
1099 |
- /* Current object isn't in an object stream, so we can determine object |
|
1100 |
- * size based on the remaining size of the file (in theory). */ |
|
1101 |
- if (binary) |
|
1102 |
- obj->size = pdf->size - obj->start; |
|
1103 |
- else |
|
1104 |
- obj->size = pdf->offset - obj->start - 6; /* This hack I think assumes that we reached the end of the file when finding objects. */ |
|
1105 |
- } else { |
|
1106 |
- /* Current object is in an object stream, we want to find |
|
1107 |
- * the next object that is in the same object stream. */ |
|
1108 |
- obj->size = obj->objstm->streambuf_len - obj->start; |
|
1109 |
- } |
|
1110 |
- } |
|
1111 |
- |
|
1112 |
-done: |
|
1113 |
- |
|
1114 |
- return obj->size; |
|
1115 |
-} |
|
1116 |
- |
|
1117 | 1021 |
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid) |
1118 | 1022 |
{ |
1119 | 1023 |
int ret; |
... | ... |
@@ -1482,330 +1401,328 @@ int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) |
1482 | 1482 |
if (!(flags & PDF_EXTRACT_OBJ_SCAN)) |
1483 | 1483 |
obj->path = strdup(fullname); |
1484 | 1484 |
|
1485 |
- do { |
|
1486 |
- if (obj->flags & (1 << OBJ_STREAM)) { |
|
1487 |
- const char *start = pdf->map + obj->start; |
|
1488 |
- off_t p_stream = 0, p_endstream = 0; |
|
1489 |
- off_t length; |
|
1485 |
+ if ((NULL == obj->objstm) && |
|
1486 |
+ (obj->flags & (1 << OBJ_STREAM))) { |
|
1487 |
+ /* |
|
1488 |
+ * Object contains a stream. Parse this now. |
|
1489 |
+ */ |
|
1490 |
+ cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id>>8, obj->id&0xff); |
|
1490 | 1491 |
|
1491 |
- if (NULL != obj->objstm) { |
|
1492 |
- cli_warnmsg("pdf_extract_obj: Object found in object stream claims to be an object stream! Skipping.\n"); |
|
1493 |
- break; |
|
1492 |
+ const char *start = pdf->map + obj->start; |
|
1493 |
+ |
|
1494 |
+ size_t length; |
|
1495 |
+ size_t orig_length; |
|
1496 |
+ int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */ |
|
1497 |
+ |
|
1498 |
+ const char *pstr; |
|
1499 |
+ struct pdf_dict *dparams = NULL; |
|
1500 |
+ struct objstm_struct *objstm = NULL; |
|
1501 |
+ int xref = 0; |
|
1502 |
+ |
|
1503 |
+ /* Find and interpret the length dictionary value */ |
|
1504 |
+ length = find_length(pdf, obj, start, dict_len); |
|
1505 |
+ if (length < 0) |
|
1506 |
+ length = 0; |
|
1507 |
+ |
|
1508 |
+ orig_length = length; |
|
1509 |
+ |
|
1510 |
+ if (length > obj->stream_size) { |
|
1511 |
+ cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
|
1512 |
+ noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
|
1513 |
+ |
|
1514 |
+ length = obj->stream_size; |
|
1515 |
+ } |
|
1516 |
+ |
|
1517 |
+ if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length <= 0)) { |
|
1518 |
+ /* |
|
1519 |
+ * If the length is unknown and this doesn't contain a FLATE encoded filter... |
|
1520 |
+ * Calculate the length using the stream size, and trimming |
|
1521 |
+ * off any newline/carriage returns from the end of the stream. |
|
1522 |
+ */ |
|
1523 |
+ const char *q = start + obj->stream_size; |
|
1524 |
+ length = obj->stream_size; |
|
1525 |
+ q--; |
|
1526 |
+ |
|
1527 |
+ if (*q == '\n') { |
|
1528 |
+ q--; |
|
1529 |
+ length--; |
|
1530 |
+ |
|
1531 |
+ if (*q == '\r') |
|
1532 |
+ length--; |
|
1533 |
+ } else if (*q == '\r') { |
|
1534 |
+ length--; |
|
1494 | 1535 |
} |
1495 | 1536 |
|
1496 |
- find_stream_bounds(start, pdf->size - obj->start, |
|
1497 |
- pdf->size - obj->start, |
|
1498 |
- &p_stream, &p_endstream, |
|
1499 |
- pdf->enc_method_stream <= ENC_IDENTITY && |
|
1500 |
- pdf->enc_method_embeddedfile <= ENC_IDENTITY); |
|
1501 |
- |
|
1502 |
- if (p_stream && p_endstream) { |
|
1503 |
- size_t size = p_endstream - p_stream; |
|
1504 |
- off_t orig_length; |
|
1505 |
- int len = p_stream; |
|
1506 |
- const char *pstr; |
|
1507 |
- struct pdf_dict *dparams = NULL; |
|
1508 |
- struct objstm_struct *objstm = NULL; |
|
1509 |
- int xref = 0; |
|
1510 |
- |
|
1511 |
- length = find_length(pdf, obj, start, p_stream); |
|
1512 |
- if (length < 0) |
|
1513 |
- length = 0; |
|
1514 |
- |
|
1515 |
- orig_length = length; |
|
1516 |
- if (length > pdf->size || obj->start + p_stream + length > pdf->size) { |
|
1517 |
- cli_dbgmsg("cli_pdf: length out of file: %lld + %lld > %lld\n", |
|
1518 |
- (long long)p_stream, (long long)length, (long long)pdf->size); |
|
1519 |
- noisy_warnmsg("length out of file, truncated: %lld + %lld > %lld\n", |
|
1520 |
- (long long)p_stream, (long long)length, (long long)pdf->size); |
|
1521 |
- length = pdf->size - (obj->start + p_stream); |
|
1522 |
- } |
|
1537 |
+ if (length < 0) |
|
1538 |
+ length = 0; |
|
1523 | 1539 |
|
1524 |
- if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) { |
|
1525 |
- const char *q = start + p_endstream; |
|
1526 |
- length = size; |
|
1527 |
- q--; |
|
1540 |
+ cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length); |
|
1541 |
+ } else { |
|
1542 |
+ if (obj->stream_size > (size_t)length + 2) { |
|
1543 |
+ cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n", |
|
1544 |
+ (size_t)length, obj->stream_size); |
|
1545 |
+ length = obj->stream_size; |
|
1546 |
+ } |
|
1547 |
+ } |
|
1528 | 1548 |
|
1529 |
- if (*q == '\n') { |
|
1530 |
- q--; |
|
1531 |
- length--; |
|
1549 |
+ if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) { |
|
1550 |
+ cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n", |
|
1551 |
+ (long long)orig_length, (long long)length, obj->stream_size); |
|
1552 |
+ pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
|
1553 |
+ } |
|
1532 | 1554 |
|
1533 |
- if (*q == '\r') |
|
1534 |
- length--; |
|
1535 |
- } else if (*q == '\r') { |
|
1536 |
- length--; |
|
1537 |
- } |
|
1555 |
+ if (0 == length) { |
|
1556 |
+ length = obj->stream_size; |
|
1557 |
+ if (0 == length) { |
|
1558 |
+ cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n"); |
|
1559 |
+ goto done; /* Empty stream, nothing to scan */ |
|
1560 |
+ } |
|
1561 |
+ } |
|
1538 | 1562 |
|
1539 |
- if (length < 0) |
|
1540 |
- length = 0; |
|
1563 |
+ /* Check if XRef is enabled */ |
|
1564 |
+ if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) { |
|
1565 |
+ xref = 1; |
|
1566 |
+ } |
|
1541 | 1567 |
|
1542 |
- cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length); |
|
1543 |
- } else { |
|
1544 |
- if (size > (size_t)length+2) { |
|
1545 |
- cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n", |
|
1546 |
- (size_t)length, size); |
|
1547 |
- length = size; |
|
1548 |
- } |
|
1549 |
- } |
|
1568 |
+ cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1550 | 1569 |
|
1551 |
- if (orig_length && size > (size_t)orig_length + 20) { |
|
1552 |
- cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n", |
|
1553 |
- (long long)orig_length, (long long)length, size); |
|
1554 |
- pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
|
1555 |
- } |
|
1570 |
+ /* |
|
1571 |
+ * Identify the DecodeParms, if available. |
|
1572 |
+ */ |
|
1573 |
+ if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) |
|
1574 |
+ { |
|
1575 |
+ cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n"); |
|
1576 |
+ } |
|
1577 |
+ else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) |
|
1578 |
+ { |
|
1579 |
+ cli_dbgmsg("pdf_extract_obj: Found /DP\n"); |
|
1580 |
+ } |
|
1556 | 1581 |
|
1557 |
- if (!length) { |
|
1558 |
- length = size; |
|
1559 |
- if (!length) { |
|
1560 |
- cli_dbgmsg("pdf_extract_obj: length and size both 0\n"); |
|
1561 |
- break; /* Empty stream, nothing to scan */ |
|
1562 |
- } |
|
1563 |
- } |
|
1582 |
+ if (pstr) { |
|
1583 |
+ /* shift pstr left to "<<" for pdf_parse_dict */ |
|
1584 |
+ while ((*pstr == '<') && (pstr > start)) { |
|
1585 |
+ pstr--; |
|
1586 |
+ dict_len++; |
|
1587 |
+ } |
|
1564 | 1588 |
|
1565 |
- if (cli_memstr(start, p_stream, "/XRef", 5)) |
|
1566 |
- xref = 1; |
|
1589 |
+ /* shift pstr right to "<<" for pdf_parse_dict */ |
|
1590 |
+ while ((*pstr != '<') && (dict_len > 0)) { |
|
1591 |
+ pstr++; |
|
1592 |
+ dict_len--; |
|
1593 |
+ } |
|
1567 | 1594 |
|
1568 |
- cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1595 |
+ if (dict_len > 4) |
|
1596 |
+ dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL); |
|
1597 |
+ else |
|
1598 |
+ cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n"); |
|
1599 |
+ } |
|
1569 | 1600 |
|
1570 |
- /* |
|
1571 |
- * Identify the DecodeParms, if available. |
|
1572 |
- */ |
|
1573 |
- if (NULL != (pstr = pdf_getdict(start, &len, "/DecodeParms"))) |
|
1574 |
- { |
|
1575 |
- cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n"); |
|
1601 |
+ /* |
|
1602 |
+ * Go back to the start of the dictionary and check to see if the stream |
|
1603 |
+ * is an object stream. If so, collect the relevant info. |
|
1604 |
+ */ |
|
1605 |
+ dict_len = obj->stream - start; |
|
1606 |
+ if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) |
|
1607 |
+ { |
|
1608 |
+ int32_t objstm_first = -1; |
|
1609 |
+ int32_t objstm_length = -1; |
|
1610 |
+ int32_t objstm_n = -1; |
|
1611 |
+ |
|
1612 |
+ cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); |
|
1613 |
+ |
|
1614 |
+ dict_len = obj->stream - start; |
|
1615 |
+ if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) |
|
1616 |
+ { |
|
1617 |
+ cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n"); |
|
1618 |
+ } |
|
1619 |
+ else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) |
|
1620 |
+ { |
|
1621 |
+ cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n"); |
|
1622 |
+ } |
|
1623 |
+ else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) |
|
1624 |
+ { |
|
1625 |
+ cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); |
|
1626 |
+ } |
|
1627 |
+ else |
|
1628 |
+ { |
|
1629 |
+ /* Add objstm to pdf struct, so it can be freed eventually */ |
|
1630 |
+ pdf->nobjstms++; |
|
1631 |
+ pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms); |
|
1632 |
+ if (!pdf->objstms) { |
|
1633 |
+ cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1634 |
+ pdf_free_dict(dparams); |
|
1635 |
+ return CL_EMEM; |
|
1576 | 1636 |
} |
1577 |
- else if (NULL != (pstr = pdf_getdict(start, &len, "/DP"))) |
|
1578 |
- { |
|
1579 |
- cli_dbgmsg("pdf_extract_obj: Found /DP\n"); |
|
1637 |
+ |
|
1638 |
+ objstm = malloc(sizeof(struct objstm_struct)); |
|
1639 |
+ if (!objstm) { |
|
1640 |
+ cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1641 |
+ pdf_free_dict(dparams); |
|
1642 |
+ return CL_EMEM; |
|
1580 | 1643 |
} |
1644 |
+ pdf->objstms[pdf->nobjstms-1] = objstm; |
|
1581 | 1645 |
|
1582 |
- if (pstr) { |
|
1583 |
- unsigned int objsize = obj_size(pdf, obj, 1); |
|
1646 |
+ memset(objstm, 0, sizeof(*objstm)); |
|
1584 | 1647 |
|
1585 |
- /* shift pstr left to "<<" for pdf_parse_dict */ |
|
1586 |
- while ((*pstr == '<') && (pstr > start)) { |
|
1587 |
- pstr--; |
|
1588 |
- len++; |
|
1589 |
- } |
|
1648 |
+ objstm->first = (uint32_t)objstm_first; |
|
1649 |
+ objstm->current = (uint32_t)objstm_first; |
|
1650 |
+ objstm->current_pair = 0; |
|
1651 |
+ objstm->length = (uint32_t)objstm_length; |
|
1652 |
+ objstm->n = (uint32_t)objstm_n; |
|
1590 | 1653 |
|
1591 |
- /* shift pstr right to "<<" for pdf_parse_dict */ |
|
1592 |
- while ((*pstr != '<') && (len > 0)) { |
|
1593 |
- pstr++; |
|
1594 |
- len--; |
|
1595 |
- } |
|
1654 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first); |
|
1655 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length); |
|
1656 |
+ cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n); |
|
1657 |
+ } |
|
1658 |
+ } |
|
1596 | 1659 |
|
1597 |
- if (len > 4) |
|
1598 |
- dparams = pdf_parse_dict(pdf, obj, objsize, (char *)pstr, NULL); |
|
1599 |
- else |
|
1600 |
- cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n"); |
|
1601 |
- } |
|
1660 |
+ sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm); |
|
1661 |
+ if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { |
|
1662 |
+ cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); |
|
1663 |
+ |
|
1664 |
+ /* It's ok if we couldn't decode the stream, |
|
1665 |
+ * make a best effort to keep parsing. */ |
|
1666 |
+ if (CL_EPARSE == rc) |
|
1667 |
+ rc = CL_SUCCESS; |
|
1602 | 1668 |
|
1669 |
+ if (NULL != objstm) { |
|
1603 | 1670 |
/* |
1604 |
- * Identify if the stream is an object stream. If so, collect the relevant info. |
|
1671 |
+ * If we were expecting an objstm and there was a failure... |
|
1672 |
+ * discard the memory for last object stream. |
|
1605 | 1673 |
*/ |
1606 |
- len = p_stream; |
|
1607 |
- if (NULL != (pstr = pdf_getdict(start, &len, "/Type/ObjStm"))) |
|
1608 |
- { |
|
1609 |
- int32_t objstm_first = -1; |
|
1610 |
- int32_t objstm_length = -1; |
|
1611 |
- int32_t objstm_n = -1; |
|
1612 |
- |
|
1613 |
- cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); |
|
1614 |
- |
|
1615 |
- len = p_stream; |
|
1616 |
- if ((-1 == (objstm_first = pdf_readint(start, len, "/First")))) |
|
1617 |
- { |
|
1618 |
- cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n"); |
|
1619 |
- } |
|
1620 |
- else if ((-1 == (objstm_length = pdf_readint(start, len, "/Length")))) |
|
1621 |
- { |
|
1622 |
- cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n"); |
|
1623 |
- } |
|
1624 |
- else if ((-1 == (objstm_n = pdf_readint(start, len, "/N")))) |
|
1625 |
- { |
|
1626 |
- cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); |
|
1627 |
- } |
|
1628 |
- else |
|
1629 |
- { |
|
1630 |
- /* Add objstm to pdf struct, so it can be freed eventually */ |
|
1631 |
- pdf->nobjstms++; |
|
1632 |
- pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms); |
|
1633 |
- if (!pdf->objstms) { |
|
1634 |
- cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1635 |
- pdf_free_dict(dparams); |
|
1636 |
- return CL_EMEM; |
|
1637 |
- } |
|
1638 |
- |
|
1639 |
- objstm = malloc(sizeof(struct objstm_struct)); |
|
1640 |
- if (!objstm) { |
|
1641 |
- cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
|
1642 |
- pdf_free_dict(dparams); |
|
1643 |
- return CL_EMEM; |
|
1674 |
+ if (NULL != pdf->objstms) { |
|
1675 |
+ if (NULL != pdf->objstms[pdf->nobjstms - 1]) { |
|
1676 |
+ if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) { |
|
1677 |
+ free(pdf->objstms[pdf->nobjstms - 1]->streambuf); |
|
1678 |
+ pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL; |
|
1644 | 1679 |
} |
1645 |
- pdf->objstms[pdf->nobjstms-1] = objstm; |
|
1646 |
- |
|
1647 |
- memset(objstm, 0, sizeof(*objstm)); |
|
1648 |
- |
|
1649 |
- objstm->first = (uint32_t)objstm_first; |
|
1650 |
- objstm->current = (uint32_t)objstm_first; |
|
1651 |
- objstm->current_pair = 0; |
|
1652 |
- objstm->length = (uint32_t)objstm_length; |
|
1653 |
- objstm->n = (uint32_t)objstm_n; |
|
1654 |
- |
|
1655 |
- cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first); |
|
1656 |
- cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length); |
|
1657 |
- cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n); |
|
1680 |
+ free(pdf->objstms[pdf->nobjstms - 1]); |
|
1681 |
+ pdf->objstms[pdf->nobjstms - 1] = NULL; |
|
1658 | 1682 |
} |
1659 |
- } |
|
1660 | 1683 |
|
1661 |
- sum = pdf_decodestream(pdf, obj, dparams, start + p_stream, (uint32_t)length, xref, fout, &rc, objstm); |
|
1662 |
- if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { |
|
1663 |
- cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); |
|
1664 |
- |
|
1665 |
- /* It's ok if we couldn't decode the stream, |
|
1666 |
- * make a best effort to keep parsing. */ |
|
1667 |
- if (CL_EPARSE == rc) |
|
1668 |
- rc = CL_SUCCESS; |
|
1669 |
- |
|
1670 |
- if (NULL != objstm) { |
|
1671 |
- /* |
|
1672 |
- * If we were expecting an objstm and there was a failure... |
|
1673 |
- * discard the memory for last object stream. |
|
1674 |
- */ |
|
1675 |
- if (NULL != pdf->objstms) { |
|
1676 |
- if (NULL != pdf->objstms[pdf->nobjstms - 1]) { |
|
1677 |
- if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) { |
|
1678 |
- free(pdf->objstms[pdf->nobjstms - 1]->streambuf); |
|
1679 |
- pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL; |
|
1680 |
- } |
|
1681 |
- free(pdf->objstms[pdf->nobjstms - 1]); |
|
1682 |
- pdf->objstms[pdf->nobjstms - 1] = NULL; |
|
1683 |
- } |
|
1684 |
+ /* Pop the objstm off the end of the pdf->objstms array. */ |
|
1685 |
+ if (pdf->nobjstms > 0) { |
|
1686 |
+ pdf->nobjstms--; |
|
1687 |
+ if (0 == pdf->nobjstms) { |
|
1688 |
+ free(pdf->objstms); |
|
1689 |
+ pdf->objstms = NULL; |
|
1690 |
+ } else { |
|
1691 |
+ pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms); |
|
1684 | 1692 |
|
1685 |
- /* Pop the objstm off the end of the pdf->objstms array. */ |
|
1686 |
- if (pdf->nobjstms > 0) { |
|
1687 |
- pdf->nobjstms--; |
|
1688 |
- if (0 == pdf->nobjstms) { |
|
1689 |
- free(pdf->objstms); |
|
1690 |
- pdf->objstms = NULL; |
|
1691 |
- } else { |
|
1692 |
- pdf->objstms = cli_realloc2(pdf->objstms, sizeof(struct objstm_struct*) * pdf->nobjstms); |
|
1693 |
- |
|
1694 |
- if (!pdf->objstms) { |
|
1695 |
- cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); |
|
1696 |
- return CL_EMEM; |
|
1697 |
- } |
|
1698 |
- } |
|
1699 |
- } else { |
|
1700 |
- /* hm.. this shouldn't happen */ |
|
1701 |
- cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n"); |
|
1693 |
+ if (!pdf->objstms) { |
|
1694 |
+ cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); |
|
1695 |
+ return CL_EMEM; |
|
1702 | 1696 |
} |
1703 | 1697 |
} |
1698 |
+ } else { |
|
1699 |
+ /* hm.. this shouldn't happen */ |
|
1700 |
+ cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n"); |
|
1704 | 1701 |
} |
1705 | 1702 |
} |
1703 |
+ } |
|
1704 |
+ } |
|
1706 | 1705 |
|
1707 |
- if (dparams) |
|
1708 |
- pdf_free_dict(dparams); |
|
1706 |
+ if (dparams) |
|
1707 |
+ pdf_free_dict(dparams); |
|
1709 | 1708 |
|
1710 |
- if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) { |
|
1711 |
- sum = 0; /* prevents post-filter scan */ |
|
1712 |
- break; |
|
1713 |
- } |
|
1709 |
+ if ((rc == CL_VIRUS) && !SCAN_ALLMATCHES) { |
|
1710 |
+ sum = 0; /* prevents post-filter scan */ |
|
1711 |
+ goto done; |
|
1712 |
+ } |
|
1714 | 1713 |
|
1715 |
- cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1716 |
- } else { |
|
1717 |
- noisy_warnmsg("pdf_extract_obj: cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff); |
|
1718 |
- } |
|
1714 |
+ cli_dbgmsg("-------------EXPERIMENTAL-------------\n"); |
|
1719 | 1715 |
|
1720 |
- } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
|
1721 |
- const char *q2; |
|
1722 |
- const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
|
1723 |
- : (const char *)(obj->start + pdf->map); |
|
1716 |
+ } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
|
1717 |
+ const char *q2; |
|
1718 |
+ const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
|
1719 |
+ : (const char *)(obj->start + pdf->map); |
|
1724 | 1720 |
|
1725 |
- /* TODO: get obj-endobj size */ |
|
1726 |
- off_t bytesleft = obj_size(pdf, obj, 0); |
|
1721 |
+ /* TODO: get obj-endobj size */ |
|
1722 |
+ off_t bytesleft = obj->size; |
|
1727 | 1723 |
|
1728 |
- if (bytesleft < 0) |
|
1729 |
- break; |
|
1724 |
+ if (bytesleft < 0) { |
|
1725 |
+ goto done; |
|
1726 |
+ } |
|
1730 | 1727 |
|
1731 |
- do { |
|
1732 |
- char *js = NULL; |
|
1733 |
- size_t js_len = 0; |
|
1734 |
- const char *q3; |
|
1728 |
+ do { |
|
1729 |
+ char *js = NULL; |
|
1730 |
+ size_t js_len = 0; |
|
1731 |
+ const char *q3; |
|
1735 | 1732 |
|
1736 |
- q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); |
|
1737 |
- if (!q2) |
|
1738 |
- break; |
|
1733 |
+ q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); |
|
1734 |
+ if (!q2) |
|
1735 |
+ break; |
|
1739 | 1736 |
|
1740 |
- bytesleft -= q2 - q + 11; |
|
1741 |
- q = q2 + 11; |
|
1737 |
+ bytesleft -= q2 - q + 11; |
|
1738 |
+ q = q2 + 11; |
|
1742 | 1739 |
|
1743 |
- js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1<<DECRYPTABLE_PDF))); |
|
1744 |
- bytesleft -= q2 - q; |
|
1745 |
- q = q2; |
|
1740 |
+ js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1<<DECRYPTABLE_PDF))); |
|
1741 |
+ bytesleft -= q2 - q; |
|
1742 |
+ q = q2; |
|
1746 | 1743 |
|
1747 |
- if (js) { |
|
1748 |
- char *decrypted = NULL; |
|
1749 |
- const char *out = js; |
|
1750 |
- js_len = strlen(js); |
|
1751 |
- if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
|
1752 |
- cli_dbgmsg("pdf_extract_obj: encrypted string\n"); |
|
1753 |
- decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); |
|
1744 |
+ if (js) { |
|
1745 |
+ char *decrypted = NULL; |
|
1746 |
+ const char *out = js; |
|
1747 |
+ js_len = strlen(js); |
|
1748 |
+ if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
|
1749 |
+ cli_dbgmsg("pdf_extract_obj: encrypted string\n"); |
|
1750 |
+ decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); |
|
1754 | 1751 |
|
1755 |
- if (decrypted) { |
|
1756 |
- noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff); |
|
1757 |
- out = decrypted; |
|
1758 |
- } |
|
1752 |
+ if (decrypted) { |
|
1753 |
+ noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff); |
|
1754 |
+ out = decrypted; |
|
1759 | 1755 |
} |
1756 |
+ } |
|
1760 | 1757 |
|
1761 |
- if (filter_writen(pdf, obj, fout, out, js_len, (size_t*)&sum) != js_len) { |
|
1762 |
- rc = CL_EWRITE; |
|
1763 |
- free(js); |
|
1764 |
- break; |
|
1765 |
- } |
|
1758 |
+ if (filter_writen(pdf, obj, fout, out, js_len, (size_t*)&sum) != js_len) { |
|
1759 |
+ rc = CL_EWRITE; |
|
1760 |
+ free(js); |
|
1761 |
+ break; |
|
1762 |
+ } |
|
1766 | 1763 |
|
1767 |
- free(decrypted); |
|
1768 |
- free(js); |
|
1769 |
- cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft); |
|
1764 |
+ free(decrypted); |
|
1765 |
+ free(js); |
|
1766 |
+ cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft); |
|
1770 | 1767 |
|
1771 |
- if (bytesleft > 0) { |
|
1772 |
- q2 = pdf_nextobject(q, bytesleft); |
|
1773 |
- if (!q2) |
|
1774 |
- q2 = q + bytesleft - 1; |
|
1768 |
+ if (bytesleft > 0) { |
|
1769 |
+ q2 = pdf_nextobject(q, bytesleft); |
|
1770 |
+ if (!q2) |
|
1771 |
+ q2 = q + bytesleft - 1; |
|
1775 | 1772 |
|
1776 |
- /* non-conforming PDFs that don't escape ) properly */ |
|
1777 |
- q3 = memchr(q, ')', bytesleft); |
|
1778 |
- if (q3 && q3 < q2) |
|
1779 |
- q2 = q3; |
|
1773 |
+ /* non-conforming PDFs that don't escape ) properly */ |
|
1774 |
+ q3 = memchr(q, ')', bytesleft); |
|
1775 |
+ if (q3 && q3 < q2) |
|
1776 |
+ q2 = q3; |
|
1780 | 1777 |
|
1781 |
- while (q2 > q && q2[-1] == ' ') |
|
1782 |
- q2--; |
|
1778 |
+ while (q2 > q && q2[-1] == ' ') |
|
1779 |
+ q2--; |
|
1783 | 1780 |
|
1784 |
- if (q2 > q) { |
|
1785 |
- q--; |
|
1786 |
- filter_writen(pdf, obj, fout, q, q2 - q, (size_t*)&sum); |
|
1787 |
- q++; |
|
1788 |
- } |
|
1781 |
+ if (q2 > q) { |
|
1782 |
+ q--; |
|
1783 |
+ filter_writen(pdf, obj, fout, q, q2 - q, (size_t*)&sum); |
|
1784 |
+ q++; |
|
1789 | 1785 |
} |
1790 | 1786 |
} |
1787 |
+ } |
|
1791 | 1788 |
|
1792 |
- } while (bytesleft > 0); |
|
1793 |
- } else { |
|
1794 |
- off_t bytesleft = obj_size(pdf, obj, 0); |
|
1795 |
- |
|
1796 |
- if (bytesleft < 0) |
|
1797 |
- rc = CL_EFORMAT; |
|
1798 |
- else { |
|
1799 |
- if (obj->objstm) { |
|
1800 |
- if (filter_writen(pdf, obj, fout , obj->objstm->streambuf + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft) |
|
1801 |
- rc = CL_EWRITE; |
|
1802 |
- } else { |
|
1803 |
- if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft) |
|
1804 |
- rc = CL_EWRITE; |
|
1805 |
- } |
|
1789 |
+ } while (bytesleft > 0); |
|
1790 |
+ } else { |
|
1791 |
+ off_t bytesleft = obj->size; |
|
1792 |
+ |
|
1793 |
+ if (bytesleft < 0) |
|
1794 |
+ rc = CL_EFORMAT; |
|
1795 |
+ else { |
|
1796 |
+ if (obj->objstm) { |
|
1797 |
+ if (filter_writen(pdf, obj, fout , obj->objstm->streambuf + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft) |
|
1798 |
+ rc = CL_EWRITE; |
|
1799 |
+ } else { |
|
1800 |
+ if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft, (size_t*)&sum) != (size_t)bytesleft) |
|
1801 |
+ rc = CL_EWRITE; |
|
1806 | 1802 |
} |
1807 | 1803 |
} |
1808 |
- } while (0); |
|
1804 |
+ } |
|
1805 |
+ |
|
1806 |
+done: |
|
1809 | 1807 |
|
1810 | 1808 |
cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id>>8, obj->id&0xff); |
1811 | 1809 |
cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname); |
... | ... |
@@ -2079,7 +1996,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len) |
2079 | 2079 |
} |
2080 | 2080 |
genid = (unsigned long)temp_long; |
2081 | 2081 |
|
2082 |
- objid |= genid & 0xff; |
|
2082 |
+ objid |= genid & 0xff; |
|
2083 | 2083 |
q2 = pdf_nextobject(q, len); |
2084 | 2084 |
if (!q2 || *q2 != 'R') |
2085 | 2085 |
return; |
... | ... |
@@ -2116,7 +2033,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2116 | 2116 |
const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL; |
2117 | 2117 |
const char *q = NULL; |
2118 | 2118 |
const char *dict = NULL, *enddict = NULL, *start = NULL; |
2119 |
- off_t dict_length = 0, full_dict_length = 0, objsize = 0, bytesleft = 0; |
|
2119 |
+ off_t dict_length = 0, full_dict_length = 0, bytesleft = 0; |
|
2120 | 2120 |
size_t i = 0; |
2121 | 2121 |
unsigned filters = 0, blockopens = 0; |
2122 | 2122 |
enum objstate objstate = STATE_NONE; |
... | ... |
@@ -2129,6 +2046,8 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2129 | 2129 |
return; |
2130 | 2130 |
} |
2131 | 2131 |
|
2132 |
+ cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff); |
|
2133 |
+ |
|
2132 | 2134 |
if (obj->objstm) { |
2133 | 2135 |
if ((size_t)obj->start > obj->objstm->streambuf_len) { |
2134 | 2136 |
cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n", |
... | ... |
@@ -2146,14 +2065,38 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2146 | 2146 |
} |
2147 | 2147 |
start = q; |
2148 | 2148 |
|
2149 |
- objsize = obj_size(pdf, obj, 1); |
|
2150 |
- if (objsize < 0) |
|
2149 |
+ if (obj->size <= 0) |
|
2151 | 2150 |
return; |
2152 | 2151 |
|
2153 | 2152 |
if (obj->objstm) { |
2154 |
- bytesleft = MIN(objsize, obj->objstm->streambuf_len - obj->start); |
|
2153 |
+ bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start); |
|
2155 | 2154 |
} else { |
2156 |
- bytesleft = MIN(objsize, pdf->size - obj->start); |
|
2155 |
+ bytesleft = MIN(obj->size, pdf->size - obj->start); |
|
2156 |
+ } |
|
2157 |
+ |
|
2158 |
+ /* For objects that aren't already in an object stream^, check if they contain a stream. |
|
2159 |
+ * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */ |
|
2160 |
+ if (NULL == obj->objstm) { |
|
2161 |
+ /* Check if object contains stream */ |
|
2162 |
+ cl_error_t has_stream; |
|
2163 |
+ const char* stream = NULL; |
|
2164 |
+ size_t stream_size = 0; |
|
2165 |
+ |
|
2166 |
+ has_stream = find_stream_bounds( |
|
2167 |
+ start, |
|
2168 |
+ obj->size, |
|
2169 |
+ &stream, |
|
2170 |
+ &stream_size, |
|
2171 |
+ (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY)); |
|
2172 |
+ |
|
2173 |
+ if ((CL_SUCCESS == has_stream) || |
|
2174 |
+ (CL_EFORMAT == has_stream)) { |
|
2175 |
+ /* Stream found. Store this fact and the stream bounds. */ |
|
2176 |
+ cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id>>8, obj->id&0xff, stream_size); |
|
2177 |
+ obj->flags |= (1 << OBJ_STREAM); |
|
2178 |
+ obj->stream = stream; |
|
2179 |
+ obj->stream_size = stream_size; |
|
2180 |
+ } |
|
2157 | 2181 |
} |
2158 | 2182 |
|
2159 | 2183 |
/* find start of dictionary */ |
... | ... |
@@ -2204,7 +2147,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2204 | 2204 |
dict = q3+2; |
2205 | 2205 |
q = dict; |
2206 | 2206 |
blockopens++; |
2207 |
- bytesleft = objsize - (q - start); |
|
2207 |
+ bytesleft = obj->size - (q - start); |
|
2208 | 2208 |
enddict = q + bytesleft - 1; |
2209 | 2209 |
|
2210 | 2210 |
/* find end of dictionary block */ |
... | ... |
@@ -2355,7 +2298,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2355 | 2355 |
pdfobj_flag(pdf, obj, LINEARIZED_PDF); |
2356 | 2356 |
objstate = STATE_NONE; |
2357 | 2357 |
trailer_end = pdf_readint(dict, full_dict_length, "/H"); |
2358 |
- if (trailer_end > 0 && trailer_end < pdf->size) { |
|
2358 |
+ if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) { |
|
2359 | 2359 |
trailer = trailer_end - 1024; |
2360 | 2360 |
if (trailer < 0) |
2361 | 2361 |
trailer = 0; |
... | ... |
@@ -2967,7 +2910,7 @@ void pdf_handle_enc(struct pdf_struct *pdf) |
2967 | 2967 |
return; |
2968 | 2968 |
} |
2969 | 2969 |
|
2970 |
- len = obj_size(pdf, obj, 1); |
|
2970 |
+ len = obj->size; |
|
2971 | 2971 |
q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
2972 | 2972 |
: (const char *)(obj->start + pdf->map); |
2973 | 2973 |
|
... | ... |
@@ -3123,20 +3066,20 @@ void pdf_handle_enc(struct pdf_struct *pdf) |
3123 | 3123 |
} |
3124 | 3124 |
|
3125 | 3125 |
/** |
3126 |
- * @brief Search pdf buffer for objects. Parse each. |
|
3127 |
- * |
|
3126 |
+ * @brief Search pdf buffer for objects. Parse each. |
|
3127 |
+ * |
|
3128 | 3128 |
* Newly found objects will be extracted after completion when the extraction for loop continues. |
3129 |
- * |
|
3130 |
- * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
3129 |
+ * |
|
3130 |
+ * @param pdf Pdf struct that keeps track of all information found in the PDF. |
|
3131 | 3131 |
* @param objstm Pointer to an object stream to parse. |
3132 |
- * |
|
3132 |
+ * |
|
3133 | 3133 |
* @return cl_error_t Error code. |
3134 | 3134 |
*/ |
3135 | 3135 |
cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm) |
3136 | 3136 |
{ |
3137 | 3137 |
cl_error_t status = CL_EFORMAT; |
3138 | 3138 |
cl_error_t retval = CL_EPARSE; |
3139 |
- int32_t foundobj = 0, alerts = 0; |
|
3139 |
+ int32_t alerts = 0; |
|
3140 | 3140 |
uint32_t badobjects = 0; |
3141 | 3141 |
size_t i = 0; |
3142 | 3142 |
|
... | ... |
@@ -3147,11 +3090,8 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs |
3147 | 3147 |
goto done; |
3148 | 3148 |
} |
3149 | 3149 |
|
3150 |
- char* current_pair = objstm->streambuf; |
|
3151 |
- char* current_obj = objstm->streambuf + objstm->first; |
|
3152 |
- |
|
3153 |
- if ((0 == objstm->first) || |
|
3154 |
- (0 == objstm->streambuf_len) || |
|
3150 |
+ if ((0 == objstm->first) || |
|
3151 |
+ (0 == objstm->streambuf_len) || |
|
3155 | 3152 |
(0 == objstm->n)) |
3156 | 3153 |
{ |
3157 | 3154 |
cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n"); |
... | ... |
@@ -3177,7 +3117,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs |
3177 | 3177 |
|
3178 | 3178 |
/* Find object */ |
3179 | 3179 |
retval = pdf_findobj_in_objstm(pdf, objstm, &obj); |
3180 |
- |
|
3180 |
+ |
|
3181 | 3181 |
if (retval != CL_SUCCESS) |
3182 | 3182 |
{ |
3183 | 3183 |
cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n", |
... | ... |
@@ -3207,7 +3147,7 @@ cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objs |
3207 | 3207 |
status = CL_EFORMAT; |
3208 | 3208 |
goto done; |
3209 | 3209 |
} |
3210 |
- |
|
3210 |
+ |
|
3211 | 3211 |
status = CL_SUCCESS; |
3212 | 3212 |
|
3213 | 3213 |
done: |
... | ... |
@@ -3216,18 +3156,17 @@ done: |
3216 | 3216 |
|
3217 | 3217 |
/** |
3218 | 3218 |
* @brief Search pdf buffer for objects. Parse each and then extract each. |
3219 |
- * |
|
3219 |
+ * |
|
3220 | 3220 |
* @param pdf Pdf struct that keeps track of all information found in the PDF. |
3221 | 3221 |
* @param alerts[in/out] The number of alerts, relevant in ALLMATCH mode. |
3222 |
- * |
|
3222 |
+ * |
|
3223 | 3223 |
* @return cl_error_t Error code. |
3224 | 3224 |
*/ |
3225 | 3225 |
cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts) |
3226 | 3226 |
{ |
3227 | 3227 |
cl_error_t status = CL_SUCCESS; |
3228 | 3228 |
int32_t rv = 0; |
3229 |
- int foundobj = 0; |
|
3230 |
- unsigned int i = 0, j = 0; |
|
3229 |
+ unsigned int i = 0; |
|
3231 | 3230 |
uint32_t badobjects = 0; |
3232 | 3231 |
cli_ctx *ctx = pdf->ctx; |
3233 | 3232 |
|
... | ... |
@@ -3269,7 +3208,7 @@ cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf, uint32_t *alerts) |
3269 | 3269 |
* This doesn't trigger for PDFs that are encrypted but don't need |
3270 | 3270 |
* a password to decrypt */ |
3271 | 3271 |
status = cli_append_virus(pdf->ctx, "Heuristics.Encrypted.PDF"); |
3272 |
- if (status == CL_VIRUS) { |
|
3272 |
+ if (status == CL_VIRUS) { |
|
3273 | 3273 |
alerts++; |
3274 | 3274 |
if (SCAN_ALLMATCHES) |
3275 | 3275 |
status = CL_CLEAN; |
... | ... |
@@ -3328,11 +3267,11 @@ done: |
3328 | 3328 |
|
3329 | 3329 |
/** |
3330 | 3330 |
* @brief Primary function for parsing and scanning a PDF. |
3331 |
- * |
|
3331 |
+ * |
|
3332 | 3332 |
* @param dir Filepath for temp file. |
3333 |
- * @param ctx clam scan context structure. |
|
3333 |
+ * @param ctx clam scan context structure. |
|
3334 | 3334 |
* @param offset offset of pdf in ctx->fmap |
3335 |
- * |
|
3335 |
+ * |
|
3336 | 3336 |
* @return int Returns cl_error_t status value. |
3337 | 3337 |
*/ |
3338 | 3338 |
int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
... | ... |
@@ -3532,7 +3471,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
3532 | 3532 |
} |
3533 | 3533 |
|
3534 | 3534 |
/* |
3535 |
- * Find and extract all objects in the PDF. |
|
3535 |
+ * Find and extract all objects in the PDF. |
|
3536 | 3536 |
* New experimental recursive methodology that adds objects from object streams. |
3537 | 3537 |
*/ |
3538 | 3538 |
objs_found = pdf.nobjs; |
... | ... |
@@ -3633,10 +3572,10 @@ done: |
3633 | 3633 |
|
3634 | 3634 |
/** |
3635 | 3635 |
* @brief Skip the rest of the current line, and find the start of the next line. |
3636 |
- * |
|
3636 |
+ * |
|
3637 | 3637 |
* @param ptr Current offset into buffer. |
3638 |
- * @param len Remaining bytes in buffer. |
|
3639 |
- * |
|
3638 |
+ * @param len Remaining bytes in buffer. |
|
3639 |
+ * |
|
3640 | 3640 |
* @return const char* Address of next line, or NULL if no next line in buffer. |
3641 | 3641 |
*/ |
3642 | 3642 |
static const char * |
... | ... |
@@ -3666,13 +3605,13 @@ pdf_nextlinestart(const char *ptr, size_t len) |
3666 | 3666 |
|
3667 | 3667 |
/** |
3668 | 3668 |
* @brief Return the start of the next PDF object. |
3669 |
- * |
|
3669 |
+ * |
|
3670 | 3670 |
* This assumes that we're not in a stream. |
3671 |
- * |
|
3671 |
+ * |
|
3672 | 3672 |
* @param ptr Current offset into buffer. |
3673 |
- * @param len Remaining bytes in buffer. |
|
3674 |
- * |
|
3675 |
- * @return const char* Address of next object in the buffer, or NULL if there is none in the buffer. |
|
3673 |
+ * @param len Remaining bytes in buffer. |
|
3674 |
+ * |
|
3675 |
+ * @return const char* Address of next object in the buffer, or NULL if there is none in the buffer. |
|
3676 | 3676 |
*/ |
3677 | 3677 |
static const char * |
3678 | 3678 |
pdf_nextobject(const char *ptr, size_t len) |
... | ... |
@@ -4015,7 +3954,7 @@ static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
4015 | 4015 |
pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4016 | 4016 |
if (!(pdf->stats.author)) |
4017 | 4017 |
return; |
4018 |
- pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta)); |
|
4018 |
+ pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta)); |
|
4019 | 4019 |
} |
4020 | 4020 |
} |
4021 | 4021 |
#endif |
... | ... |
@@ -4040,7 +3979,7 @@ static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna |
4040 | 4040 |
pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4041 | 4041 |
if (!(pdf->stats.creator)) |
4042 | 4042 |
return; |
4043 |
- pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta)); |
|
4043 |
+ pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta)); |
|
4044 | 4044 |
} |
4045 | 4045 |
} |
4046 | 4046 |
#endif |
... | ... |
@@ -4065,7 +4004,7 @@ static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, str |
4065 | 4065 |
pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4066 | 4066 |
if (!(pdf->stats.modificationdate)) |
4067 | 4067 |
return; |
4068 |
- pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); |
|
4068 |
+ pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); |
|
4069 | 4069 |
} |
4070 | 4070 |
} |
4071 | 4071 |
#endif |
... | ... |
@@ -4090,7 +4029,7 @@ static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct |
4090 | 4090 |
pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4091 | 4091 |
if (!(pdf->stats.creationdate)) |
4092 | 4092 |
return; |
4093 |
- pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); |
|
4093 |
+ pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); |
|
4094 | 4094 |
} |
4095 | 4095 |
} |
4096 | 4096 |
#endif |
... | ... |
@@ -4115,7 +4054,7 @@ static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn |
4115 | 4115 |
pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4116 | 4116 |
if (!(pdf->stats.producer)) |
4117 | 4117 |
return; |
4118 |
- pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta)); |
|
4118 |
+ pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta)); |
|
4119 | 4119 |
} |
4120 | 4120 |
} |
4121 | 4121 |
#endif |
... | ... |
@@ -4140,7 +4079,7 @@ static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4140 | 4140 |
pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4141 | 4141 |
if (!(pdf->stats.title)) |
4142 | 4142 |
return; |
4143 |
- pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta)); |
|
4143 |
+ pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta)); |
|
4144 | 4144 |
} |
4145 | 4145 |
} |
4146 | 4146 |
#endif |
... | ... |
@@ -4165,7 +4104,7 @@ static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfn |
4165 | 4165 |
pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4166 | 4166 |
if (!(pdf->stats.keywords)) |
4167 | 4167 |
return; |
4168 |
- pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta)); |
|
4168 |
+ pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta)); |
|
4169 | 4169 |
} |
4170 | 4170 |
} |
4171 | 4171 |
#endif |
... | ... |
@@ -4190,7 +4129,7 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna |
4190 | 4190 |
pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry)); |
4191 | 4191 |
if (!(pdf->stats.subject)) |
4192 | 4192 |
return; |
4193 |
- pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta)); |
|
4193 |
+ pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta)); |
|
4194 | 4194 |
} |
4195 | 4195 |
} |
4196 | 4196 |
#endif |
... | ... |
@@ -4242,7 +4181,6 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4242 | 4242 |
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4243 | 4243 |
: (const char *)(obj->start + pdf->map); |
4244 | 4244 |
const char *begin; |
4245 |
- unsigned int objsize; |
|
4246 | 4245 |
unsigned long npages=0, count; |
4247 | 4246 |
long temp_long; |
4248 | 4247 |
struct pdf_array_node *node; |
... | ... |
@@ -4257,19 +4195,17 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4257 | 4257 |
if (!(SCAN_COLLECT_METADATA)) |
4258 | 4258 |
return; |
4259 | 4259 |
|
4260 |
- objsize = obj_size(pdf, obj, 1); |
|
4261 |
- |
|
4262 | 4260 |
pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4263 | 4261 |
if (!(pdfobj)) |
4264 | 4262 |
return; |
4265 | 4263 |
|
4266 |
- begin = cli_memstr(objstart, objsize, "/Kids", 5); |
|
4264 |
+ begin = cli_memstr(objstart, obj->size, "/Kids", 5); |
|
4267 | 4265 |
if (!(begin)) |
4268 | 4266 |
return; |
4269 | 4267 |
|
4270 | 4268 |
begin += 5; |
4271 | 4269 |
|
4272 |
- array = pdf_parse_array(pdf, obj, objsize, (char *)begin, NULL); |
|
4270 |
+ array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL); |
|
4273 | 4271 |
if (!(array)) { |
4274 | 4272 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4275 | 4273 |
return; |
... | ... |
@@ -4280,22 +4216,22 @@ static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname |
4280 | 4280 |
if (strchr((char *)(node->data), 'R')) |
4281 | 4281 |
npages++; |
4282 | 4282 |
|
4283 |
- begin = cli_memstr(objstart, objsize, "/Count", 6); |
|
4283 |
+ begin = cli_memstr(objstart, obj->size, "/Count", 6); |
|
4284 | 4284 |
if (!(begin)) { |
4285 | 4285 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4286 | 4286 |
goto cleanup; |
4287 | 4287 |
} |
4288 | 4288 |
|
4289 | 4289 |
begin += 6; |
4290 |
- while (begin - objstart < objsize && isspace(begin[0])) |
|
4290 |
+ while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0])) |
|
4291 | 4291 |
begin++; |
4292 | 4292 |
|
4293 |
- if (begin - objstart >= objsize) { |
|
4293 |
+ if ((size_t)(begin - objstart) >= obj->size) { |
|
4294 | 4294 |
goto cleanup; |
4295 | 4295 |
} |
4296 | 4296 |
|
4297 |
- countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + objsize - begin) |
|
4298 |
- : (size_t)(obj->start + pdf->map + objsize - begin); |
|
4297 |
+ countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin) |
|
4298 |
+ : (size_t)(obj->start + pdf->map + obj->size - begin); |
|
4299 | 4299 |
|
4300 | 4300 |
if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) { |
4301 | 4301 |
cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
... | ... |
@@ -4323,7 +4259,6 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
4323 | 4323 |
char *p1; |
4324 | 4324 |
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4325 | 4325 |
: (const char *)(obj->start + pdf->map); |
4326 |
- size_t objsize; |
|
4327 | 4326 |
|
4328 | 4327 |
UNUSEDPARAM(act); |
4329 | 4328 |
|
... | ... |
@@ -4333,25 +4268,23 @@ static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfnam |
4333 | 4333 |
if (!(SCAN_COLLECT_METADATA)) |
4334 | 4334 |
return; |
4335 | 4335 |
|
4336 |
- objsize = obj_size(pdf, obj, 1); |
|
4337 |
- |
|
4338 |
- p1 = (char *)cli_memstr(objstart, objsize, "/Colors", 7); |
|
4336 |
+ p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7); |
|
4339 | 4337 |
if (!(p1)) |
4340 | 4338 |
return; |
4341 | 4339 |
|
4342 | 4340 |
p1 += 7; |
4343 | 4341 |
|
4344 | 4342 |
/* Ensure that we have at least one whitespace character plus at least one number */ |
4345 |
- if (objsize - (p1 - objstart) < 2) |
|
4343 |
+ if (obj->size - (size_t)(p1 - objstart) < 2) |
|
4346 | 4344 |
return; |
4347 | 4345 |
|
4348 |
- while (p1 - objstart < objsize && isspace(p1[0])) |
|
4346 |
+ while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0])) |
|
4349 | 4347 |
p1++; |
4350 | 4348 |
|
4351 |
- if ((size_t)(p1 - objstart) == objsize) |
|
4349 |
+ if ((size_t)(p1 - objstart) == obj->size) |
|
4352 | 4350 |
return; |
4353 | 4351 |
|
4354 |
- if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - objsize), 0, 10, &temp_long)) { |
|
4352 |
+ if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) { |
|
4355 | 4353 |
return; |
4356 | 4354 |
} else if (temp_long < 0) { |
4357 | 4355 |
return; |
... | ... |
@@ -37,12 +37,14 @@ struct objstm_struct { |
37 | 37 |
|
38 | 38 |
/* Per-object record used by the PDF parser; one is passed to pdf_parseobj() and to the stats callbacks. */
struct pdf_obj {
39 | 39 |
uint32_t start; // Offset of the object inside its containing buffer: pdf->map, or objstm->streambuf when objstm is non-NULL.
40 |
- int32_t size;

40 |
+ size_t size; // Size of the object; users clamp it against the bytes remaining in the containing buffer after start.

41 | 41 |
uint32_t id; // Debug output prints this as (id >> 8) and (id & 0xff); presumably object number and generation number -- TODO confirm.
42 | 42 |
uint32_t flags; // Bit flags; bit (1 << OBJ_STREAM) is set by pdf_parseobj() when the object contains a stream.
43 | 43 |
uint32_t statsflags;
44 | 44 |
uint32_t numfilters;
45 | 45 |
uint32_t filterlist[PDF_FILTERLIST_MAX];
46 |
+ const char *stream; // Pointer to the stream contained in this object; stored by pdf_parseobj() from find_stream_bounds() together with the OBJ_STREAM flag.

47 |
+ size_t stream_size; // Size of the stream contained in this object, as reported by find_stream_bounds().

46 | 48 |
struct objstm_struct *objstm; // Should be NULL unless the obj exists in an object stream (separate buffer); when set, start is relative to objstm->streambuf.
47 | 49 |
char *path;
48 | 50 |
};
... | ... |
@@ -146,7 +148,7 @@ struct pdf_struct { |
146 | 146 |
const char *CF; |
147 | 147 |
long CF_n; |
148 | 148 |
const char *map; |
149 |
- off_t size; |
|
149 |
+ size_t size; |
|
150 | 150 |
off_t offset; |
151 | 151 |
off_t startoff; |
152 | 152 |
cli_ctx *ctx; |
... | ... |
@@ -400,9 +400,9 @@ char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *out |
400 | 400 |
return output; |
401 | 401 |
} |
402 | 402 |
|
403 |
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns) |
|
403 |
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns) |
|
404 | 404 |
{ |
405 |
- unsigned int i, s1, s2; |
|
405 |
+ size_t i, s1, s2; |
|
406 | 406 |
|
407 | 407 |
if(!hs || !ns || hs < ns) |
408 | 408 |
return NULL; |
... | ... |
@@ -68,7 +68,7 @@ int cli_xtoi(const char *hex); |
68 | 68 |
char *cli_str2hex(const char *string, unsigned int len); |
69 | 69 |
char *cli_utf16toascii(const char *str, unsigned int length); |
70 | 70 |
char *cli_strtokbuf(const char *input, int fieldno, const char *delim, char *output); |
71 |
-const char *cli_memstr(const char *haystack, unsigned int hs, const char *needle, unsigned int ns); |
|
71 |
+const char *cli_memstr(const char *haystack, size_t hs, const char *needle, size_t ns); |
|
72 | 72 |
char *cli_strrcpy(char *dest, const char *source); |
73 | 73 |
size_t cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens); |
74 | 74 |
size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens, int token_skip); |