git-svn: trunk@1671
Nigel Horne authored on 2005/07/30 19:08:59... | ... |
@@ -15,7 +15,7 @@ |
15 | 15 |
* along with this program; if not, write to the Free Software |
16 | 16 |
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
17 | 17 |
*/ |
18 |
-static char const rcsid[] = "$Id: pdf.c,v 1.29 2005/07/23 08:54:16 nigelhorne Exp $"; |
|
18 |
+static char const rcsid[] = "$Id: pdf.c,v 1.30 2005/07/30 10:08:59 nigelhorne Exp $"; |
|
19 | 19 |
|
20 | 20 |
#if HAVE_CONFIG_H |
21 | 21 |
#include "clamav-config.h" |
... | ... |
@@ -57,7 +57,10 @@ static char const rcsid[] = "$Id: pdf.c,v 1.29 2005/07/23 08:54:16 nigelhorne Ex |
57 | 57 |
static int flatedecode(const unsigned char *buf, size_t len, int fout); |
58 | 58 |
static int ascii85decode(const char *buf, size_t len, unsigned char *output); |
59 | 59 |
static const char *pdf_nextlinestart(const char *ptr, size_t len); |
60 |
+#if 0 |
|
60 | 61 |
static const char *pdf_nexttoken(const char *ptr, size_t len); |
62 |
+#endif |
|
63 |
+static const char *pdf_nextobject(const char *ptr, size_t len); |
|
61 | 64 |
|
62 | 65 |
int |
63 | 66 |
cli_pdf(const char *dir, int desc) |
... | ... |
@@ -102,13 +105,18 @@ cli_pdf(const char *dir, int desc) |
102 | 102 |
return CL_EFORMAT; |
103 | 103 |
} |
104 | 104 |
|
105 |
- q = pdf_nextlinestart(p, size); |
|
105 |
+#if 0 |
|
106 |
+ q = pdf_nextlinestart(&p[6], size - 6); |
|
106 | 107 |
if(q == NULL) { |
107 | 108 |
munmap(buf, size); |
108 | 109 |
return CL_EFORMAT; |
109 | 110 |
} |
110 | 111 |
bytesleft = size - (long)(q - p); |
111 | 112 |
p = q; |
113 |
+#else |
|
114 |
+ p = &p[6]; |
|
115 |
+ bytesleft = size - 6; |
|
116 |
+#endif |
|
112 | 117 |
|
113 | 118 |
/* Find the file trailer */ |
114 | 119 |
for(q = &p[bytesleft - 6]; q > p; --q) |
... | ... |
@@ -137,7 +145,10 @@ cli_pdf(const char *dir, int desc) |
137 | 137 |
return CL_EFORMAT; |
138 | 138 |
} |
139 | 139 |
|
140 |
+ /* |
|
141 |
+ * not true, since edits may put data after the trailer |
|
140 | 142 |
bytesleft -= trailerlength; |
143 |
+ */ |
|
141 | 144 |
|
142 | 145 |
/* |
143 | 146 |
* FIXME: Handle more than one xref section in the xref table |
... | ... |
@@ -158,22 +169,56 @@ cli_pdf(const char *dir, int desc) |
158 | 158 |
|
159 | 159 |
xreflength = (size_t)(trailerstart - xrefstart); |
160 | 160 |
|
161 |
+ /* |
|
162 |
+ * not true, since edits may put data after the trailer |
|
161 | 163 |
bytesleft -= xreflength; |
164 |
+ */ |
|
162 | 165 |
|
163 | 166 |
/* |
164 |
- * For each object in the body section |
|
167 |
+ * The body section consists of a sequence of indirect objects |
|
165 | 168 |
*/ |
166 |
- while((q = cli_pmemstr(p, bytesleft, " obj", 4)) != NULL) { |
|
169 |
+ while((p < xrefstart) && ((q = pdf_nextobject(p, bytesleft)) != NULL)) { |
|
167 | 170 |
int is_ascii85decode, is_flatedecode, fout, len; |
171 |
+ int object_number, generation_number; |
|
168 | 172 |
const char *objstart, *objend, *streamstart, *streamend; |
169 | 173 |
size_t length, objlen, streamlen; |
170 | 174 |
char fullname[NAME_MAX + 1]; |
171 | 175 |
|
172 |
- bytesleft -= (q - p) + 4; |
|
173 |
- objstart = p = &q[4]; |
|
176 |
+ if(q == xrefstart) |
|
177 |
+ break; |
|
178 |
+ if(memcmp(q, "xref", 4) == 0) |
|
179 |
+ break; |
|
180 |
+ if(!isdigit(*q)) { |
|
181 |
+ cli_warnmsg("Object number missing\n"); |
|
182 |
+ rc = CL_EFORMAT; |
|
183 |
+ break; |
|
184 |
+ } |
|
185 |
+ object_number = atoi(q); |
|
186 |
+ bytesleft -= (q - p); |
|
187 |
+ p = q; |
|
188 |
+ |
|
189 |
+ q = pdf_nextobject(p, bytesleft); |
|
190 |
+ if((q == NULL) || !isdigit(*q)) { |
|
191 |
+ cli_warnmsg("Generation number missing\n"); |
|
192 |
+ rc = CL_EFORMAT; |
|
193 |
+ break; |
|
194 |
+ } |
|
195 |
+ generation_number = atoi(q); |
|
196 |
+ bytesleft -= (q - p); |
|
197 |
+ p = q; |
|
198 |
+ |
|
199 |
+ q = pdf_nextobject(p, bytesleft); |
|
200 |
+ if((q == NULL) || (memcmp(q, "obj", 3) != 0)) { |
|
201 |
+ cli_warnmsg("Indirect object missing \"obj\"\n"); |
|
202 |
+ rc = CL_EFORMAT; |
|
203 |
+ break; |
|
204 |
+ } |
|
205 |
+ |
|
206 |
+ bytesleft -= (q - p) + 3; |
|
207 |
+ objstart = p = &q[3]; |
|
174 | 208 |
objend = cli_pmemstr(p, bytesleft, "endobj", 6); |
175 | 209 |
if(objend == NULL) { |
176 |
- cli_dbgmsg("No matching endobj"); |
|
210 |
+ cli_dbgmsg("No matching endobj\n"); |
|
177 | 211 |
break; |
178 | 212 |
} |
179 | 213 |
bytesleft -= (objend - p) + 6; |
... | ... |
@@ -191,7 +236,7 @@ cli_pdf(const char *dir, int desc) |
191 | 191 |
*/ |
192 | 192 |
q = objstart; |
193 | 193 |
while(q < streamstart) { |
194 |
- if(*q == '/') { |
|
194 |
+ if(*q == '/') { /* name object */ |
|
195 | 195 |
if(strncmp(++q, "Length ", 7) == 0) { |
196 | 196 |
q += 7; |
197 | 197 |
length = atoi(q); |
... | ... |
@@ -206,7 +251,7 @@ cli_pdf(const char *dir, int desc) |
206 | 206 |
q += 13; |
207 | 207 |
} |
208 | 208 |
} |
209 |
- q = pdf_nexttoken(q, (size_t)(streamstart - q)); |
|
209 |
+ q = pdf_nextobject(q, (size_t)(streamstart - q)); |
|
210 | 210 |
if(q == NULL) |
211 | 211 |
break; |
212 | 212 |
} |
... | ... |
@@ -292,7 +337,8 @@ cli_pdf(const char *dir, int desc) |
292 | 292 |
|
293 | 293 |
if(zstat != Z_OK) |
294 | 294 |
rc = CL_EZIP; |
295 |
- } |
|
295 |
+ } else |
|
296 |
+ cli_writen(fout, (char *)streamstart, streamlen); |
|
296 | 297 |
} |
297 | 298 |
free(tmpbuf); |
298 | 299 |
} else if(is_flatedecode) { |
... | ... |
@@ -305,6 +351,7 @@ cli_pdf(const char *dir, int desc) |
305 | 305 |
|
306 | 306 |
close(fout); |
307 | 307 |
cli_dbgmsg("cli_pdf: extracted to %s\n", fullname); |
308 |
+ |
|
308 | 309 |
} |
309 | 310 |
|
310 | 311 |
munmap(buf, size); |
... | ... |
@@ -448,6 +495,7 @@ pdf_nextlinestart(const char *ptr, size_t len) |
448 | 448 |
return ptr; |
449 | 449 |
} |
450 | 450 |
|
451 |
+#if 0 |
|
451 | 452 |
/* |
452 | 453 |
* Return the start of the next PDF token. |
453 | 454 |
* This assumes that we're not in a stream. |
... | ... |
@@ -471,7 +519,8 @@ pdf_nexttoken(const char *ptr, size_t len) |
471 | 471 |
intoken = 0; |
472 | 472 |
break; |
473 | 473 |
|
474 |
- /*case '(': |
|
474 |
+ case ' ': |
|
475 |
+ case '(': |
|
475 | 476 |
case ')': |
476 | 477 |
case '<': |
477 | 478 |
case '>': |
... | ... |
@@ -480,19 +529,58 @@ pdf_nexttoken(const char *ptr, size_t len) |
480 | 480 |
case '{': |
481 | 481 |
case '}': |
482 | 482 |
case '/': |
483 |
+ case '\t': |
|
484 |
+ case '\v': |
|
485 |
+ case '\f': |
|
486 |
+ intoken = 0; |
|
487 |
+ ptr++; |
|
488 |
+ len--; |
|
489 |
+ break; |
|
490 |
+ default: |
|
483 | 491 |
if(!intoken) |
484 | 492 |
return ptr; |
485 | 493 |
ptr++; |
486 | 494 |
len--; |
487 |
- break;*/ |
|
495 |
+ } |
|
496 |
+ } |
|
497 |
+ return NULL; |
|
498 |
+} |
|
499 |
+#endif |
|
500 |
+ |
|
501 |
+/* |
|
502 |
+ * Return the start of the next PDF object. |
|
503 |
+ * This assumes that we're not in a stream. |
|
504 |
+ */ |
|
505 |
+static const char * |
|
506 |
+pdf_nextobject(const char *ptr, size_t len) |
|
507 |
+{ |
|
508 |
+ const char *p; |
|
509 |
+ int inobject = 1; |
|
510 |
+ |
|
511 |
+ while(len) { |
|
512 |
+ switch(*ptr) { |
|
513 |
+ case '\n': |
|
514 |
+ case '\r': |
|
515 |
+ case '%': /* comment */ |
|
516 |
+ p = pdf_nextlinestart(ptr, len); |
|
517 |
+ if(p == NULL) |
|
518 |
+ return NULL; |
|
519 |
+ len -= (size_t)(p - ptr); |
|
520 |
+ ptr = p; |
|
521 |
+ inobject = 0; |
|
522 |
+ break; |
|
523 |
+ |
|
488 | 524 |
case ' ': |
489 | 525 |
case '\t': |
490 |
- intoken = 0; |
|
526 |
+ case '\v': |
|
527 |
+ case '\f': |
|
528 |
+ inobject = 0; |
|
491 | 529 |
ptr++; |
492 | 530 |
len--; |
493 | 531 |
break; |
494 | 532 |
default: |
495 |
- if(!intoken) |
|
533 |
+ if(!inobject) |
|
534 |
+ /* TODO: parse and return object type */ |
|
496 | 535 |
return ptr; |
497 | 536 |
ptr++; |
498 | 537 |
len--; |