Browse code

Better parsing

git-svn: trunk@1671

Nigel Horne authored on 2005/07/30 19:08:59
Showing 1 changed file
... ...
@@ -15,7 +15,7 @@
15 15
  *  along with this program; if not, write to the Free Software
16 16
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 17
  */
18
-static	char	const	rcsid[] = "$Id: pdf.c,v 1.29 2005/07/23 08:54:16 nigelhorne Exp $";
18
+static	char	const	rcsid[] = "$Id: pdf.c,v 1.30 2005/07/30 10:08:59 nigelhorne Exp $";
19 19
 
20 20
 #if HAVE_CONFIG_H
21 21
 #include "clamav-config.h"
... ...
@@ -57,7 +57,10 @@ static	char	const	rcsid[] = "$Id: pdf.c,v 1.29 2005/07/23 08:54:16 nigelhorne Ex
57 57
 static	int	flatedecode(const unsigned char *buf, size_t len, int fout);
58 58
 static	int	ascii85decode(const char *buf, size_t len, unsigned char *output);
59 59
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
60
+#if	0
60 61
 static	const	char	*pdf_nexttoken(const char *ptr, size_t len);
62
+#endif
63
+static	const	char	*pdf_nextobject(const char *ptr, size_t len);
61 64
 
62 65
 int
63 66
 cli_pdf(const char *dir, int desc)
... ...
@@ -102,13 +105,18 @@ cli_pdf(const char *dir, int desc)
102 102
 		return CL_EFORMAT;
103 103
 	}
104 104
 
105
-	q = pdf_nextlinestart(p, size);
105
+#if	0
106
+	q = pdf_nextlinestart(&p[6], size - 6);
106 107
 	if(q == NULL) {
107 108
 		munmap(buf, size);
108 109
 		return CL_EFORMAT;
109 110
 	}
110 111
 	bytesleft = size - (long)(q - p);
111 112
 	p = q;
113
+#else
114
+	p = &p[6];
115
+	bytesleft = size - 6;
116
+#endif
112 117
 
113 118
 	/* Find the file trailer */
114 119
 	for(q = &p[bytesleft - 6]; q > p; --q)
... ...
@@ -137,7 +145,10 @@ cli_pdf(const char *dir, int desc)
137 137
 		return CL_EFORMAT;
138 138
 	}
139 139
 
140
+	/*
141
+	 * not true, since edits may put data after the trailer
140 142
 	bytesleft -= trailerlength;
143
+	 */
141 144
 
142 145
 	/*
143 146
 	 * FIXME: Handle more than one xref section in the xref table
... ...
@@ -158,22 +169,56 @@ cli_pdf(const char *dir, int desc)
158 158
 
159 159
 	xreflength = (size_t)(trailerstart - xrefstart);
160 160
 
161
+	/*
162
+	 * not true, since edits may put data after the trailer
161 163
 	bytesleft -= xreflength;
164
+	 */
162 165
 
163 166
 	/*
164
-	 * For each object in the body section
167
+	 * The body section consists of a sequence of indirect objects
165 168
 	 */
166
-	while((q = cli_pmemstr(p, bytesleft, " obj", 4)) != NULL) {
169
+	while((p < xrefstart) && ((q = pdf_nextobject(p, bytesleft)) != NULL)) {
167 170
 		int is_ascii85decode, is_flatedecode, fout, len;
171
+		int object_number, generation_number;
168 172
 		const char *objstart, *objend, *streamstart, *streamend;
169 173
 		size_t length, objlen, streamlen;
170 174
 		char fullname[NAME_MAX + 1];
171 175
 
172
-		bytesleft -= (q - p) + 4;
173
-		objstart = p = &q[4];
176
+		if(q == xrefstart)
177
+			break;
178
+		if(memcmp(q, "xref", 4) == 0)
179
+			break;
180
+		if(!isdigit(*q)) {
181
+			cli_warnmsg("Object number missing\n");
182
+			rc = CL_EFORMAT;
183
+			break;
184
+		}
185
+		object_number = atoi(q);
186
+		bytesleft -= (q - p);
187
+		p = q;
188
+
189
+		q = pdf_nextobject(p, bytesleft);
190
+		if((q == NULL) || !isdigit(*q)) {
191
+			cli_warnmsg("Generation number missing\n");
192
+			rc = CL_EFORMAT;
193
+			break;
194
+		}
195
+		generation_number = atoi(q);
196
+		bytesleft -= (q - p);
197
+		p = q;
198
+
199
+		q = pdf_nextobject(p, bytesleft);
200
+		if((q == NULL) || (memcmp(q, "obj", 3) != 0)) {
201
+			cli_warnmsg("Indirect object missing \"obj\"\n");
202
+			rc = CL_EFORMAT;
203
+			break;
204
+		}
205
+
206
+		bytesleft -= (q - p) + 3;
207
+		objstart = p = &q[3];
174 208
 		objend = cli_pmemstr(p, bytesleft, "endobj", 6);
175 209
 		if(objend == NULL) {
176
-			cli_dbgmsg("No matching endobj");
210
+			cli_dbgmsg("No matching endobj\n");
177 211
 			break;
178 212
 		}
179 213
 		bytesleft -= (objend - p) + 6;
... ...
@@ -191,7 +236,7 @@ cli_pdf(const char *dir, int desc)
191 191
 		 */
192 192
 		q = objstart;
193 193
 		while(q < streamstart) {
194
-			if(*q == '/') {
194
+			if(*q == '/') {	/* name object */
195 195
 				if(strncmp(++q, "Length ", 7) == 0) {
196 196
 					q += 7;
197 197
 					length = atoi(q);
... ...
@@ -206,7 +251,7 @@ cli_pdf(const char *dir, int desc)
206 206
 					q += 13;
207 207
 				}
208 208
 			}
209
-			q = pdf_nexttoken(q, (size_t)(streamstart - q));
209
+			q = pdf_nextobject(q, (size_t)(streamstart - q));
210 210
 			if(q == NULL)
211 211
 				break;
212 212
 		}
... ...
@@ -292,7 +337,8 @@ cli_pdf(const char *dir, int desc)
292 292
 
293 293
 					if(zstat != Z_OK)
294 294
 						rc = CL_EZIP;
295
-				}
295
+				} else
296
+					cli_writen(fout, (char *)streamstart, streamlen);
296 297
 			}
297 298
 			free(tmpbuf);
298 299
 		} else if(is_flatedecode) {
... ...
@@ -305,6 +351,7 @@ cli_pdf(const char *dir, int desc)
305 305
 
306 306
 		close(fout);
307 307
 		cli_dbgmsg("cli_pdf: extracted to %s\n", fullname);
308
+
308 309
 	}
309 310
 
310 311
 	munmap(buf, size);
... ...
@@ -448,6 +495,7 @@ pdf_nextlinestart(const char *ptr, size_t len)
448 448
 	return ptr;
449 449
 }
450 450
 
451
+#if	0
451 452
 /*
452 453
  * Return the start of the next PDF token.
453 454
  * This assumes that we're not in a stream.
... ...
@@ -471,7 +519,8 @@ pdf_nexttoken(const char *ptr, size_t len)
471 471
 				intoken = 0;
472 472
 				break;
473 473
 
474
-			/*case '(':
474
+			case ' ':
475
+			case '(':
475 476
 			case ')':
476 477
 			case '<':
477 478
 			case '>':
... ...
@@ -480,19 +529,58 @@ pdf_nexttoken(const char *ptr, size_t len)
480 480
 			case '{':
481 481
 			case '}':
482 482
 			case '/':
483
+			case '\t':
484
+			case '\v':
485
+			case '\f':
486
+				intoken = 0;
487
+				ptr++;
488
+				len--;
489
+				break;
490
+			default:
483 491
 				if(!intoken)
484 492
 					return ptr;
485 493
 				ptr++;
486 494
 				len--;
487
-				break;*/
495
+		}
496
+	}
497
+	return NULL;
498
+}
499
+#endif
500
+
501
+/*
502
+ * Return the start of the next PDF object.
503
+ * This assumes that we're not in a stream.
504
+ */
505
+static const char *
506
+pdf_nextobject(const char *ptr, size_t len)
507
+{
508
+	const char *p;
509
+	int inobject = 1;
510
+
511
+	while(len) {
512
+		switch(*ptr) {
513
+			case '\n':
514
+			case '\r':
515
+			case '%':	/* comment */
516
+				p = pdf_nextlinestart(ptr, len);
517
+				if(p == NULL)
518
+					return NULL;
519
+				len -= (size_t)(p - ptr);
520
+				ptr = p;
521
+				inobject = 0;
522
+				break;
523
+
488 524
 			case ' ':
489 525
 			case '\t':
490
-				intoken = 0;
526
+			case '\v':
527
+			case '\f':
528
+				inobject = 0;
491 529
 				ptr++;
492 530
 				len--;
493 531
 				break;
494 532
 			default:
495
-				if(!intoken)
533
+				if(!inobject)
534
+					/* TODO: parse and return object type */
496 535
 					return ptr;
497 536
 				ptr++;
498 537
 				len--;