Browse code

Try with both real and calculated Length fields

git-svn: trunk@2882

Nigel Horne authored on 2007/03/01 20:06:37
Showing 2 changed files
... ...
@@ -1,3 +1,10 @@
1
+Thu Mar  1 09:10:04 GMT 2007 (njh)
2
+----------------------------------
3
+  * libclamav/pdf.c:	Try with both real and calculated Length fields, since
4
+				the Length object can't always be trusted
5
+			Improved backing out of unhandled formats (e.g.
6
+				Predictor for images and embedded fonts)
7
+
1 8
 Thu Mar  1 02:36:40 CET 2007 (tk)
2 9
 ---------------------------------
3 10
   * libclamav/unrar/unrar.c: improve handling of multi-volume archives: do not
... ...
@@ -50,7 +57,7 @@ Sun Feb 25 20:50:54 CET 2007 (tk)
50 50
 Sun Feb 25 17:00:31 CET 2007 (acab)
51 51
 -----------------------------------
52 52
   * libclamav/pe.c: fix leaks on upack return (bb#351)
53
-  
53
+
54 54
 Sun Feb 25 14:40:10 CET 2007 (tk)
55 55
 ---------------------------------
56 56
   * libclamav/unzip.c: fix memory leak when extracting stored files
... ...
@@ -1,5 +1,5 @@
1 1
 /*
2
- *  Copyright (C) 2005 Nigel Horne <njh@bandsman.co.uk>
2
+ *  Copyright (C) 2005-2007 Nigel Horne <njh@bandsman.co.uk>
3 3
  *
4 4
  *  This program is free software; you can redistribute it and/or modify
5 5
  *  it under the terms of the GNU General Public License as published by
... ...
@@ -14,6 +14,9 @@
14 14
  *  You should have received a copy of the GNU General Public License
15 15
  *  along with this program; if not, write to the Free Software
16 16
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ *
18
+ * TODO: Embedded fonts
19
+ * TODO: Predictor image handling
17 20
  */
18 21
 static	char	const	rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
19 22
 
... ...
@@ -51,6 +54,11 @@ static	char	const	rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
51 51
 #include "mbox.h"
52 52
 #include "pdf.h"
53 53
 
54
+#ifdef	CL_DEBUG
55
+/*#define	SAVE_TMP	/* Save the file being worked on in tmp */
56
+#endif
57
+
58
+static	int	try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, const cli_ctx *ctx);
54 59
 static	int	flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx);
55 60
 static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
56 61
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
... ...
@@ -72,6 +80,8 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
72 72
 	/*size_t xreflength;*/
73 73
 	int rc = CL_CLEAN;
74 74
 	struct table *md5table;
75
+	int printed_predictor_message;
76
+	int printed_embedded_font_message;
75 77
 
76 78
 	cli_dbgmsg("in cli_pdf(%s)\n", dir);
77 79
 
... ...
@@ -162,6 +172,8 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
162 162
 		return CL_EFORMAT;
163 163
 	}
164 164
 
165
+	printed_predictor_message = printed_embedded_font_message = 0;
166
+
165 167
 	md5table = tableCreate();
166 168
 	/*
167 169
 	 * not true, since edits may put data after the trailer
... ...
@@ -179,7 +191,8 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
179 179
 		/*int object_number, generation_number;*/
180 180
 		const char *objstart, *objend, *streamstart, *streamend;
181 181
 		char *md5digest;
182
-		size_t length, objlen, streamlen;
182
+		size_t length, objlen, real_streamlen, calculated_streamlen;
183
+		int is_embedded_font, predictor;
183 184
 		char fullname[NAME_MAX + 1];
184 185
 
185 186
 		if(q == xrefstart)
... ...
@@ -231,7 +244,10 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
231 231
 		if(streamstart == NULL)
232 232
 			continue;
233 233
 
234
-		length = is_ascii85decode = is_flatedecode = 0;
234
+		is_embedded_font = length = is_ascii85decode =
235
+			is_flatedecode = 0;
236
+		predictor = 1;
237
+
235 238
 		/*
236 239
 		 * TODO: handle F and FFilter?
237 240
 		 */
... ...
@@ -245,6 +261,14 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
245 245
 					while(isdigit(*q))
246 246
 						q++;
247 247
 					q--;
248
+				} else if(strncmp(q, "Length2 ", 8) == 0)
249
+					is_embedded_font = 1;
250
+				else if(strncmp(q, "Predictor ", 10) == 0) {
251
+					q += 10;
252
+					predictor = atoi(q);
253
+					while(isdigit(*q))
254
+						q++;
255
+					q--;
248 256
 				} else if(strncmp(q, "FlateDecode", 11) == 0) {
249 257
 					is_flatedecode = 1;
250 258
 					q += 11;
... ...
@@ -258,6 +282,31 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
258 258
 				break;
259 259
 		}
260 260
 
261
+		if(is_embedded_font) {
262
+			/*
263
+			 * Need some documentation, the only one I can find a
264
+			 * reference to is not free, if some kind soul wishes
265
+			 * to donate a copy, please contact me!
266
+			 * (http://safari.adobepress.com/0321304748)
267
+			 */
268
+			if(!printed_embedded_font_message) {
269
+				cli_dbgmsg("Embedded fonts not yet supported\n");
270
+				printed_embedded_font_message = 1;
271
+			}
272
+			continue;
273
+		}
274
+		if(predictor > 1) {
275
+			/*
276
+			 * Needs some thought
277
+			 */
278
+			if(!printed_predictor_message) {
279
+				cli_dbgmsg("Predictor %d not honoured for embedded image\n",
280
+					predictor);
281
+				printed_predictor_message = 1;
282
+			}
283
+			continue;
284
+		}
285
+
261 286
 		/* objend points to the end of the object (start of "endobj") */
262 287
 		streamstart += 6;	/* go past the word "stream" */
263 288
 		len = (int)(objend - streamstart);
... ...
@@ -274,16 +323,6 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
274 274
 				break;
275 275
 			}
276 276
 		}
277
-		/*while(strchr("\r\n", *--streamend))
278
-			;*/
279
-
280
-		streamlen = (int)(streamend - streamstart) + 1;
281
-
282
-		if(streamlen == 0) {
283
-			cli_dbgmsg("Empty stream\n");
284
-			continue;
285
-		}
286
-
287 277
 		snprintf(fullname, sizeof(fullname), "%s/pdfXXXXXX", dir);
288 278
 #if	defined(C_LINUX) || defined(C_BSD) || defined(HAVE_MKSTEMP) || defined(C_SOLARIS) || defined(C_CYGWIN)
289 279
 		fout = mkstemp(fullname);
... ...
@@ -312,8 +351,26 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
312 312
 			break;
313 313
 		}
314 314
 
315
-		cli_dbgmsg("length %d, streamlen %d isFlate %d isASCII85 %d\n",
316
-			length, streamlen, is_flatedecode, is_ascii85decode);
315
+		/*
316
+		 * Calculate the length ourselves, the Length parameter is often
317
+		 * wrong
318
+		 */
319
+		while(strchr("\r\n", *--streamend))
320
+			;
321
+
322
+		if(streamend <= streamstart) {
323
+			cli_dbgmsg("Empty stream\n");
324
+			continue;
325
+		}
326
+		calculated_streamlen = (int)(streamend - streamstart) + 1;
327
+		real_streamlen = length;
328
+
329
+		if(calculated_streamlen != real_streamlen)
330
+			cli_dbgmsg("cli_pdf: Incorrect Length field in file attempting to recover\n");
331
+
332
+		cli_dbgmsg("length %d, calculated_streamlen %d isFlate %d isASCII85 %d\n",
333
+			length, calculated_streamlen,
334
+			is_flatedecode, is_ascii85decode);
317 335
 
318 336
 #if	0
319 337
 		/* FIXME: this isn't right... */
... ...
@@ -323,7 +380,7 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
323 323
 #endif
324 324
 
325 325
 		if(is_ascii85decode) {
326
-			unsigned char *tmpbuf = cli_malloc(streamlen * 5);
326
+			unsigned char *tmpbuf = cli_malloc(calculated_streamlen * 5);
327 327
 			int ret;
328 328
 
329 329
 			if(tmpbuf == NULL) {
... ...
@@ -333,7 +390,7 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
333 333
 				continue;
334 334
 			}
335 335
 
336
-			ret = ascii85decode(streamstart, streamlen, tmpbuf);
336
+			ret = ascii85decode(streamstart, calculated_streamlen, tmpbuf);
337 337
 
338 338
 			if(ret == -1) {
339 339
 				free(tmpbuf);
... ...
@@ -343,31 +400,32 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
343 343
 				continue;
344 344
 			}
345 345
 			if(ret) {
346
-				streamlen = (size_t)ret;
346
+				real_streamlen = (size_t)ret;
347 347
 				/* free unused trailing bytes */
348
-				tmpbuf = cli_realloc(tmpbuf, streamlen);
348
+				tmpbuf = cli_realloc(tmpbuf,
349
+					calculated_streamlen);
349 350
 				/*
350 351
 				 * Note that it will probably be both
351 352
 				 * ascii85encoded and flateencoded
352 353
 				 */
353 354
 				if(is_flatedecode) {
354
-					const int zstat = flatedecode((unsigned char *)tmpbuf, streamlen, fout, ctx);
355
+					const int zstat = try_flatedecode((unsigned char *)tmpbuf, real_streamlen, real_streamlen, fout, ctx);
355 356
 
356 357
 					if(zstat != Z_OK)
357 358
 						rc = CL_EZIP;
358 359
 				} else
359
-					cli_writen(fout, (const char *)streamstart, streamlen);
360
+					cli_writen(fout, (const char *)streamstart, real_streamlen);
360 361
 			}
361 362
 			free(tmpbuf);
362 363
 		} else if(is_flatedecode) {
363
-			const int zstat = flatedecode((unsigned char *)streamstart, streamlen, fout, ctx);
364
+			const int zstat = try_flatedecode((unsigned char *)streamstart, real_streamlen, calculated_streamlen, fout, ctx);
364 365
 
365 366
 			if(zstat != Z_OK)
366 367
 				rc = CL_EZIP;
367 368
 		} else {
368 369
 			cli_dbgmsg("cli_pdf: writing %lu bytes from the stream\n",
369
-				(unsigned long)streamlen);
370
-			cli_writen(fout, (const char *)streamstart, streamlen);
370
+				(unsigned long)real_streamlen);
371
+			cli_writen(fout, (const char *)streamstart, real_streamlen);
371 372
 		}
372 373
 
373 374
 		close(fout);
... ...
@@ -391,15 +449,56 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
391 391
 
392 392
 /* flate inflation - returns zlib status, e.g. Z_OK */
393 393
 static int
394
+try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, const cli_ctx *ctx)
395
+{
396
+	int ret = flatedecode(buf, real_len, fout, ctx);
397
+
398
+	if(ret == Z_OK)
399
+		return Z_OK;
400
+
401
+	if(real_len == calculated_len)
402
+		return ret;
403
+
404
+	return flatedecode(buf, calculated_len, fout, ctx);
405
+}
406
+
407
+static int
394 408
 flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx)
395 409
 {
396 410
 	int zstat;
397 411
 	off_t nbytes;
398 412
 	z_stream stream;
399 413
 	unsigned char output[BUFSIZ];
414
+#ifdef	SAVE_TMP
415
+	char tmpfilename[16];
416
+	int tmpfd;
417
+#endif
400 418
 
401 419
 	cli_dbgmsg("cli_pdf: flatedecode %lu bytes\n", (unsigned long)len);
402 420
 
421
+#ifdef	SAVE_TMP
422
+	/*
423
+	 * Copy the embedded area for debugging, so that if it falls over
424
+	 * we have a copy of the offending data. This is debugging code
425
+	 * that you shouldn't of course install in a live environment. I am
426
+	 * not interested in hearing about security issues with this section
427
+	 * of the parser.
428
+	 */
429
+	strcpy(tmpfilename, "/tmp/pdfXXXXXX");
430
+	tmpfd = mkstemp(tmpfilename);
431
+	if(tmpfd < 0) {
432
+		perror(tmpfilename);
433
+		cli_errmsg("Can't make debugging file\n");
434
+	} else {
435
+		FILE *tmpfp = fdopen(tmpfd, "w");
436
+
437
+		if(tmpfp) {
438
+			fwrite(buf, sizeof(char), len, tmpfp);
439
+			fclose(tmpfp);
440
+		} else
441
+			cli_errmsg("cli_pdf: can't fdopen debugging file\n");
442
+	}
443
+#endif
403 444
 	stream.zalloc = (alloc_func)Z_NULL;
404 445
 	stream.zfree = (free_func)Z_NULL;
405 446
 	stream.opaque = (void *)NULL;
... ...
@@ -441,11 +540,11 @@ flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx)
441 441
 				break;
442 442
 			default:
443 443
 				if(stream.msg)
444
-					cli_warnmsg("pdf: after writing %lu bytes, got error \"%s\" inflating PDF attachment\n",
444
+					cli_dbgmsg("pdf: after writing %lu bytes, got error \"%s\" inflating PDF attachment\n",
445 445
 						(unsigned long)nbytes,
446 446
 						stream.msg);
447 447
 				else
448
-					cli_warnmsg("pdf: after writing %lu bytes, got error %d inflating PDF attachment\n",
448
+					cli_dbgmsg("pdf: after writing %lu bytes, got error %d inflating PDF attachment\n",
449 449
 						(unsigned long)nbytes, zstat);
450 450
 				inflateEnd(&stream);
451 451
 				return zstat;
... ...
@@ -454,7 +553,8 @@ flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx)
454 454
 	}
455 455
 
456 456
 	if(stream.avail_out != sizeof(output))
457
-		(void)cli_writen(fout, output, sizeof(output) - stream.avail_out);
457
+		if(cli_writen(fout, output, sizeof(output) - stream.avail_out) < 0)
458
+			return Z_STREAM_ERROR;
458 459
 
459 460
 	cli_dbgmsg("cli_pdf: flatedecode in=%lu out=%lu ratio %ld (max %d)\n",
460 461
 		stream.total_in, stream.total_out,
... ...
@@ -471,6 +571,9 @@ flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx)
471 471
 		return Z_DATA_ERROR;
472 472
 	}
473 473
 
474
+#ifdef	SAVE_TMP
475
+	unlink(tmpfilename);
476
+#endif
474 477
 	return inflateEnd(&stream);
475 478
 }
476 479
 
... ...
@@ -598,10 +701,13 @@ pdf_nextobject(const char *ptr, size_t len)
598 598
 			case '[':	/* Start of an array object */
599 599
 			case '\v':
600 600
 			case '\f':
601
+			case '<':	/* Start of a dictionary object */
601 602
 				inobject = 0;
602 603
 				ptr++;
603 604
 				len--;
604 605
 				break;
606
+			case '/':	/* Start of a name object */
607
+				return ptr;
605 608
 			default:
606 609
 				if(!inobject)
607 610
 					/* TODO: parse and return object type */