Browse code

On-the-fly (otf) PDF scanning

git-svn: trunk@3620

aCaB authored on 2008/02/12 09:58:49
Showing 4 changed files
... ...
@@ -1,3 +1,7 @@
1
+Tue Feb 12 01:39:03 CET 2008 (acab)
2
+-----------------------------------
3
+  * libclamav/pdf: on the fly scanning of attachments
4
+
1 5
 Mon Feb 11 23:27:47 EET 2008 (edwin)
2 6
 ------------------------------------
3 7
   * libclamav/scanners.c, htmlnorm.c: tagless version of HTML file (bb #162)
... ...
@@ -72,7 +72,7 @@ static	const	char	*cli_pmemstr(const char *haystack, size_t hs, const char *need
72 72
  * TODO: handle embedded URLs if (options&CL_SCAN_MAILURL)
73 73
  */
74 74
 int
75
-cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
75
+cli_pdf(const char *dir, int desc, cli_ctx *ctx)
76 76
 {
77 77
 	off_t size;	/* total number of bytes in the file */
78 78
 	off_t bytesleft, trailerlength;
... ...
@@ -81,7 +81,7 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
81 81
 	const char *xrefstart;	/* cross reference table */
82 82
 	/*size_t xreflength;*/
83 83
 	table_t *md5table;
84
-	int printed_predictor_message, printed_embedded_font_message, rc, ret;
84
+	int printed_predictor_message, printed_embedded_font_message, ret, rc;
85 85
 	unsigned int files;
86 86
 	struct stat statb;
87 87
 
... ...
@@ -194,16 +194,17 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
194 194
 	/*
195 195
 	 * The body section consists of a sequence of indirect objects
196 196
 	 */
197
-	while((p < xrefstart) && ((rc=cli_checklimits("cli_pdf", ctx, 0, 0, 0))==CL_CLEAN) &&
197
+	while((p < xrefstart) && (cli_checklimits("cli_pdf", ctx, 0, 0, 0)==CL_CLEAN) &&
198 198
 	      ((q = pdf_nextobject(p, bytesleft)) != NULL)) {
199 199
 		int is_ascii85decode, is_flatedecode, fout, len, has_cr;
200 200
 		/*int object_number, generation_number;*/
201 201
 		const char *objstart, *objend, *streamstart, *streamend;
202
-		char *md5digest;
202
+		unsigned char *md5digest;
203 203
 		unsigned long length, objlen, real_streamlen, calculated_streamlen;
204 204
 		int is_embedded_font, predictor;
205 205
 		char fullname[NAME_MAX + 1];
206 206
 
207
+		rc=CL_CLEAN;
207 208
 		if(q == xrefstart)
208 209
 			break;
209 210
 		if(memcmp(q, "xref", 4) == 0)
... ...
@@ -217,13 +218,11 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
217 217
 			continue;
218 218
 		if(!isdigit(*q)) {
219 219
 			cli_dbgmsg("cli_pdf: Object number missing\n");
220
-			rc = CL_CLEAN;
221 220
 			break;
222 221
 		}
223 222
 		q = pdf_nextobject(p, bytesleft);
224 223
 		if((q == NULL) || !isdigit(*q)) {
225 224
 			cli_dbgmsg("cli_pdf: Generation number missing\n");
226
-			rc = CL_CLEAN;
227 225
 			break;
228 226
 		}
229 227
 		/*generation_number = atoi(q);*/
... ...
@@ -233,7 +232,6 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
233 233
 		q = pdf_nextobject(p, bytesleft);
234 234
 		if((q == NULL) || (memcmp(q, "obj", 3) != 0)) {
235 235
 			cli_dbgmsg("cli_pdf: Indirect object missing \"obj\"\n");
236
-			rc = CL_CLEAN;
237 236
 			break;
238 237
 		}
239 238
 
... ...
@@ -430,7 +428,7 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
430 430
 
431 431
 		if(is_ascii85decode) {
432 432
 			unsigned char *tmpbuf;
433
-			int ret = cli_checklimits("cli_pdf", ctx, calculated_streamlen * 5, calculated_streamlen, 0);
433
+			int ret = cli_checklimits("cli_pdf", ctx, calculated_streamlen * 5, calculated_streamlen, real_streamlen);
434 434
 
435 435
 			if(ret != CL_CLEAN) {
436 436
 				close(fout);
... ...
@@ -475,7 +473,7 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
475 475
 				if(is_flatedecode)
476 476
 					rc = try_flatedecode((unsigned char *)tmpbuf, real_streamlen, real_streamlen, fout, ctx);
477 477
 				else
478
-					cli_writen(fout, (const char *)streamstart, real_streamlen);
478
+				       	rc = cli_writen(fout, (const char *)streamstart, real_streamlen)==real_streamlen ? CL_CLEAN : CL_EIO;
479 479
 			}
480 480
 			free(tmpbuf);
481 481
 		} else if(is_flatedecode) {
... ...
@@ -484,19 +482,33 @@ cli_pdf(const char *dir, int desc, const cli_ctx *ctx)
484 484
 		} else {
485 485
 			cli_dbgmsg("cli_pdf: writing %lu bytes from the stream\n",
486 486
 				(unsigned long)real_streamlen);
487
-			cli_writen(fout, (const char *)streamstart, real_streamlen);
487
+			if((rc = cli_checklimits("cli_pdf", ctx, real_streamlen, 0, 0))==CL_CLEAN)
488
+				rc = cli_writen(fout, (const char *)streamstart, real_streamlen) == real_streamlen ? CL_CLEAN : CL_EIO;
488 489
 		}
489 490
 
491
+		if (rc == CL_CLEAN) {
492
+			cli_dbgmsg("cli_pdf: extracted file %u to %s\n", ++files, fullname);
493
+	
494
+			lseek(fout, 0, SEEK_SET);
495
+			md5digest = cli_md5digest(fout);
496
+
497
+			if(tableFind(md5table, md5digest) >= 0) {
498
+				cli_dbgmsg("cli_pdf: not scanning duplicate embedded file '%s'\n", fullname);
499
+				free(md5digest);
500
+				close(fout);
501
+				unlink(fullname);
502
+				continue;
503
+			} else
504
+				tableInsert(md5table, md5digest, 1);
505
+
506
+			free(md5digest);
507
+
508
+			lseek(fout, 0, SEEK_SET);
509
+			rc = cli_magic_scandesc(fout, ctx);
510
+		}
490 511
 		close(fout);
491
-		md5digest = cli_md5file(fullname);
492
-		if(tableFind(md5table, md5digest) >= 0) {
493
-			cli_dbgmsg("cli_pdf: not scanning duplicate embedded file '%s'\n", fullname);
494
-			unlink(fullname);
495
-		} else
496
-			tableInsert(md5table, md5digest, 1);
497
-		free(md5digest);
498
-		cli_dbgmsg("cli_pdf: extracted file %u to %s\n", ++files,
499
-			fullname);
512
+		if(!cli_leavetemps_flag) unlink(fullname);
513
+		if(rc != CL_CLEAN) break;
500 514
 	}
501 515
 
502 516
 	munmap(buf, size);
... ...
@@ -516,7 +528,7 @@ try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fo
516 516
 	int ret = cli_checklimits("cli_pdf", ctx, real_len, 0, 0);
517 517
 
518 518
 	if (ret==CL_CLEAN && flatedecode(buf, real_len, fout, ctx) == CL_SUCCESS)
519
-		return CL_SUCCESS;
519
+		return CL_CLEAN;
520 520
 
521 521
 	if(real_len == calculated_len) {
522 522
 		/*
... ...
@@ -530,8 +542,8 @@ try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fo
530 530
 		return CL_CLEAN;
531 531
 
532 532
 	ret = flatedecode(buf, calculated_len, fout, ctx);
533
-	if(ret == CL_SUCCESS)
534
-		return CL_SUCCESS;
533
+	if(ret == CL_CLEAN)
534
+		return CL_CLEAN;
535 535
 
536 536
 	/* i.e. the PDF file is broken :-( */
537 537
 	cli_dbgmsg("cli_pdf: Bad compressed block length in flate stream\n");
... ...
@@ -20,6 +20,6 @@
20 20
 #ifndef __PDF_H
21 21
 #define __PDF_H
22 22
 
23
-int cli_pdf(const char *dir, int desc, const cli_ctx *ctx);
23
+int cli_pdf(const char *dir, int desc, cli_ctx *ctx);
24 24
 
25 25
 #endif
... ...
@@ -1401,9 +1401,6 @@ static int cli_scanpdf(int desc, cli_ctx *ctx)
1401 1401
 
1402 1402
     ret = cli_pdf(dir, desc, ctx);
1403 1403
 
1404
-    if(ret == CL_CLEAN)
1405
-	ret = cli_scandir(dir, ctx, 0);
1406
-
1407 1404
     if(!cli_leavetemps_flag)
1408 1405
 	cli_rmdirs(dir);
1409 1406
 
... ...
@@ -1887,7 +1884,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1887 1887
 		ret = cli_scanjpeg(desc, ctx->virname);
1888 1888
 	    break;
1889 1889
 
1890
-	case CL_TYPE_PDF:
1890
+        case CL_TYPE_PDF: /* FIXMELIMITS: pdf should be an archive! */
1891 1891
 	    if(SCAN_PDF && (DCONF_DOC & DOC_CONF_PDF))
1892 1892
 		ret = cli_scanpdf(desc, ctx);
1893 1893
 	    break;