Browse code

JavaScript extraction.

Török Edvin authored on 2010/05/11 17:26:35
Showing 1 changed file
... ...
@@ -71,6 +71,7 @@ enum pdf_flag {
71 71
     BAD_ASCIIDECODE,
72 72
     UNTERMINATED_OBJ_DICT,
73 73
     ESCAPED_COMMON_PDFNAME,
74
+    HEX_JAVASCRIPT
74 75
 };
75 76
 
76 77
 static int xrefCheck(const char *xref, const char *eof)
... ...
@@ -138,7 +139,7 @@ static const char *findNextNonWSBack(const char *q, const char *start)
138 138
     return q;
139 139
 }
140 140
 
141
-static int find_stream_bounds(const char *start, off_t bytesleft, off_t *stream, off_t *endstream)
141
+static int find_stream_bounds(const char *start, off_t bytesleft, off_t bytesleft2, off_t *stream, off_t *endstream)
142 142
 {
143 143
     const char *q2, *q;
144 144
     if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) {
... ...
@@ -148,11 +149,11 @@ static int find_stream_bounds(const char *start, off_t bytesleft, off_t *stream,
148 148
 	if (q2[0] == '\xa')
149 149
 	    q2++;
150 150
 	*stream = q2 - start;
151
-	bytesleft -= q2 - start;
151
+	bytesleft2 -= q2 - start;
152 152
 	q = q2;
153
-	q2 = cli_memstr(q, bytesleft, "endstream", 9);
153
+	q2 = cli_memstr(q, bytesleft2, "endstream", 9);
154 154
 	if (!q2)
155
-	    return;/* no more objs */
155
+	    return 0;/* no more objs */
156 156
 	*endstream = q2 - start;
157 157
 	return 1;
158 158
     }
... ...
@@ -173,6 +174,7 @@ static int pdf_findobj(struct pdf_struct *pdf)
173 173
 	return -1;
174 174
     }
175 175
     obj = &pdf->objs[pdf->nobjs-1];
176
+    memset(obj, 0, sizeof(*obj));
176 177
     start = pdf->map+pdf->offset;
177 178
     bytesleft = pdf->size - pdf->offset;
178 179
     q2 = cli_memstr(start, bytesleft, " obj", 4);
... ...
@@ -197,7 +199,7 @@ static int pdf_findobj(struct pdf_struct *pdf)
197 197
 	if (!q2)
198 198
 	    return 0;/* no more objs */
199 199
 	bytesleft -= q2 - q;
200
-	if (find_stream_bounds(q-1, q2-q+1, &p_stream, &p_endstream)) {
200
+	if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream)) {
201 201
 	    obj->flags |= 1 << OBJ_STREAM;
202 202
 	    q2 = q-1 + p_endstream + 6;
203 203
 	    bytesleft -= q2 - q + 1;
... ...
@@ -349,30 +351,52 @@ static int find_length(struct pdf_struct *pdf,
349 349
     return length;
350 350
 }
351 351
 
352
+#define DUMP_MASK ((1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT))
353
+
354
+static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj)
355
+{
356
+    int i = obj - pdf->objs;
357
+    i++;
358
+    if (i < pdf->nobjs) {
359
+	return pdf->objs[i].start - obj->start - 4;
360
+    }
361
+    return pdf->size - obj->start;
362
+}
363
+
352 364
 static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
353 365
 {
366
+    char fullname[NAME_MAX + 1];
367
+    int fout;
368
+    off_t sum = 0;
354 369
     int rc = CL_SUCCESS;
355
-    if (obj->flags | (1 << OBJ_STREAM)) {
370
+    char *ascii_decoded = NULL;
371
+
372
+    if (!(obj->flags & DUMP_MASK)) {
373
+	/* don't dump all streams */
374
+	return CL_CLEAN;
375
+    }
376
+    snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
377
+    fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
378
+    if (fout < 0) {
379
+	char err[128];
380
+	cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
381
+	free(ascii_decoded);
382
+	return CL_ETMPFILE;
383
+    }
384
+
385
+    do {
386
+    if (obj->flags & (1 << OBJ_STREAM)) {
356 387
 	const char *start = pdf->map + obj->start;
357 388
 	off_t p_stream = 0, p_endstream = 0;
358 389
 	off_t length;
359 390
 	find_stream_bounds(start, pdf->size - obj->start,
391
+			   pdf->size - obj->start,
360 392
 			   &p_stream, &p_endstream);
361 393
 	if (p_stream && p_endstream) {
362
-	    char fullname[NAME_MAX + 1];
363
-	    int fout;
364
-	    off_t sum = 0;
365 394
 	    const char *flate_in;
366
-	    char *ascii_decoded = NULL;
367 395
 	    long ascii_decoded_size = 0;
368 396
 	    size_t size = p_endstream - p_stream;
369 397
 
370
-	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) &&
371
-		!(obj->flags & (1 << OBJ_EMBEDDED_FILE)) &&
372
-		!ascii_decoded) {
373
-		/* only dump encoded streams */
374
-		return CL_CLEAN;
375
-	    }
376 398
 	    length = find_length(pdf, obj, start, p_stream);
377 399
 	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && !length) {
378 400
 		const char *q = start + p_endstream;
... ...
@@ -395,7 +419,8 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
395 395
 		ascii_decoded = cli_malloc(size/2 + 1);
396 396
 		if (!ascii_decoded) {
397 397
 		    cli_errmsg("Cannot allocate memory for asciidecode\n");
398
-		    return CL_EMEM;
398
+		    rc = CL_EMEM;
399
+		    break;
399 400
 		}
400 401
 		ascii_decoded_size = asciihexdecode(start + p_stream,
401 402
 						    length,
... ...
@@ -404,7 +429,8 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
404 404
 		ascii_decoded = cli_malloc(size*5);
405 405
 		if (!ascii_decoded) {
406 406
 		    cli_errmsg("Cannot allocate memory for asciidecode\n");
407
-		    return CL_EMEM;
407
+		    rc = CL_EMEM;
408
+		    break;
408 409
 		}
409 410
 		ascii_decoded_size = ascii85decode(start+p_stream,
410 411
 						   length,
... ...
@@ -413,40 +439,71 @@ static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
413 413
 	    if (ascii_decoded_size < 0) {
414 414
 		pdf->flags |= 1 << BAD_ASCIIDECODE;
415 415
 		cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff);
416
-		free(ascii_decoded);
417
-		return CL_SUCCESS;
416
+		rc = CL_CLEAN;
417
+		break;
418 418
 	    }
419 419
 	    /* either direct or ascii-decoded input */
420 420
 	    if (!ascii_decoded)
421 421
 		ascii_decoded_size = length;
422 422
 	    flate_in = ascii_decoded ? ascii_decoded : start+p_stream;
423 423
 
424
-	    snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
425
-	    fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
426
-	    if (fout < 0) {
427
-		char err[128];
428
-		cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
429
-		free(ascii_decoded);
430
-		return CL_ETMPFILE;
431
-	    }
432 424
 	    if (obj->flags & (1 << OBJ_FILTER_FLATE)) {
433 425
 		rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum);
434 426
 	    } else {
435
-		rc = filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum);
427
+		if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size)
428
+		    rc = CL_EWRITE;
436 429
 	    }
437 430
 	    cli_updatelimits(pdf->ctx, sum);
438
-	    /* invoke bytecode on this pdf obj with metainformation associated
431
+	    /* TODO: invoke bytecode on this pdf obj with metainformation associated
439 432
 	     * */
440 433
 	    cli_dbgmsg("cli_pdf: extracted %ld bytes %u %u obj to %s\n", sum, obj->id>>8, obj->id&0xff, fullname);
441 434
 	    lseek(fout, 0, SEEK_SET);
442 435
 	    rc = cli_magic_scandesc(fout, pdf->ctx);
443
-	    close(fout);
444
-	    free(ascii_decoded);
445
-	    if (!pdf->ctx->engine->keeptmp)
446
-		if (cli_unlink(fullname) && rc != CL_VIRUS)
447
-		    rc = CL_EUNLINK;
436
+	}
437
+    } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
438
+	const char *q2;
439
+	const char *q = pdf->map+obj->start;
440
+	/* TODO: get obj-endobj size */
441
+	off_t bytesleft = obj_size(pdf, obj);
442
+
443
+	q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
444
+	if (!q2)
445
+	    break;
446
+	q2++;
447
+	bytesleft -= q2 - q;
448
+	q = pdf_nextobject(q2, bytesleft);
449
+	if (!q)
450
+	    break;
451
+	bytesleft -= q - q2;
452
+	if (*q == '(') {
453
+	    if (filter_writen(pdf, obj, fout, q+1, bytesleft-1, &sum) != (bytesleft-1)) {
454
+		rc = CL_EWRITE;
455
+		break;
456
+	    }
457
+	} else if (*q == '<') {
458
+	    char *decoded;
459
+	    q2 = memchr(q+1, '>', bytesleft);
460
+	    if (!q2) q2 = q + bytesleft;
461
+	    decoded = cli_malloc(q2 - q);
462
+	    if (!decoded) {
463
+		rc = CL_EMEM;
464
+		break;
465
+	    }
466
+	    cli_hex2str_to(q2, decoded, q2-q-1);
467
+	    decoded[q2-q-1] = '\0';
468
+	    cli_dbgmsg("cli_pdf: found hexadecimal encoded javascript in %u %u obj\n",
469
+		       obj->id>>8, obj->id&0xff);
470
+	    pdf->flags |= 1 << HEX_JAVASCRIPT;
471
+	    filter_writen(pdf, obj, fout, decoded, q2-q-1, &sum);
472
+	    free(decoded);
448 473
 	}
449 474
     }
475
+    } while (0);
476
+    close(fout);
477
+    free(ascii_decoded);
478
+    if (!pdf->ctx->engine->keeptmp)
479
+	if (cli_unlink(fullname) && rc != CL_VIRUS)
480
+	    rc = CL_EUNLINK;
450 481
     return rc;
451 482
 }
452 483
 
... ...
@@ -621,6 +678,27 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
621 621
 	pdfname[i] = '\0';
622 622
 
623 623
 	handle_pdfname(pdf, obj, pdfname, escapes, q, &objstate);
624
+	if (objstate == STATE_JAVASCRIPT) {
625
+	    const char *q2;
626
+	    q2 = pdf_nextobject(q, dict_length);
627
+	    if (q2 && isdigit(*q2)) {
628
+		uint32_t objid = atoi(q2) << 8;
629
+		while (isdigit(*q2)) q2++;
630
+		q2 = pdf_nextobject(q2, dict_length);
631
+		if (q2 && isdigit(*q2)) {
632
+		    objid |= atoi(q2) & 0xff;
633
+		    q2 = pdf_nextobject(q2, dict_length);
634
+		    if (*q2 == 'R') {
635
+			struct pdf_obj *obj2;
636
+			cli_dbgmsg("cli_pdf: found javascript stored in indirect object %u %u",
637
+				   objid >> 8, objid&0xff);
638
+			obj2 = find_obj(pdf, obj, objid);
639
+			obj2->flags |= OBJ_JAVASCRIPT;
640
+		    }
641
+		}
642
+	    }
643
+	    objstate = STATE_NONE;
644
+	}
624 645
     }
625 646
     cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
626 647
 }
... ...
@@ -732,8 +810,14 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
732 732
 	    break;
733 733
     }
734 734
 
735
-    if (pdf.flags)
735
+    if (pdf.flags) {
736 736
 	cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
737
+	if (pdf.flags & ESCAPED_COMMON_PDFNAME) {
738
+	    /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
739
+	    *ctx->virname = "Heuristics.PDF.ObfuscatedNameObject";
740
+	    rc = CL_VIRUS;
741
+	}
742
+    }
737 743
     cli_dbgmsg("cli_pdf: returning %d\n", rc);
738 744
     free(pdf.objs);
739 745
     return rc;