Browse code

dump&scan pdf objs

Török Edvin authored on 2010/05/11 16:37:10
Showing 1 changed files
... ...
@@ -57,6 +57,7 @@ static	char	const	rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $";
57 57
 static	int	try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, cli_ctx *ctx);
58 58
 static	int	flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx);
59 59
 static	int	ascii85decode(const char *buf, off_t len, unsigned char *output);
60
+static	int	asciihexdecode(const char *buf, off_t len, unsigned char *output);
60 61
 static	const	char	*pdf_nextlinestart(const char *ptr, size_t len);
61 62
 static	const	char	*pdf_nextobject(const char *ptr, size_t len);
62 63
 
... ...
@@ -66,6 +67,8 @@ enum pdf_flag {
66 66
     BAD_PDF_TRAILER,
67 67
     BAD_PDF_TOOMANYOBJS,
68 68
     BAD_STREAM_FILTERS,
69
+    BAD_FLATE,
70
+    BAD_ASCIIDECODE,
69 71
     UNTERMINATED_OBJ_DICT,
70 72
     ESCAPED_COMMON_PDFNAME,
71 73
 };
... ...
@@ -94,6 +97,7 @@ static int xrefCheck(const char *xref, const char *eof)
94 94
 enum objflags {
95 95
     OBJ_STREAM=0,
96 96
     OBJ_DICT,
97
+    OBJ_EMBEDDED_FILE,
97 98
     OBJ_FILTER_AH,
98 99
     OBJ_FILTER_A85,
99 100
     OBJ_FILTER_FLATE,
... ...
@@ -119,6 +123,9 @@ struct pdf_struct {
119 119
     off_t size;
120 120
     off_t offset;
121 121
     unsigned flags;
122
+    cli_ctx *ctx;
123
+    const char *dir;
124
+    unsigned files;
122 125
 };
123 126
 
124 127
 static const char *findNextNonWSBack(const char *q, const char *start)
... ...
@@ -131,9 +138,30 @@ static const char *findNextNonWSBack(const char *q, const char *start)
131 131
     return q;
132 132
 }
133 133
 
134
/*
 * Locate the payload of the first PDF stream inside [start, start+bytesleft).
 *
 * start     - beginning of the region to search
 * bytesleft - number of readable bytes at start
 * stream    - out: offset (relative to start) of the first payload byte, i.e.
 *             just past the "stream" keyword and the end-of-line after it
 * endstream - out: offset (relative to start) of the "endstream" keyword
 *
 * Returns 1 when both keywords were found, 0 otherwise.  When "stream" is
 * found but "endstream" is not, *stream has already been written while
 * *endstream is left untouched; callers that pre-initialise both to 0 can
 * therefore still detect the incomplete case.
 */
static int find_stream_bounds(const char *start, off_t bytesleft, off_t *stream, off_t *endstream)
{
    const char *data, *end;

    data = cli_memstr(start, bytesleft, "stream", 6);
    if (!data)
        return 0;

    data += 6;
    bytesleft -= data - start;

    /* Skip the end-of-line that follows the keyword, but never look at
     * bytes beyond the region we were given. */
    if (bytesleft >= 2 && data[0] == '\xd' && data[1] == '\xa') {
        data += 2;
        bytesleft -= 2;
    }
    if (bytesleft >= 1 && data[0] == '\xa') {
        data++;
        bytesleft--;
    }
    *stream = data - start;

    /* Not enough room left for the terminating keyword. */
    if (bytesleft < 9)
        return 0;

    end = cli_memstr(data, bytesleft, "endstream", 9);
    if (!end)
        return 0; /* unterminated stream (was a bare `return;`) */

    *endstream = end - start;
    return 1;
}
154
+
134 155
 static int pdf_findobj(struct pdf_struct *pdf)
135 156
 {
136
-    const char *start, *q, *q2, *eof;
157
+    const char *start, *q, *q2, *q3, *eof;
137 158
     struct pdf_obj *obj;
138 159
     off_t bytesleft;
139 160
     unsigned genid, objid;
... ...
@@ -164,21 +192,17 @@ static int pdf_findobj(struct pdf_struct *pdf)
164 164
     eof = pdf->map + pdf->size;
165 165
     q = pdf->map + obj->start;
166 166
     while (q < eof && bytesleft > 0) {
167
+	off_t p_stream, p_endstream;
167 168
 	q2 = pdf_nextobject(q, bytesleft);
168 169
 	if (!q2)
169 170
 	    return 0;/* no more objs */
170 171
 	bytesleft -= q2 - q;
171
-	if ((q2 = cli_memstr(q-1, q2-q+1, "stream", 6))) {
172
+	if (find_stream_bounds(q-1, q2-q+1, &p_stream, &p_endstream)) {
172 173
 	    obj->flags |= 1 << OBJ_STREAM;
173
-	    q2 += 6;
174
-	    bytesleft -= 6;
175
-	    q2 = cli_memstr(q2, bytesleft, "endstream", 9);
176
-	    if (!q2)
177
-		return 0;/* no more objs */
178
-	    q2 += 6;
179
-	    bytesleft -= q2 - q;
180
-	} else if ((q2 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
181
-	    q2 += 6;
174
+	    q2 = q-1 + p_endstream + 6;
175
+	    bytesleft -= q2 - q + 1;
176
+	} else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
177
+	    q2 = q3 + 6;
182 178
 	    pdf->offset = q2 - pdf->map;
183 179
 	    return 1; /* obj found and offset positioned */
184 180
 	} else {
... ...
@@ -189,6 +213,243 @@ static int pdf_findobj(struct pdf_struct *pdf)
189 189
     return 0;/* no more objs */
190 190
 }
191 191
 
192
+static int filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj,
193
+			 int fout, const unsigned char *buf, off_t len, off_t *sum)
194
+{
195
+    if (cli_checklimits("pdf", pdf->ctx, *sum, 0, 0))
196
+	return len; /* pretend it was a successful write to suppress CL_EWRITE */
197
+    *sum += len;
198
+    return cli_writen(fout, buf, len);
199
+}
200
+
201
+static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj,
202
+			      const unsigned char *buf, off_t len, int fout, off_t *sum)
203
+{
204
+    int zstat, ret;
205
+    z_stream stream;
206
+    off_t nbytes;
207
+    unsigned char output[BUFSIZ];
208
+
209
+    if (len == 0)
210
+	return CL_CLEAN;
211
+    memset(&stream, 0, sizeof(stream));
212
+    stream.next_in = (Bytef *)buf;
213
+    stream.avail_in = len;
214
+    stream.next_out = output;
215
+    stream.avail_out = sizeof(output);
216
+
217
+    zstat = inflateInit(&stream);
218
+    if(zstat != Z_OK) {
219
+	cli_warnmsg("cli_pdf: inflateInit failed\n");
220
+	return CL_EMEM;
221
+    }
222
+
223
+    nbytes = 0;
224
+    while(stream.avail_in) {
225
+	zstat = inflate(&stream, Z_NO_FLUSH);	/* zlib */
226
+	switch(zstat) {
227
+	    case Z_OK:
228
+		if(stream.avail_out == 0) {
229
+		    int written;
230
+		    if ((written=filter_writen(pdf, obj, fout, output, sizeof(output), sum))!=sizeof(output)) {
231
+			cli_errmsg("cli_pdf: failed to write output file\n");
232
+			inflateEnd(&stream);
233
+			return CL_EWRITE;
234
+		    }
235
+		    nbytes += written;
236
+		    stream.next_out = output;
237
+		    stream.avail_out = sizeof(output);
238
+		}
239
+		continue;
240
+	    case Z_STREAM_END:
241
+		break;
242
+	    default:
243
+		if(stream.msg)
244
+		    cli_dbgmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n",
245
+			       (unsigned long)nbytes,
246
+			       stream.msg, obj->id>>8, obj->id&0xff);
247
+		else
248
+		    cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
249
+			       (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff);
250
+		pdf->flags |= 1 << BAD_FLATE;
251
+		inflateEnd(&stream);
252
+		return CL_CLEAN;
253
+	}
254
+	break;
255
+    }
256
+
257
+    if(stream.avail_out != sizeof(output)) {
258
+	if(filter_writen(pdf, obj, fout, output, sizeof(output) - stream.avail_out, sum) < 0) {
259
+	    cli_errmsg("cli_pdf: failed to write output file\n");
260
+	    inflateEnd(&stream);
261
+	    return CL_EWRITE;
262
+	}
263
+    }
264
+
265
+    inflateEnd(&stream);
266
+    return CL_CLEAN;
267
+}
268
+
269
+static struct pdf_obj *find_obj(struct pdf_struct *pdf,
270
+				struct pdf_obj *obj, uint32_t objid)
271
+{
272
+    int j;
273
+    int i = obj - pdf->objs;
274
+    /* search starting at previous obj */
275
+    if (i > 0)
276
+	i--;
277
+    for (j=i;j<pdf->nobjs;j++) {
278
+	obj = &pdf->objs[j];
279
+	if (obj->id == objid)
280
+	    return obj;
281
+    }
282
+    /* restart search from beginning if not found */
283
+    for (j=0;j<i;j++) {
284
+	obj = &pdf->objs[j];
285
+	if (obj->id == objid)
286
+	    return obj;
287
+    }
288
+    return NULL;
289
+}
290
+
291
+static int find_length(struct pdf_struct *pdf,
292
+		       struct pdf_obj *obj,
293
+		       const char *start, off_t len)
294
+{
295
+    int length;
296
+    const char *q;
297
+    q = cli_memstr(start, len, "/Length", 7);
298
+    if (!q)
299
+	return 0;
300
+    q++;
301
+    len -= q - start;
302
+    start = pdf_nextobject(q, len);
303
+    if (!start)
304
+	return 0;
305
+    len -= start - q;
306
+    q = start;
307
+    length = atoi(q);
308
+    while (isdigit(*q)) q++;
309
+    if (*q == ' ') {
310
+	int genid;
311
+	q++;
312
+	genid = atoi(q);
313
+	while(isdigit(*q)) q++;
314
+	if (q[0] == ' ' && q[1] == 'R') {
315
+	    cli_dbgmsg("cli_pdf: length is in indirect object %u %u\n", length, genid);
316
+	    obj = find_obj(pdf, obj, (length << 8) | (genid&0xff));
317
+	    if (!obj) {
318
+		cli_dbgmsg("cli_pdf: indirect object not found\n");
319
+		return 0;
320
+	    }
321
+	    q = pdf_nextobject(pdf->map+obj->start, pdf->size - obj->start);
322
+	    length = atoi(q);
323
+	}
324
+    }
325
+    return length;
326
+}
327
+
328
+static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
329
+{
330
+    int rc = CL_SUCCESS;
331
+    if (obj->flags | (1 << OBJ_STREAM)) {
332
+	const char *start = pdf->map + obj->start;
333
+	off_t p_stream = 0, p_endstream = 0;
334
+	off_t length;
335
+	find_stream_bounds(start, pdf->size - obj->start,
336
+			   &p_stream, &p_endstream);
337
+	if (p_stream && p_endstream) {
338
+	    char fullname[NAME_MAX + 1];
339
+	    int fout;
340
+	    off_t sum = 0;
341
+	    const char *flate_in;
342
+	    char *ascii_decoded = NULL;
343
+	    long ascii_decoded_size = 0;
344
+	    size_t size = p_endstream - p_stream;
345
+
346
+	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) &&
347
+		!(obj->flags & (1 << OBJ_EMBEDDED_FILE)) &&
348
+		!ascii_decoded) {
349
+		/* only dump encoded streams */
350
+		return CL_CLEAN;
351
+	    }
352
+	    length = find_length(pdf, obj, start, p_stream);
353
+	    if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && !length) {
354
+		const char *q = start + p_endstream;
355
+		length = size;
356
+		q--;
357
+		if (*q == '\n') {
358
+		    q--;
359
+		    length--;
360
+		    if (*q == '\r')
361
+			length--;
362
+		} else if (*q == '\r') {
363
+		    length--;
364
+		}
365
+		cli_dbgmsg("cli_pdf: calculated length %d\n", length);
366
+	    }
367
+	    if (!length)
368
+		length = size;
369
+
370
+	    if (obj->flags & (1 << OBJ_FILTER_AH)) {
371
+		ascii_decoded = cli_malloc(size/2 + 1);
372
+		if (!ascii_decoded) {
373
+		    cli_errmsg("Cannot allocate memory for asciidecode\n");
374
+		    return CL_EMEM;
375
+		}
376
+		ascii_decoded_size = asciihexdecode(start + p_stream,
377
+						    length,
378
+						    ascii_decoded);
379
+	    } else if (obj->flags & (1 << OBJ_FILTER_A85)) {
380
+		ascii_decoded = cli_malloc(size*5);
381
+		if (!ascii_decoded) {
382
+		    cli_errmsg("Cannot allocate memory for asciidecode\n");
383
+		    return CL_EMEM;
384
+		}
385
+		ascii_decoded_size = ascii85decode(start+p_stream,
386
+						   length,
387
+						   ascii_decoded);
388
+	    }
389
+	    if (ascii_decoded_size < 0) {
390
+		pdf->flags |= 1 << BAD_ASCIIDECODE;
391
+		cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff);
392
+		free(ascii_decoded);
393
+		return CL_SUCCESS;
394
+	    }
395
+	    /* either direct or ascii-decoded input */
396
+	    if (!ascii_decoded)
397
+		ascii_decoded_size = length;
398
+	    flate_in = ascii_decoded ? ascii_decoded : start+p_stream;
399
+
400
+	    snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
401
+	    fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
402
+	    if (fout < 0) {
403
+		char err[128];
404
+		cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
405
+		free(ascii_decoded);
406
+		return CL_ETMPFILE;
407
+	    }
408
+	    if (obj->flags & (1 << OBJ_FILTER_FLATE)) {
409
+		rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum);
410
+	    } else {
411
+		rc = filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum);
412
+	    }
413
+	    cli_updatelimits(pdf->ctx, sum);
414
+	    /* invoke bytecode on this pdf obj with metainformation associated
415
+	     * */
416
+	    cli_dbgmsg("cli_pdf: extracted %ld bytes %u %u obj to %s\n", sum, obj->id>>8, obj->id&0xff, fullname);
417
+	    lseek(fout, 0, SEEK_SET);
418
+	    rc = cli_magic_scandesc(fout, pdf->ctx);
419
+	    close(fout);
420
+	    free(ascii_decoded);
421
+	    if (!pdf->ctx->engine->keeptmp)
422
+		if (cli_unlink(fullname) && rc != CL_VIRUS)
423
+		    rc = CL_EUNLINK;
424
+	}
425
+    }
426
+    return rc;
427
+}
428
+
192 429
 static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
193 430
 {
194 431
     const char *s;
... ...
@@ -219,6 +480,7 @@ enum objstate {
219 219
     STATE_NONE,
220 220
     STATE_S,
221 221
     STATE_FILTER,
222
+    STATE_JAVASCRIPT,
222 223
     STATE_ANY /* for actions table below */
223 224
 };
224 225
 
... ...
@@ -232,6 +494,7 @@ struct pdfname_action {
232 232
 static struct pdfname_action pdfname_actions[] = {
233 233
     {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
234 234
     {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
235
+    {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE},
235 236
     {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
236 237
     {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
237 238
     {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
... ...
@@ -241,7 +504,7 @@ static struct pdfname_action pdfname_actions[] = {
241 241
     {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
242 242
     {"Crypt",  OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
243 243
     {"Filter", OBJ_DICT, STATE_ANY, STATE_FILTER},
244
-    {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_NONE},
244
+    {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT},
245 245
     {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
246 246
     {"S", OBJ_DICT, STATE_NONE, STATE_S},
247 247
     {"Type", OBJ_DICT, STATE_NONE, STATE_NONE}
... ...
@@ -339,6 +602,7 @@ static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
339 339
 	if (!q2)
340 340
 	    break;
341 341
 	dict_length -= q2 - q;
342
+	q = q2;
342 343
 	// normalize PDF names
343 344
 	for (i = 0;dict_length && (i < sizeof(pdfname)-1); i++) {
344 345
 	    q++;
... ...
@@ -371,9 +635,12 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
371 371
     long xref;
372 372
     const char *pdfver, *start, *eofmap, *q, *eof;
373 373
     int rc;
374
+    unsigned i;
374 375
 
375 376
     cli_dbgmsg("in cli_pdf(%s)\n", dir);
376 377
     memset(&pdf, 0, sizeof(pdf));
378
+    pdf.ctx = ctx;
379
+    pdf.dir = dir;
377 380
 
378 381
     pdfver = start = fmap_need_off_once(map, offset, versize);
379 382
 
... ...
@@ -448,6 +715,7 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
448 448
 	cli_errmsg("cli_pdf: mmap() failed\n");
449 449
 	return CL_EMAP;
450 450
     }
451
+    // parse PDF and find obj offsets
451 452
     while ((rc = pdf_findobj(&pdf)) > 0) {
452 453
 	struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
453 454
 	cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset);
... ...
@@ -456,9 +724,19 @@ int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
456 456
     if (rc == -1)
457 457
 	pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
458 458
 
459
+    // extract PDF objs
460
+    for (i=0;i<pdf.nobjs;i++) {
461
+	struct pdf_obj *obj = &pdf.objs[i];
462
+	rc = pdf_extract_obj(&pdf, obj);
463
+	if (rc != CL_SUCCESS)
464
+	    break;
465
+    }
466
+
459 467
     if (pdf.flags)
460 468
 	cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
461
-    return CL_SUCCESS;
469
+    cli_dbgmsg("cli_pdf: returning %d\n", rc);
470
+    free(pdf.objs);
471
+    return rc;
462 472
 }
463 473
 
464 474
 int
... ...
@@ -1052,6 +1330,18 @@ flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx)
1052 1052
 	return CL_CLEAN;
1053 1053
 }
1054 1054
 
1055
/*
 * ASCIIHexDecode: turn pairs of hexadecimal digits into bytes.
 *
 * buf, len - encoded input; need not be NUL-terminated
 * output   - receives the decoded bytes; must have room for len/2 + 1 bytes
 *
 * White-space between digits is ignored, '>' marks the end of the data, and
 * a trailing unpaired digit is treated as if it were followed by '0'.
 *
 * Returns the number of bytes stored in output, or -1 when a character that
 * is neither a hex digit, white-space nor '>' is encountered.
 */
static int asciihexdecode(const char *buf, off_t len, unsigned char *output)
{
    off_t i;
    int j = 0;
    int high = -1; /* pending high nibble, -1 when none */

    for (i = 0; i < len; i++) {
        const int c = (unsigned char)buf[i];
        int nibble;

        if (c == '>')
            break;
        if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' || c == '\0')
            continue;

        if (c >= '0' && c <= '9')
            nibble = c - '0';
        else if (c >= 'a' && c <= 'f')
            nibble = c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            nibble = c - 'A' + 10;
        else
            return -1;

        if (high < 0) {
            high = nibble;
        } else {
            /* one output byte per TWO input digits */
            output[j++] = (unsigned char)((high << 4) | nibble);
            high = -1;
        }
    }

    /* odd number of digits: the missing low nibble counts as zero */
    if (high >= 0)
        output[j++] = (unsigned char)(high << 4);

    return j;
}
1055 1067
 /*
1056 1068
  * ascii85 inflation, returns number of bytes in output, -1 for error
1057 1069
  *