c3d221dc |
/*
* Copyright (C) 2005 Nigel Horne <njh@bandsman.co.uk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software |
19dc1a8d |
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
c3d221dc |
*/ |
5e45f438 |
static char const rcsid[] = "$Id: pdf.c,v 1.58 2007/01/25 13:59:56 njh Exp $"; |
c3d221dc |
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
|
28dcdb28 |
#include "clamav.h" |
46af1ed0 |
#include "others.h" |
28dcdb28 |
|
c64f16b7 |
#if HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
|
46af1ed0 |
#if HAVE_MMAP |
c64f16b7 |
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
#include <limits.h> |
3fd29c0a |
#include <errno.h> |
c64f16b7 |
#ifdef HAVE_ZLIB_H
#include <zlib.h>
#endif
|
e7fab79f |
#ifdef C_WINDOWS
#include <io.h>
#endif
|
c64f16b7 |
#include "mbox.h" |
7bd32c11 |
#include "pdf.h" |
c64f16b7 |
|
e81f478a |
static int flatedecode(const unsigned char *buf, size_t len, int fout, const cli_ctx *ctx); |
4427302c |
static int ascii85decode(const char *buf, size_t len, unsigned char *output); |
45f6e75a |
static const char *pdf_nextlinestart(const char *ptr, size_t len); |
8f5c46fa |
static const char *pdf_nextobject(const char *ptr, size_t len); |
725bd592 |
static const char *cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns); |
4427302c |
|
5c208f55 |
/*
* TODO: handle embedded URLs if (options&CL_SCAN_MAILURL)
*/ |
c3d221dc |
int |
f3e0aea0 |
cli_pdf(const char *dir, int desc, const cli_ctx *ctx) |
c3d221dc |
{ |
c64f16b7 |
struct stat statb; |
f223b55a |
off_t size; /* total number of bytes in the file */ |
45f6e75a |
long bytesleft, trailerlength; |
f223b55a |
char *buf; /* start of memory mapped area */ |
45f6e75a |
const char *p, *q, *trailerstart; |
f223b55a |
const char *xrefstart; /* cross reference table */ |
1c886862 |
/*size_t xreflength;*/ |
fd905821 |
int rc = CL_CLEAN; |
57a293d5 |
struct table *md5table; |
c64f16b7 |
|
e7fab79f |
cli_dbgmsg("in cli_pdf(%s)\n", dir); |
327e0c3c |
|
c64f16b7 |
if(fstat(desc, &statb) < 0)
return CL_EOPEN;
size = (size_t)statb.st_size;
if(size == 0)
return CL_CLEAN;
|
7609e4ac |
if(size <= 7) /* doesn't even include the file header */
return CL_EFORMAT;
|
fb7d9937 |
p = buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, desc, 0); |
c64f16b7 |
if(buf == MAP_FAILED)
return CL_EMEM;
|
f223b55a |
cli_dbgmsg("cli_pdf: scanning %lu bytes\n", size); |
7b999f91 |
|
7609e4ac |
/* Lines are terminated by \r, \n or both */
/* File Header */
if(memcmp(p, "%PDF-1.", 7) != 0) {
munmap(buf, size);
return CL_EFORMAT;
}
|
8f5c46fa |
#if 0
q = pdf_nextlinestart(&p[6], size - 6); |
45f6e75a |
if(q == NULL) {
munmap(buf, size);
return CL_EFORMAT; |
7609e4ac |
} |
f223b55a |
bytesleft = size - (long)(q - p); |
45f6e75a |
p = q; |
8f5c46fa |
#else
p = &p[6];
bytesleft = size - 6;
#endif |
7609e4ac |
/* Find the file trailer */ |
f223b55a |
for(q = &p[bytesleft - 6]; q > p; --q) |
7609e4ac |
if(memcmp(q, "%%EOF", 5) == 0)
break;
if(q == p) {
munmap(buf, size);
return CL_EFORMAT;
}
|
5fef5ee8 |
for(trailerstart = &q[-7]; trailerstart > p; --trailerstart) |
45f6e75a |
if(memcmp(trailerstart, "trailer", 7) == 0) |
7609e4ac |
break;
/* |
45f6e75a |
* q points to the end of the trailer section |
7609e4ac |
*/ |
45f6e75a |
trailerlength = (long)(q - trailerstart);
if(cli_pmemstr(trailerstart, trailerlength, "Encrypt", 7)) { |
61f404d2 |
/*
* This tends to mean that the file is, in effect, read-only
*/
munmap(buf, size); |
f223b55a |
cli_warnmsg("Encrypted PDF files not yet supported\n"); |
61f404d2 |
return CL_EFORMAT;
}
|
8f5c46fa |
/*
* not true, since edits may put data after the trailer |
45f6e75a |
bytesleft -= trailerlength; |
8f5c46fa |
*/ |
45f6e75a |
|
96000796 |
/*
* FIXME: Handle more than one xref section in the xref table
*/ |
f223b55a |
for(xrefstart = trailerstart; xrefstart > p; --xrefstart)
if(memcmp(xrefstart, "xref", 4) == 0) |
96000796 |
/*
* Make sure it's the start of the line, not a startxref
* token
*/
if((xrefstart[-1] == '\n') || (xrefstart[-1] == '\r'))
break; |
f223b55a |
if(xrefstart == p) {
munmap(buf, size);
return CL_EFORMAT;
}
|
57a293d5 |
md5table = tableCreate(); |
8f5c46fa |
/*
* not true, since edits may put data after the trailer |
1c886862 |
xreflength = (size_t)(trailerstart - xrefstart); |
f223b55a |
bytesleft -= xreflength; |
8f5c46fa |
*/ |
f223b55a |
/* |
8f5c46fa |
* The body section consists of a sequence of indirect objects |
f223b55a |
*/ |
d1c8821e |
while((p < xrefstart) &&
((q = pdf_nextobject(p, bytesleft)) != NULL) &&
(rc == CL_CLEAN)) { |
45f6e75a |
int is_ascii85decode, is_flatedecode, fout, len; |
249fef37 |
/*int object_number, generation_number;*/ |
f223b55a |
const char *objstart, *objend, *streamstart, *streamend; |
57a293d5 |
char *md5digest; |
45f6e75a |
size_t length, objlen, streamlen; |
c64f16b7 |
char fullname[NAME_MAX + 1]; |
6d53e27f |
|
8f5c46fa |
if(q == xrefstart)
break;
if(memcmp(q, "xref", 4) == 0)
break; |
209a0b31 |
/*object_number = atoi(q);*/
bytesleft -= (q - p);
p = q;
if(memcmp(q, "endobj", 6) == 0)
continue; |
8f5c46fa |
if(!isdigit(*q)) { |
cae20c9b |
cli_warnmsg("cli_pdf: Object number missing\n"); |
8f5c46fa |
rc = CL_EFORMAT;
break;
}
q = pdf_nextobject(p, bytesleft);
if((q == NULL) || !isdigit(*q)) { |
cae20c9b |
cli_warnmsg("cli_pdf: Generation number missing\n"); |
8f5c46fa |
rc = CL_EFORMAT;
break;
} |
12c43df1 |
/*generation_number = atoi(q);*/ |
8f5c46fa |
bytesleft -= (q - p);
p = q;
q = pdf_nextobject(p, bytesleft);
if((q == NULL) || (memcmp(q, "obj", 3) != 0)) {
cli_warnmsg("Indirect object missing \"obj\"\n");
rc = CL_EFORMAT;
break;
}
bytesleft -= (q - p) + 3;
objstart = p = &q[3]; |
f223b55a |
objend = cli_pmemstr(p, bytesleft, "endobj", 6);
if(objend == NULL) { |
8f5c46fa |
cli_dbgmsg("No matching endobj\n"); |
c64f16b7 |
break;
} |
f223b55a |
bytesleft -= (objend - p) + 6;
p = &objend[6];
objlen = (size_t)(objend - objstart); |
c64f16b7 |
|
f223b55a |
/* Is this object a stream? */ |
45f6e75a |
streamstart = cli_pmemstr(objstart, objlen, "stream", 6);
if(streamstart == NULL)
continue; |
c64f16b7 |
|
7609e4ac |
length = is_ascii85decode = is_flatedecode = 0; |
45f6e75a |
/*
* TODO: handle F and FFilter?
*/ |
c8ac5876 |
q = objstart;
while(q < streamstart) { |
8f5c46fa |
if(*q == '/') { /* name object */ |
6d53e27f |
/*cli_dbgmsg("Name object %8.8s\n", q+1, q+1);*/ |
f223b55a |
if(strncmp(++q, "Length ", 7) == 0) {
q += 7;
length = atoi(q);
while(isdigit(*q))
q++;
q--;
} else if(strncmp(q, "FlateDecode", 11) == 0) { |
4427302c |
is_flatedecode = 1; |
6d53e27f |
q += 11;
} else if(strncmp(q, "ASCII85Decode", 13) == 0) { |
4427302c |
is_ascii85decode = 1; |
f223b55a |
q += 13; |
c64f16b7 |
}
} |
8f5c46fa |
q = pdf_nextobject(q, (size_t)(streamstart - q)); |
c8ac5876 |
if(q == NULL)
break;
} |
95c913cd |
|
f223b55a |
/* objend points to the end of the object (start of "endobj") */
streamstart += 6; /* go past the word "stream" */
len = (int)(objend - streamstart);
q = pdf_nextlinestart(streamstart, len);
if(q == NULL) |
45f6e75a |
break; |
f223b55a |
len -= (int)(q - streamstart);
streamstart = q;
streamend = cli_pmemstr(streamstart, len, "endstream\n", 10);
if(streamend == NULL) {
streamend = cli_pmemstr(streamstart, len, "endstream\r", 10);
if(streamend == NULL) { |
56896211 |
cli_dbgmsg("No endstream\n"); |
7b999f91 |
break;
} |
c64f16b7 |
} |
19dc1a8d |
/*while(strchr("\r\n", *--streamend))
;*/
|
9a1d5d83 |
streamlen = (int)(streamend - streamstart) + 1;
if(streamlen == 0) { |
56896211 |
cli_dbgmsg("Empty stream\n"); |
9a1d5d83 |
continue;
}
|
c64f16b7 |
snprintf(fullname, sizeof(fullname), "%s/pdfXXXXXX", dir);
#if defined(C_LINUX) || defined(C_BSD) || defined(HAVE_MKSTEMP) || defined(C_SOLARIS) || defined(C_CYGWIN)
fout = mkstemp(fullname); |
e7fab79f |
#elif defined(C_WINDOWS)
if(_mktemp(fullname) == NULL) {
/* mktemp only allows 26 files */
char *name = cli_gentemp(dir);
if(name == NULL)
fout = -1;
else {
strcpy(fullname, name);
free(name);
fout = open(fullname,
O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
}
} else
fout = open(fullname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); |
6d53e27f |
#else |
e7fab79f |
mktemp(fullname); |
c64f16b7 |
fout = open(fullname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
#endif
if(fout < 0) {
cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, strerror(errno)); |
ca8921bc |
rc = CL_ETMPFILE;
break; |
c64f16b7 |
}
|
6d53e27f |
cli_dbgmsg("length %d, streamlen %d isFlate %d isASCII85 %d\n",
length, streamlen, is_flatedecode, is_ascii85decode); |
45f6e75a |
#if 0
/* FIXME: this isn't right... */
if(length)
/*streamlen = (is_flatedecode) ? length : MIN(length, streamlen);*/
streamlen = MIN(length, streamlen);
#endif
|
4427302c |
if(is_ascii85decode) { |
ee92d4a3 |
unsigned char *tmpbuf = cli_malloc(streamlen * 5); |
45f6e75a |
int ret; |
fd905821 |
|
218e036b |
if(tmpbuf == NULL) { |
62a89f51 |
close(fout); |
f8cff436 |
unlink(fullname);
rc = CL_EMEM; |
218e036b |
continue;
}
|
45f6e75a |
ret = ascii85decode(streamstart, streamlen, tmpbuf); |
ca8921bc |
|
45f6e75a |
if(ret == -1) { |
4427302c |
free(tmpbuf); |
62a89f51 |
close(fout); |
f8cff436 |
unlink(fullname);
rc = CL_EFORMAT; |
c64f16b7 |
continue;
} |
9a1d5d83 |
if(ret) {
streamlen = (size_t)ret;
/* free unused trailing bytes */
tmpbuf = cli_realloc(tmpbuf, streamlen);
/*
* Note that it will probably be both
* ascii85encoded and flateencoded
*/
if(is_flatedecode) { |
e81f478a |
const int zstat = flatedecode((unsigned char *)tmpbuf, streamlen, fout, ctx); |
9a1d5d83 |
if(zstat != Z_OK)
rc = CL_EZIP; |
8f5c46fa |
} else
cli_writen(fout, (char *)streamstart, streamlen); |
fd905821 |
} |
4427302c |
free(tmpbuf);
} else if(is_flatedecode) { |
e81f478a |
const int zstat = flatedecode((unsigned char *)streamstart, streamlen, fout, ctx); |
c64f16b7 |
|
4427302c |
if(zstat != Z_OK) |
218e036b |
rc = CL_EZIP; |
d1c8821e |
} else {
cli_dbgmsg("cli_pdf: writing %u bytes from the stream\n",
streamlen); |
14038c6d |
cli_writen(fout, (char *)streamstart, streamlen); |
d1c8821e |
} |
c64f16b7 |
close(fout); |
57a293d5 |
md5digest = cli_md5file(fullname);
if(tableFind(md5table, md5digest) >= 0) {
cli_dbgmsg("cli_pdf: not scanning duplicate embedded file '%s'\n", fullname);
unlink(fullname);
} else
tableInsert(md5table, md5digest, 1);
free(md5digest); |
c64f16b7 |
cli_dbgmsg("cli_pdf: extracted to %s\n", fullname);
}
munmap(buf, size); |
7b999f91 |
|
57a293d5 |
tableDestroy(md5table);
|
ca8921bc |
cli_dbgmsg("cli_pdf: returning %d\n", rc); |
fd905821 |
return rc; |
c3d221dc |
} |
4427302c |
/* flate inflation - returns zlib status, e.g. Z_OK */
static int |
e81f478a |
flatedecode(const unsigned char *buf, size_t len, int fout, const cli_ctx *ctx) |
4427302c |
{
int zstat; |
e81f478a |
off_t nbytes; |
4427302c |
z_stream stream;
unsigned char output[BUFSIZ];
cli_dbgmsg("cli_pdf: flatedecode %lu bytes\n", len);
stream.zalloc = (alloc_func)Z_NULL;
stream.zfree = (free_func)Z_NULL;
stream.opaque = (void *)NULL;
stream.next_in = (unsigned char *)buf;
stream.avail_in = len; |
61f404d2 |
stream.next_out = output;
stream.avail_out = sizeof(output); |
4427302c |
zstat = inflateInit(&stream);
if(zstat != Z_OK) {
cli_warnmsg("cli_pdf: inflateInit failed");
return zstat;
} |
c8ea3fca |
|
e81f478a |
nbytes = 0; |
c8ea3fca |
|
4427302c |
for(;;) { |
5e45f438 |
zstat = inflate(&stream, Z_NO_FLUSH); /* zlib */ |
4427302c |
switch(zstat) {
case Z_OK: |
218e036b |
if(stream.avail_out == 0) { |
c8ea3fca |
|
e81f478a |
nbytes += cli_writen(fout, output, sizeof(output)); |
c8ea3fca |
|
e81f478a |
if(ctx->limits &&
ctx->limits->maxfilesize &&
(nbytes > (off_t) ctx->limits->maxfilesize)) {
cli_dbgmsg("cli_pdf: flatedecode size exceeded (%lu)\n", nbytes);
inflateEnd(&stream);
*ctx->virname = "PDF.ExceededFileSize";
return Z_DATA_ERROR;
} |
218e036b |
stream.next_out = output; |
61f404d2 |
stream.avail_out = sizeof(output); |
218e036b |
} |
4427302c |
continue;
case Z_STREAM_END:
break;
default: |
00ccd898 |
if(stream.msg) |
5e45f438 |
cli_warnmsg("pdf: after writing %u bytes, got error \"%s\" inflating PDF attachment\n", |
d1c8821e |
nbytes, stream.msg); |
00ccd898 |
else |
5e45f438 |
cli_warnmsg("pdf: after writing %u bytes, got error %d inflating PDF attachment\n", |
d1c8821e |
nbytes, zstat); |
4427302c |
inflateEnd(&stream);
return zstat;
}
break;
}
|
c8ea3fca |
if(stream.avail_out != sizeof(output))
(void)cli_writen(fout, output, sizeof(output) - stream.avail_out);
|
e81f478a |
cli_dbgmsg("cli_pdf: flatedecode in=%lu out=%lu ratio %ld (max %d)\n",
stream.total_in, stream.total_out,
stream.total_out / stream.total_in,
ctx->limits ? ctx->limits->maxratio : 0);
if(ctx->limits &&
ctx->limits->maxratio && |
5e45f438 |
BLOCKMAX && |
e81f478a |
((stream.total_out / stream.total_in) > ctx->limits->maxratio)) {
cli_dbgmsg("cli_pdf: flatedecode Max ratio reached\n");
inflateEnd(&stream);
*ctx->virname = "Oversized.PDF";
return Z_DATA_ERROR;
} |
f3e0aea0 |
|
4427302c |
return inflateEnd(&stream);
}
|
19dc1a8d |
/*
 * ascii85 inflation, returns number of bytes in output, -1 for error
 *
 * buf/len	the encoded data; decoding stops at the "~>" marker or
 *		after len bytes, whichever comes first
 * output	receives the decoded bytes. Every 'z' produces four bytes
 *		and every full group of five characters produces four, so
 *		4 * len bytes is always enough room.
 *
 * White space in the input is skipped; any other character outside the
 * alphabet is an error.
 *
 * See http://www.piclist.com/techref/method/encode.htm (look for base85)
 */
static int
ascii85decode(const char *buf, size_t len, unsigned char *output)
{
	const char *ptr;
	uint32_t sum = 0;
	int quintet = 0;
	int ret = 0;

	if(cli_pmemstr(buf, len, "~>", 2) == NULL)
		cli_warnmsg("ascii85decode: no EOF marker found\n");

	ptr = buf;

	cli_dbgmsg("cli_pdf: ascii85decode %lu bytes\n", (unsigned long)len);

	while(len > 0) {
		/*
		 * Go through unsigned char so that a 0xFF byte can't be
		 * mistaken for EOF and isspace() never sees a negative value
		 */
		int byte = (unsigned char)*ptr++;

		len--;

		/* only look at the next byte if there is one */
		if((byte == '~') && (len > 0) && (*ptr == '>'))
			byte = EOF;

		if(byte >= '!' && byte <= 'u') {
			sum = (sum * 85) + ((uint32_t)byte - '!');
			if(++quintet == 5) {
				*output++ = (unsigned char)(sum >> 24);
				*output++ = (unsigned char)((sum >> 16) & 0xFF);
				*output++ = (unsigned char)((sum >> 8) & 0xFF);
				*output++ = (unsigned char)(sum & 0xFF);
				ret += 4;
				quintet = 0;
				sum = 0;
			}
		} else if(byte == 'z') {
			/* 'z' is shorthand for four zero bytes */
			if(quintet) {
				cli_warnmsg("ascii85decode: unexpected 'z'\n");
				return -1;
			}
			*output++ = '\0';
			*output++ = '\0';
			*output++ = '\0';
			*output++ = '\0';
			ret += 4;
		} else if(byte == EOF) {
			cli_dbgmsg("ascii85decode: quintet %d\n", quintet);
			if(quintet) {
				int i;

				if(quintet == 1) {
					cli_warnmsg("ascii85Decode: only 1 byte in last quintet\n");
					return -1;
				}
				/* pad the partial group out to five digits */
				for(i = quintet; i < 5; i++)
					sum *= 85;

				if(quintet > 1)
					sum += (0xFFFFFF >> ((quintet - 2) * 8));
				/* a group of n characters holds n - 1 bytes */
				ret += quintet - 1;
				for(i = 0; i < quintet - 1; i++)
					*output++ = (unsigned char)((sum >> (24 - 8 * i)) & 0xFF);
				quintet = 0;
			}
			len = 0;
			break;
		} else if(!isspace(byte)) {
			cli_warnmsg("ascii85Decode: invalid character 0x%x, len %lu\n", byte & 0xFF, (unsigned long)len);
			return -1;
		}
	}
	return ret;
}
45f6e75a |
/*
 * Find the start of the next line
 *
 * ptr/len	the bytes to search; nothing outside ptr[0..len-1] is read
 *
 * Skips the rest of the current line and then the run of '\r' and '\n'
 * characters that follows it. Returns a pointer to the first byte after
 * that run, or NULL if len is 0 or the range ends before such a byte is
 * reached.
 *
 * Only '\r' and '\n' end a line; a NUL byte is ordinary data.
 */
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
	/* an empty range has no next line, and must not be read */
	if(len == 0)
		return NULL;

	/* the remainder of the current line */
	while((*ptr != '\r') && (*ptr != '\n')) {
		if(--len == 0)
			return NULL;
		ptr++;
	}
	/* the line terminator(s) */
	while((*ptr == '\r') || (*ptr == '\n')) {
		if(--len == 0)
			return NULL;
		ptr++;
	}
	return ptr;
}
c8ac5876 |
|
8f5c46fa |
/*
 * Return the start of the next PDF object.
 * This assumes that we're not in a stream.
 *
 * The token that ptr starts in is skipped, then any separators (blanks,
 * tabs, vertical tabs, form feeds, '[', line ends and '%' comments).
 * The first byte after a separator that is not itself a separator is
 * returned; NULL means that len bytes were used up first.
 */
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
	int skipping_token = 1;	/* still inside the token we started in */
	size_t remaining = len;

	while(remaining != 0) {
		const char c = *ptr;

		if((c == '\n') || (c == '\r') || (c == '%' /* comment */)) {
			/* jump to the beginning of the following line */
			const char *next = pdf_nextlinestart(ptr, remaining);

			if(next == NULL)
				return NULL;
			remaining -= (size_t)(next - ptr);
			ptr = next;
			skipping_token = 0;
			continue;
		}

		if((c == ' ') || (c == '\t') || (c == '\v') || (c == '\f') ||
		   (c == '[' /* Start of an array object */)) {
			skipping_token = 0;
		} else if(!skipping_token) {
			/* TODO: parse and return object type */
			return ptr;
		}

		ptr++;
		remaining--;
	}
	return NULL;
}
725bd592 |
/*
 * like cli_memstr - but returns the location of the match
 *
 * haystack/hs	the bytes to search and how many of them there are
 * needle/ns	the bytes to look for and how many of them there are
 *
 * Returns a pointer to the first occurrence of needle in haystack, or
 * NULL if there is none. Neither buffer needs to be NUL terminated and
 * both may contain NUL bytes.
 *
 * FIXME: need a case insensitive version
 */
static const char *
cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns)
{
	const char *cursor, *hit;
	size_t remaining;

	if(haystack == needle)
		return haystack;
	if(hs < ns)
		return NULL;
	if(ns == 0)
		return haystack;	/* an empty needle matches at once */

	cursor = haystack;
	remaining = hs;

	/*
	 * Only look for the first byte where the whole needle still fits.
	 * Distances are taken as pointer differences; squeezing a pointer
	 * into an int loses bits where pointers are wider than int.
	 */
	while((remaining >= ns) &&
	      ((hit = memchr(cursor, needle[0], remaining - ns + 1)) != NULL)) {
		if(memcmp(hit, needle, ns) == 0)
			return hit;
		/* no match here: carry on one byte past this candidate */
		remaining -= (size_t)(hit - cursor) + 1;
		cursor = hit + 1;
	}
	return NULL;
}
46af1ed0 |
#else /*!HAVE_MMAP*/
int |
6516f163 |
cli_pdf(const char *dir, int desc, const cli_ctx *ctx) |
46af1ed0 |
{
cli_warnmsg("File not decoded - PDF decoding needs mmap() (for now)\n");
return CL_CLEAN;
}
#endif |