/* * Copyright (C) 2005-2007 Nigel Horne * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * TODO: Embedded fonts * TODO: Predictor image handling */ static char const rcsid[] = "$Id: pdf.c,v 1.61 2007/02/12 20:46:09 njh Exp $"; #if HAVE_CONFIG_H #include "clamav-config.h" #endif #if HAVE_MMAP #include #include #include #include #include #include #include #include #ifdef HAVE_LIMITS_H #include #endif #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_SYS_MMAN_H #include #endif #ifdef HAVE_ZLIB_H #include #endif #ifdef C_WINDOWS #include #endif #include "clamav.h" #include "others.h" #include "mbox.h" #include "pdf.h" #ifdef CL_DEBUG /*#define SAVE_TMP /* Save the file being worked on in tmp */ #endif static int try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, const cli_ctx *ctx); static int flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx); static int ascii85decode(const char *buf, off_t len, unsigned char *output); static const char *pdf_nextlinestart(const char *ptr, size_t len); static const char *pdf_nextobject(const char *ptr, size_t len); static const char *cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns); /* * TODO: handle embedded URLs if (options&CL_SCAN_MAILURL) */ int cli_pdf(const char *dir, int desc, const cli_ctx *ctx) { off_t size; /* total number of bytes in the file */ long bytesleft, trailerlength; char *buf, *alloced; /* start of memory mapped area */ const char *p, *q, *trailerstart; const char *xrefstart; /* cross reference table */ /*size_t xreflength;*/ int rc = CL_CLEAN; table_t *md5table; int printed_predictor_message; int printed_embedded_font_message; struct stat statb; cli_dbgmsg("in cli_pdf(%s)\n", dir); if(fstat(desc, &statb) < 0) return CL_EOPEN; size = statb.st_size; if(size == 0) return CL_CLEAN; if(size <= 7) /* doesn't even include the file header */ return CL_EFORMAT; p = buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, desc, 0); if(buf == MAP_FAILED) return CL_EMEM; alloced = cli_malloc(size); if(alloced) { /* * FIXME: now I have this, there's no need for the lack of * support on systems without mmap, e.g. cygwin */ memcpy(alloced, buf, size); munmap(buf, size); p = alloced; } cli_dbgmsg("cli_pdf: scanning %lu bytes\n", (unsigned long)size); /* Lines are terminated by \r, \n or both */ /* File Header */ if(memcmp(p, "%PDF-1.", 7) != 0) { if(alloced) free(alloced); else munmap(buf, size); return CL_EFORMAT; } #if 0 q = pdf_nextlinestart(&p[6], size - 6); if(q == NULL) { if(alloced) free(alloced); else munmap(buf, size); return CL_EFORMAT; } bytesleft = size - (long)(q - p); p = q; #else p = &p[6]; bytesleft = size - 6; #endif /* Find the file trailer */ for(q = &p[bytesleft - 6]; q > p; --q) if(memcmp(q, "%%EOF", 5) == 0) break; if(q <= p) { if(alloced) free(alloced); else munmap(buf, size); return CL_EFORMAT; } for(trailerstart = &q[-7]; trailerstart > p; --trailerstart) if(memcmp(trailerstart, "trailer", 7) == 0) break; /* * q points to the end of the trailer section */ trailerlength = (long)(q - trailerstart); if(cli_pmemstr(trailerstart, trailerlength, "Encrypt", 7)) { /* * This tends to mean that the file is, in effect, read-only */ if(alloced) free(alloced); else munmap(buf, size); cli_warnmsg("Encrypted PDF files not yet supported\n"); return CL_EFORMAT; } /* * not true, since edits may put data after the trailer bytesleft -= trailerlength; */ /* * FIXME: Handle more than one xref section in the xref table */ for(xrefstart = trailerstart; xrefstart > p; --xrefstart) if(memcmp(xrefstart, "xref", 4) == 0) /* * Make sure it's the start of the line, not a startxref * token */ if((xrefstart[-1] == '\n') || (xrefstart[-1] == '\r')) break; if(xrefstart == p) { if(alloced) free(alloced); else munmap(buf, size); return CL_EFORMAT; } printed_predictor_message = printed_embedded_font_message = 0; md5table = tableCreate(); /* * not true, since edits may put data after the trailer xreflength = (size_t)(trailerstart - xrefstart); bytesleft -= xreflength; */ /* * The body section consists of a sequence of indirect objects */ while((p < xrefstart) && ((q = pdf_nextobject(p, bytesleft)) != NULL)) { int is_ascii85decode, is_flatedecode, fout, len, has_cr; /*int object_number, generation_number;*/ const char *objstart, *objend, *streamstart, *streamend; char *md5digest; unsigned long length, objlen, real_streamlen, calculated_streamlen; int is_embedded_font, predictor; char fullname[NAME_MAX + 1]; if(q == xrefstart) break; if(memcmp(q, "xref", 4) == 0) break; /*object_number = atoi(q);*/ bytesleft -= (q - p); p = q; if(memcmp(q, "endobj", 6) == 0) continue; if(!isdigit(*q)) { cli_warnmsg("cli_pdf: Object number missing\n"); rc = CL_EFORMAT; break; } q = pdf_nextobject(p, bytesleft); if((q == NULL) || !isdigit(*q)) { cli_warnmsg("cli_pdf: Generation number missing\n"); rc = CL_EFORMAT; break; } /*generation_number = atoi(q);*/ bytesleft -= (q - p); p = q; q = pdf_nextobject(p, bytesleft); if((q == NULL) || (memcmp(q, "obj", 3) != 0)) { cli_warnmsg("Indirect object missing \"obj\"\n"); rc = CL_EFORMAT; break; } bytesleft -= (q - p) + 3; objstart = p = &q[3]; objend = cli_pmemstr(p, bytesleft, "endobj", 6); if(objend == NULL) { cli_dbgmsg("No matching endobj\n"); break; } bytesleft -= (objend - p) + 6; p = &objend[6]; objlen = (unsigned long)(objend - objstart); /* Is this object a stream? */ streamstart = cli_pmemstr(objstart, objlen, "stream", 6); if(streamstart == NULL) continue; is_embedded_font = length = is_ascii85decode = is_flatedecode = 0; predictor = 1; /* * TODO: handle F and FFilter? */ q = objstart; while(q < streamstart) { if(*q == '/') { /* name object */ /*cli_dbgmsg("Name object %8.8s\n", q+1, q+1);*/ if(strncmp(++q, "Length ", 7) == 0) { q += 7; length = atoi(q); while(isdigit(*q)) q++; /* * Note: incremental updates are not * supported */ if((bytesleft > 11) && strncmp(q, " 0 R", 4) == 0) { const char *r; char b[14]; q += 4; cli_dbgmsg("Length is in indirect obj %ld\n", length); snprintf(b, sizeof(b), "\n%ld 0 obj", length); length = (unsigned long)strlen(b); r = cli_pmemstr(alloced ? alloced : buf, size, b, length); if(r == NULL) { b[0] = '\r'; r = cli_pmemstr(alloced ? alloced : buf, size, b, length); } if(r) { r += length - 1; r = pdf_nextobject(r, bytesleft - (r - q)); if(r) { length = atoi(r); while(isdigit(*r)) r++; cli_dbgmsg("length in '%s' %ld\n", &b[1], length); } } else cli_warnmsg("Couldn't find '%s'\n", &b[1]); } q--; } else if(strncmp(q, "Length2 ", 8) == 0) is_embedded_font = 1; else if(strncmp(q, "Predictor ", 10) == 0) { q += 10; predictor = atoi(q); while(isdigit(*q)) q++; q--; } else if(strncmp(q, "FlateDecode", 11) == 0) { is_flatedecode = 1; q += 11; } else if(strncmp(q, "ASCII85Decode", 13) == 0) { is_ascii85decode = 1; q += 13; } } q = pdf_nextobject(q, (size_t)(streamstart - q)); if(q == NULL) break; } if(is_embedded_font) { /* * Need some documentation, the only I can find a * reference to is not free, if some kind soul wishes * to donate a copy, please contact me! * (http://safari.adobepress.com/0321304748) */ if(!printed_embedded_font_message) { cli_dbgmsg("Embedded fonts not yet supported\n"); printed_embedded_font_message = 1; } continue; } if(predictor > 1) { /* * Needs some thought */ if(!printed_predictor_message) { cli_dbgmsg("Predictor %d not honoured for embedded image\n", predictor); printed_predictor_message = 1; } continue; } /* objend points to the end of the object (start of "endobj") */ streamstart += 6; /* go past the word "stream" */ len = (int)(objend - streamstart); q = pdf_nextlinestart(streamstart, len); if(q == NULL) break; len -= (int)(q - streamstart); streamstart = q; streamend = cli_pmemstr(streamstart, len, "endstream\n", 10); if(streamend == NULL) { streamend = cli_pmemstr(streamstart, len, "endstream\r", 10); if(streamend == NULL) { cli_dbgmsg("No endstream\n"); break; } has_cr = 1; } else has_cr = 0; snprintf(fullname, sizeof(fullname), "%s/pdfXXXXXX", dir); #if defined(C_LINUX) || defined(C_BSD) || defined(HAVE_MKSTEMP) || defined(C_SOLARIS) || defined(C_CYGWIN) fout = mkstemp(fullname); #elif defined(C_WINDOWS) if(_mktemp(fullname) == NULL) { /* mktemp only allows 26 files */ char *name = cli_gentemp(dir); if(name == NULL) fout = -1; else { strcpy(fullname, name); free(name); fout = open(fullname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); } } else fout = open(fullname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); #else mktemp(fullname); fout = open(fullname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); #endif if(fout < 0) { cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, strerror(errno)); rc = CL_ETMPFILE; break; } /* * Calculate the length ourself, the Length parameter is often * wrong */ if(*--streamend != '\n') streamend++; else if(has_cr && (*--streamend != '\r')) streamend++; if(streamend <= streamstart) { close(fout); cli_dbgmsg("Empty stream\n"); continue; } calculated_streamlen = (int)(streamend - streamstart); real_streamlen = length; if(calculated_streamlen != real_streamlen) cli_dbgmsg("cli_pdf: Incorrect Length field in file attempting to recover\n"); cli_dbgmsg("length %ld, calculated_streamlen %ld isFlate %d isASCII85 %d\n", length, calculated_streamlen, is_flatedecode, is_ascii85decode); #if 0 /* FIXME: this isn't right... */ if(length) /*streamlen = (is_flatedecode) ? length : MIN(length, streamlen);*/ streamlen = MIN(length, streamlen); #endif if(is_ascii85decode) { unsigned char *tmpbuf = cli_malloc(calculated_streamlen * 5); int ret; if(tmpbuf == NULL) { close(fout); unlink(fullname); rc = CL_EMEM; continue; } ret = ascii85decode(streamstart, calculated_streamlen, tmpbuf); if(ret == -1) { free(tmpbuf); close(fout); unlink(fullname); rc = CL_EFORMAT; continue; } if(ret) { unsigned char *t; real_streamlen = ret; /* free unused trailing bytes */ t = (unsigned char *)cli_realloc(tmpbuf, calculated_streamlen); if(t == NULL) { free(tmpbuf); close(fout); unlink(fullname); rc = CL_EMEM; continue; } tmpbuf = t; /* * Note that it will probably be both * ascii85encoded and flateencoded */ if(is_flatedecode) { const int zstat = try_flatedecode((unsigned char *)tmpbuf, real_streamlen, real_streamlen, fout, ctx); if(zstat != Z_OK) rc = CL_EZIP; } else cli_writen(fout, (const char *)streamstart, real_streamlen); } free(tmpbuf); } else if(is_flatedecode) { const int zstat = try_flatedecode((unsigned char *)streamstart, real_streamlen, calculated_streamlen, fout, ctx); if(zstat != Z_OK) rc = CL_EZIP; } else { cli_dbgmsg("cli_pdf: writing %lu bytes from the stream\n", (unsigned long)real_streamlen); cli_writen(fout, (const char *)streamstart, real_streamlen); } close(fout); md5digest = cli_md5file(fullname); if(tableFind(md5table, md5digest) >= 0) { cli_dbgmsg("cli_pdf: not scanning duplicate embedded file '%s'\n", fullname); unlink(fullname); } else tableInsert(md5table, md5digest, 1); free(md5digest); cli_dbgmsg("cli_pdf: extracted to %s\n", fullname); } if(alloced) free(alloced); else munmap(buf, size); tableDestroy(md5table); cli_dbgmsg("cli_pdf: returning %d\n", rc); return rc; } /* flate inflation - returns zlib status, e.g. Z_OK */ static int try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, const cli_ctx *ctx) { int ret = flatedecode(buf, real_len, fout, ctx); if(ret == Z_OK) return Z_OK; if(real_len == calculated_len) { /* * Nothing more we can do to inflate */ cli_warnmsg("Bad compression in flate stream\n"); return ret; } ret = flatedecode(buf, calculated_len, fout, ctx); if(ret == Z_OK) return Z_OK; /* i.e. the PDF file is broken :-( */ cli_warnmsg("cli_pdf: Bad compressed block length in flate stream\n"); return ret; } static int flatedecode(unsigned char *buf, off_t len, int fout, const cli_ctx *ctx) { int zstat; off_t nbytes; z_stream stream; unsigned char output[BUFSIZ]; #ifdef SAVE_TMP char tmpfilename[16]; int tmpfd; #endif cli_dbgmsg("cli_pdf: flatedecode %lu bytes\n", (unsigned long)len); if(len == 0) { cli_warnmsg("cli_pdf: flatedecode len == 0\n"); return Z_OK; } #ifdef SAVE_TMP /* * Copy the embedded area for debugging, so that if it falls over * we have a copy of the offending data. This is debugging code * that you shouldn't of course install in a live environment. I am * not interested in hearing about security issues with this section * of the parser. */ strcpy(tmpfilename, "/tmp/pdfXXXXXX"); tmpfd = mkstemp(tmpfilename); if(tmpfd < 0) { perror(tmpfilename); cli_errmsg("Can't make debugging file\n"); } else { FILE *tmpfp = fdopen(tmpfd, "w"); if(tmpfp) { fwrite(buf, sizeof(char), len, tmpfp); fclose(tmpfp); cli_dbgmsg("cli_pdf: flatedecode: debugging file is %s\n", tmpfilename); } else cli_errmsg("cli_pdf: can't fdopen debugging file\n"); } #endif stream.zalloc = (alloc_func)Z_NULL; stream.zfree = (free_func)Z_NULL; stream.opaque = (void *)NULL; stream.next_in = (Bytef *)buf; stream.avail_in = len; stream.next_out = output; stream.avail_out = sizeof(output); zstat = inflateInit(&stream); if(zstat != Z_OK) { cli_warnmsg("cli_pdf: inflateInit failed"); return zstat; } nbytes = 0; while(stream.avail_in) { zstat = inflate(&stream, Z_NO_FLUSH); /* zlib */ switch(zstat) { case Z_OK: if(stream.avail_out == 0) { nbytes += cli_writen(fout, output, sizeof(output)); if(ctx->limits && ctx->limits->maxfilesize && (nbytes > (off_t) ctx->limits->maxfilesize)) { cli_dbgmsg("cli_pdf: flatedecode size exceeded (%lu)\n", (unsigned long)nbytes); inflateEnd(&stream); *ctx->virname = "PDF.ExceededFileSize"; return Z_DATA_ERROR; } stream.next_out = output; stream.avail_out = sizeof(output); } continue; case Z_STREAM_END: break; default: if(stream.msg) cli_dbgmsg("pdf: after writing %lu bytes, got error \"%s\" inflating PDF attachment\n", (unsigned long)nbytes, stream.msg); else cli_dbgmsg("pdf: after writing %lu bytes, got error %d inflating PDF attachment\n", (unsigned long)nbytes, zstat); inflateEnd(&stream); return zstat; } break; } if(stream.avail_out != sizeof(output)) if(cli_writen(fout, output, sizeof(output) - stream.avail_out) < 0) return Z_STREAM_ERROR; cli_dbgmsg("cli_pdf: flatedecode in=%lu out=%lu ratio %ld (max %d)\n", stream.total_in, stream.total_out, stream.total_out / stream.total_in, ctx->limits ? ctx->limits->maxratio : 0); if(ctx->limits && ctx->limits->maxratio && BLOCKMAX && ((stream.total_out / stream.total_in) > ctx->limits->maxratio)) { cli_dbgmsg("cli_pdf: flatedecode Max ratio reached\n"); inflateEnd(&stream); *ctx->virname = "Oversized.PDF"; return Z_DATA_ERROR; } #ifdef SAVE_TMP unlink(tmpfilename); #endif return inflateEnd(&stream); } /* * ascii85 inflation, returns number of bytes in output, -1 for error * * See http://www.piclist.com/techref/method/encode.htm (look for base85) */ static int ascii85decode(const char *buf, off_t len, unsigned char *output) { const char *ptr; uint32_t sum = 0; int quintet = 0; int ret = 0; if(cli_pmemstr(buf, len, "~>", 2) == NULL) cli_warnmsg("ascii85decode: no EOF marker found\n"); ptr = buf; cli_dbgmsg("cli_pdf: ascii85decode %lu bytes\n", (unsigned long)len); while(len > 0) { int byte = (len--) ? (int)*ptr++ : EOF; if((byte == '~') && (*ptr == '>')) byte = EOF; if(byte >= '!' && byte <= 'u') { sum = (sum * 85) + ((uint32_t)byte - '!'); if(++quintet == 5) { *output++ = (unsigned char)(sum >> 24); *output++ = (unsigned char)((sum >> 16) & 0xFF); *output++ = (unsigned char)((sum >> 8) & 0xFF); *output++ = (unsigned char)(sum & 0xFF); ret += 4; quintet = 0; sum = 0; } } else if(byte == 'z') { if(quintet) { cli_warnmsg("ascii85decode: unexpected 'z'\n"); return -1; } *output++ = '\0'; *output++ = '\0'; *output++ = '\0'; *output++ = '\0'; ret += 4; } else if(byte == EOF) { cli_dbgmsg("ascii85decode: quintet %d\n", quintet); if(quintet) { int i; if(quintet == 1) { cli_warnmsg("ascii85Decode: only 1 byte in last quintet\n"); return -1; } for(i = quintet; i < 5; i++) sum *= 85; if(quintet > 1) sum += (0xFFFFFF >> ((quintet - 2) * 8)); ret += quintet; for(i = 0; i < quintet - 1; i++) *output++ = (unsigned char)((sum >> (24 - 8 * i)) & 0xFF); quintet = 0; } len = 0; break; } else if(!isspace(byte)) { cli_warnmsg("ascii85Decode: invalid character 0x%x, len %lu\n", byte & 0xFF, (unsigned long)len); return -1; } } return ret; } /* * Find the start of the next line */ static const char * pdf_nextlinestart(const char *ptr, size_t len) { while(strchr("\r\n", *ptr) == NULL) { if(--len == 0L) return NULL; ptr++; } while(strchr("\r\n", *ptr) != NULL) { if(--len == 0L) return NULL; ptr++; } return ptr; } /* * Return the start of the next PDF object. * This assumes that we're not in a stream. */ static const char * pdf_nextobject(const char *ptr, size_t len) { const char *p; int inobject = 1; while(len) { switch(*ptr) { case '\n': case '\r': case '%': /* comment */ p = pdf_nextlinestart(ptr, len); if(p == NULL) return NULL; len -= (size_t)(p - ptr); ptr = p; inobject = 0; break; case ' ': case '\t': case '[': /* Start of an array object */ case '\v': case '\f': case '<': /* Start of a dictionary object */ inobject = 0; ptr++; len--; break; case '/': /* Start of a name object */ return ptr; default: if(!inobject) /* TODO: parse and return object type */ return ptr; ptr++; len--; } } return NULL; } /* * like cli_memstr - but returns the location of the match * FIXME: need a case insensitive version */ static const char * cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns) { const char *pt, *hay; size_t n; if(haystack == needle) return haystack; if(hs < ns) return NULL; if(memcmp(haystack, needle, ns) == 0) return haystack; pt = hay = haystack; n = hs; while((pt = memchr(hay, needle[0], n)) != NULL) { n -= (size_t)(pt - hay); if(n < ns) break; if(memcmp(pt, needle, ns) == 0) return pt; if(hay == pt) { n--; hay++; } else hay = pt; } return NULL; } #else /*!HAVE_MMAP*/ #include "clamav.h" #include "others.h" #include "pdf.h" int cli_pdf(const char *dir, int desc, const cli_ctx *ctx) { cli_warnmsg("File not decoded - PDF decoding needs mmap() (for now)\n"); return CL_CLEAN; } #endif