/* * Copyright (C) 2007-2014 Sourcefire, Inc. * Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved. * * Authors: Nigel Horne, Török Edvin * * Also based on Matt Olney's pdf parser in snort-nrt. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. * * TODO: Embedded fonts * TODO: Predictor image handling */ #if HAVE_CONFIG_H #include "clamav-config.h" #endif #include #include #include #include #include #include #include #include #ifdef HAVE_LIMITS_H #include #endif #ifdef HAVE_UNISTD_H #include #endif #include #if HAVE_ICONV #include #endif #include "clamav.h" #include "others.h" #include "pdf.h" #include "scanners.h" #include "fmap.h" #include "str.h" #include "bytecode.h" #include "bytecode_api.h" #include "arc4.h" #include "rijndael.h" #include "textnorm.h" #include "conv.h" #include "json_api.h" #ifdef CL_DEBUG /*#define SAVE_TMP *Save the file being worked on in tmp */ #endif struct pdf_struct; static int asciihexdecode(const char *buf, off_t len, char *output); static int ascii85decode(const char *buf, off_t len, unsigned char *output); static const char *pdf_nextlinestart(const char *ptr, size_t len); static const char *pdf_nextobject(const char *ptr, size_t len); /* PDF statistics callbacks and related */ struct pdfname_action; #if HAVE_JSON static void pdf_export_json(struct pdf_struct *); static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void LZWDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void RunLengthDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void CCITTFaxDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void JBIG2Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void DCTDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void JPXDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Crypt_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Standard_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Sig_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void JavaScript_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Title_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Subject_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Keywords_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Pages_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); #endif /* End PDF statistics callbacks and related */ static int xrefCheck(const char *xref, const char *eof) { const char *q; while (xref < eof && (*xref == ' ' || *xref == '\n' || *xref == '\r')) xref++; if (xref + 4 >= eof) return -1; if (!memcmp(xref, "xref", 4)) { cli_dbgmsg("cli_pdf: found xref\n"); return 0; } /* could be xref stream */ for (q=xref; q+5 < eof; q++) { if (!memcmp(q,"/XRef",4)) { cli_dbgmsg("cli_pdf: found /XRef\n"); return 0; } } return -1; } /* define this to be noisy about things that we can't parse properly */ /*#define NOISY*/ #ifdef NOISY #define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__) #define noisy_warnmsg cli_warnmsg #else #define noisy_msg (void) #define noisy_warnmsg (void) #endif static const char *findNextNonWSBack(const char *q, const char *start) { while (q > start && (*q == 0 || *q == 9 || *q == 0xa || *q == 0xc || *q == 0xd || *q == 0x20)) q--; return q; } static int find_stream_bounds(const char *start, off_t bytesleft, off_t bytesleft2, off_t *stream, off_t *endstream, int newline_hack) { const char *q2, *q; if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) { q2 += 6; bytesleft -= q2 - start; if (bytesleft < 0) return 0; if (bytesleft >= 2 && q2[0] == '\xd' && q2[1] == '\xa') { q2 += 2; if (newline_hack && (bytesleft > 2) && q2[0] == '\xa') q2++; } else if (bytesleft && q2[0] == '\xa') { q2++; } *stream = q2 - start; bytesleft2 -= q2 - start; if (bytesleft2 <= 0) return 0; q = q2; q2 = cli_memstr(q, bytesleft2, "endstream", 9); if (!q2) q2 = q + bytesleft2-9; /* till EOF */ *endstream = q2 - start; if (*endstream < *stream) *endstream = *stream; return 1; } return 0; } /* Expected returns: 1 if success, 0 if no more objects, -1 if error */ int pdf_findobj(struct pdf_struct *pdf) { const char *start, *q, *q2, *q3, *eof; struct pdf_obj *obj; off_t bytesleft; unsigned genid, objid; pdf->nobjs++; pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs); if (!pdf->objs) { cli_warnmsg("cli_pdf: out of memory parsing objects (%u)\n", pdf->nobjs); return -1; } obj = &pdf->objs[pdf->nobjs-1]; memset(obj, 0, sizeof(*obj)); start = pdf->map+pdf->offset; bytesleft = pdf->size - pdf->offset; while (bytesleft > 0) { q2 = cli_memstr(start, bytesleft, "obj", 3); if (!q2) return 0;/* no more objs */ q2--; bytesleft -= q2 - start; if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) { start = q2+4; bytesleft -= 4; continue; } break; } if (bytesleft <= 0) return 0; q = findNextNonWSBack(q2-1, start); while (q > start && isdigit(*q)) q--; genid = atoi(q); q = findNextNonWSBack(q-1,start); while (q > start && isdigit(*q)) q--; objid = atoi(q); obj->id = (objid << 8) | (genid&0xff); obj->start = q2+4 - pdf->map; obj->flags = 0; bytesleft -= 4; eof = pdf->map + pdf->size; q = pdf->map + obj->start; while (q < eof && bytesleft > 0) { off_t p_stream, p_endstream; q2 = pdf_nextobject(q, bytesleft); if (!q2) q2 = pdf->map + pdf->size; bytesleft -= q2 - q; if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream, 1)) { obj->flags |= 1 << OBJ_STREAM; q2 = q-1 + p_endstream + 9; bytesleft -= q2 - q + 1; if (bytesleft < 0) { obj->flags |= 1 << OBJ_TRUNCATED; pdf->offset = pdf->size; return 1;/* truncated */ } } else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) { q2 = q3 + 6; pdf->offset = q2 - pdf->map; return 1; /* obj found and offset positioned */ } else { q2++; bytesleft--; } q = q2; } obj->flags |= 1 << OBJ_TRUNCATED; pdf->offset = pdf->size; return 1;/* truncated */ } static int filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, off_t len, off_t *sum) { UNUSEDPARAM(obj); if (cli_checklimits("pdf", pdf->ctx, *sum, 0, 0)) return len; /* pretend it was a successful write to suppress CL_EWRITE */ *sum += len; return cli_writen(fout, buf, len); } static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag) { const char *s= ""; pdf->flags |= 1 << flag; if (!cli_debug_flag) return; switch (flag) { case UNTERMINATED_OBJ_DICT: s = "dictionary not terminated"; break; case ESCAPED_COMMON_PDFNAME: /* like /JavaScript */ s = "escaped common pdfname"; break; case BAD_STREAM_FILTERS: s = "duplicate stream filters"; break; case BAD_PDF_VERSION: s = "bad pdf version"; break; case BAD_PDF_HEADERPOS: s = "bad pdf header position"; break; case BAD_PDF_TRAILER: s = "bad pdf trailer"; break; case BAD_PDF_TOOMANYOBJS: s = "too many pdf objs"; break; case BAD_FLATE: s = "bad deflate stream"; break; case BAD_FLATESTART: s = "bad deflate stream start"; break; case BAD_STREAMSTART: s = "bad stream start"; break; case UNKNOWN_FILTER: s = "unknown filter used"; break; case BAD_ASCIIDECODE: s = "bad ASCII decode"; break; case HEX_JAVASCRIPT: s = "hex javascript"; break; case BAD_INDOBJ: s = "referencing nonexistent obj"; break; case HAS_OPENACTION: s = "has /OpenAction"; break; case HAS_LAUNCHACTION: s = "has /LaunchAction"; break; case BAD_STREAMLEN: s = "bad /Length, too small"; break; case ENCRYPTED_PDF: s = "PDF is encrypted"; break; case LINEARIZED_PDF: s = "linearized PDF"; break; case MANY_FILTERS: s = "more than 2 filters per obj"; break; case DECRYPTABLE_PDF: s = "decryptable PDF"; break; } cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff); } static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj, const char *buf, off_t len, int fout, off_t *sum) { int skipped = 0; int zstat; z_stream stream; off_t nbytes; char output[BUFSIZ]; if (len == 0) return CL_CLEAN; if (*buf == '\r') { buf++; len--; pdfobj_flag(pdf, obj, BAD_STREAMSTART); /* PDF spec says stream is followed by \r\n or \n, but not \r alone. * Sample 0015315109, it has \r followed by zlib header. * Flag pdf as suspicious, and attempt to extract by skipping the \r. */ if (!len) return CL_CLEAN; } memset(&stream, 0, sizeof(stream)); stream.next_in = (Bytef *)buf; stream.avail_in = len; stream.next_out = (Bytef *)output; stream.avail_out = sizeof(output); zstat = inflateInit(&stream); if(zstat != Z_OK) { cli_warnmsg("cli_pdf: inflateInit failed\n"); return CL_EMEM; } nbytes = 0; while(stream.avail_in) { int written; zstat = inflate(&stream, Z_NO_FLUSH); /* zlib */ switch(zstat) { case Z_OK: if(stream.avail_out == 0) { if ((written=filter_writen(pdf, obj, fout, output, sizeof(output), sum))!=sizeof(output)) { cli_errmsg("cli_pdf: failed to write output file\n"); inflateEnd(&stream); return CL_EWRITE; } nbytes += written; stream.next_out = (Bytef *)output; stream.avail_out = sizeof(output); } continue; case Z_STREAM_END: default: written = sizeof(output) - stream.avail_out; if (!written && !nbytes && !skipped) { /* skip till EOL, and try inflating from there, sometimes * PDFs contain extra whitespace */ const char *q = pdf_nextlinestart(buf, len); if (q) { skipped = 1; inflateEnd(&stream); len -= q - buf; buf = q; stream.next_in = (Bytef *)buf; stream.avail_in = len; stream.next_out = (Bytef *)output; stream.avail_out = sizeof(output); zstat = inflateInit(&stream); if(zstat != Z_OK) { cli_warnmsg("cli_pdf: inflateInit failed\n"); return CL_EMEM; } pdfobj_flag(pdf, obj, BAD_FLATESTART); continue; } } if (filter_writen(pdf, obj, fout, output, written, sum)!=written) { cli_errmsg("cli_pdf: failed to write output file\n"); inflateEnd(&stream); return CL_EWRITE; } nbytes += written; stream.next_out = (Bytef *)output; stream.avail_out = sizeof(output); if (zstat == Z_STREAM_END) break; if(stream.msg) cli_dbgmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n", (unsigned long)nbytes, stream.msg, obj->id>>8, obj->id&0xff); else cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n", (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff); if(stream.msg) noisy_warnmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n", (unsigned long)nbytes, stream.msg, obj->id>>8, obj->id&0xff); else noisy_warnmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n", (unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff); /* mark stream as bad only if not encrypted */ inflateEnd(&stream); if (!nbytes) { pdfobj_flag(pdf, obj, BAD_FLATESTART); cli_dbgmsg("filter_flatedecode: No bytes, returning CL_EFORMAT for this stream.\n"); return CL_EFORMAT; } else { pdfobj_flag(pdf, obj, BAD_FLATE); } return CL_CLEAN; } break; } if(stream.avail_out != sizeof(output)) { if(filter_writen(pdf, obj, fout, output, sizeof(output) - stream.avail_out, sum) < 0) { cli_errmsg("cli_pdf: failed to write output file\n"); inflateEnd(&stream); return CL_EWRITE; } } inflateEnd(&stream); return CL_CLEAN; } struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid) { uint32_t j; uint32_t i; /* search starting at previous obj (if exists) */ i = (obj != pdf->objs) ? obj - pdf->objs : 0; for (j=i;jnobjs;j++) { obj = &pdf->objs[j]; if (obj->id == objid) return obj; } /* restart search from beginning if not found */ for (j=0;jobjs[j]; if (obj->id == objid) return obj; } return NULL; } static int find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *start, off_t len) { int length; const char *q; q = cli_memstr(start, len, "/Length", 7); if (!q) return 0; q++; len -= q - start; start = pdf_nextobject(q, len); if (!start) return 0; /* len -= start - q; */ q = start; length = atoi(q); while (isdigit(*q)) q++; if (*q == ' ') { int genid; q++; genid = atoi(q); while(isdigit(*q)) q++; if (q[0] == ' ' && q[1] == 'R') { cli_dbgmsg("cli_pdf: length is in indirect object %u %u\n", length, genid); obj = find_obj(pdf, obj, (length << 8) | (genid&0xff)); if (!obj) { cli_dbgmsg("cli_pdf: indirect object not found\n"); return 0; } q = pdf_nextobject(pdf->map+obj->start, pdf->size - obj->start); if (!q) { cli_dbgmsg("cli_pdf: next object not found\n"); return 0; } length = atoi(q); } } /* limit length */ if (start - pdf->map + length+5 > pdf->size) length = pdf->size - (start - pdf->map)-5; return length; } #define DUMP_MASK ((1 << OBJ_CONTENTS) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_A85) | (1 << OBJ_EMBEDDED_FILE) | (1 << OBJ_JAVASCRIPT) | (1 << OBJ_OPENACTION) | (1 << OBJ_LAUNCHACTION)) static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary) { unsigned i = obj - pdf->objs; i++; if (i < pdf->nobjs) { int s = pdf->objs[i].start - obj->start - 4; if (s > 0) { if (!binary) { const char *p = pdf->map + obj->start; const char *q = p + s; while (q > p && (isspace(*q) || isdigit(*q))) q--; if (q > p+5 && !memcmp(q-5,"endobj",6)) q -= 6; q = findNextNonWSBack(q, p); q++; return q - p; } return s; } } if (binary) return pdf->size - obj->start; return pdf->offset - obj->start - 6; } static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd, int dumpid) { int ret; struct cli_bc_ctx *bc_ctx; cli_ctx *ctx = pdf->ctx; fmap_t *map; UNUSEDPARAM(dumpid); bc_ctx = cli_bytecode_context_alloc(); if (!bc_ctx) { cli_errmsg("cli_pdf: can't allocate memory for bc_ctx"); return CL_EMEM; } map = *ctx->fmap; if (fd != -1) { map = fmap(fd, 0, 0); if (!map) { cli_warnmsg("can't mmap pdf extracted obj\n"); map = *ctx->fmap; fd = -1; } } cli_bytecode_context_setpdf(bc_ctx, phase, pdf->nobjs, pdf->objs, &pdf->flags, pdf->size, pdf->startoff); cli_bytecode_context_setctx(bc_ctx, ctx); ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_PDF, map); cli_bytecode_context_destroy(bc_ctx); if (fd != -1) funmap(map); return ret; } static void dbg_printhex(const char *msg, const char *hex, unsigned len); static void aes_decrypt(const unsigned char *in, off_t *length, unsigned char *q, char *key, unsigned key_n, int has_iv) { unsigned long rk[RKLENGTH(256)]; unsigned char iv[16]; unsigned len = *length; unsigned char pad, i; int nrounds; cli_dbgmsg("cli_pdf: aes_decrypt: key length: %d, data length: %d\n", key_n, (int)*length); if (key_n > 32) { cli_dbgmsg("cli_pdf: aes_decrypt: key length is %d!\n", key_n*8); return; } if (len < 32) { cli_dbgmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len); noisy_warnmsg("cli_pdf: aes_decrypt: len is <32: %d\n", len); return; } if (has_iv) { memcpy(iv, in, 16); in += 16; len -= 16; } else { memset(iv, 0, sizeof(iv)); } cli_dbgmsg("aes_decrypt: Calling rijndaelSetupDecrypt\n"); nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n*8); cli_dbgmsg("aes_decrypt: Beginning rijndaelDecrypt\n"); while (len >= 16) { unsigned i; rijndaelDecrypt(rk, nrounds, in, q); for (i=0;i<16;i++) q[i] ^= iv[i]; memcpy(iv, in, 16); q += 16; in += 16; len -= 16; } if (has_iv) { len += 16; pad = q[-1]; if (pad > 0x10) { cli_dbgmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16); noisy_warnmsg("cli_pdf: aes_decrypt: bad pad: %x (extra len: %d)\n", pad, len-16); *length -= len; return; } q -= pad; for (i=1;i>8, id&0xff); return NULL; } n = pdf->keylen + 5; if (enc_method == ENC_AESV2) n += 4; key = cli_malloc(n); if (!key) { noisy_warnmsg("decrypt_any: malloc failed\n"); return NULL; } memcpy(key, pdf->key, pdf->keylen); q = key + pdf->keylen; *q++ = id >> 8; *q++ = id >> 16; *q++ = id >> 24; *q++ = id; *q++ = 0; if (enc_method == ENC_AESV2) memcpy(q, "sAlT", 4); cl_hash_data("md5", key, n, result, NULL); free(key); n = pdf->keylen + 5; if (n > 16) n = 16; q = cli_malloc(*length); if (!q) { noisy_warnmsg("decrypt_any: malloc failed\n"); return NULL; } switch (enc_method) { case ENC_V2: cli_dbgmsg("cli_pdf: enc is v2\n"); memcpy(q, in, *length); arc4_init(&arc4, result, n); arc4_apply(&arc4, q, *length); noisy_msg(pdf, "decrypted ARC4 data\n"); break; case ENC_AESV2: cli_dbgmsg("cli_pdf: enc is aesv2\n"); aes_decrypt((const unsigned char *)in, length, q, (char *)result, n, 1); noisy_msg(pdf, "decrypted AES(v2) data\n"); break; case ENC_AESV3: cli_dbgmsg("cli_pdf: enc is aesv3\n"); if (pdf->keylen == 0) { cli_dbgmsg("cli_pdf: no key\n"); return NULL; } aes_decrypt((const unsigned char *)in, length, q, pdf->key, pdf->keylen, 1); noisy_msg(pdf, "decrypted AES(v3) data\n"); break; case ENC_IDENTITY: cli_dbgmsg("cli_pdf: enc is identity\n"); memcpy(q, in, *length); noisy_msg(pdf, "identity encryption\n"); break; case ENC_NONE: cli_dbgmsg("cli_pdf: enc is none\n"); noisy_msg(pdf, "encryption is none\n"); free(q); return NULL; case ENC_UNKNOWN: cli_dbgmsg("cli_pdf: enc is unknown\n"); free(q); noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n", id>>8,id&0xff); return NULL; } return (char *)q; } enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj) { if (obj->flags & (1 << OBJ_EMBEDDED_FILE)) return pdf->enc_method_embeddedfile; if (obj->flags & (1 << OBJ_STREAM)) return pdf->enc_method_stream; return pdf->enc_method_string; } enum cstate { CSTATE_NONE, CSTATE_TJ, CSTATE_TJ_PAROPEN }; static void process(struct text_norm_state *s, enum cstate *st, const char *buf, int length, int fout) { do { switch (*st) { case CSTATE_NONE: if (*buf == '[') { *st = CSTATE_TJ; } else { const char *nl = memchr(buf, '\n', length); if (!nl) return; length -= nl - buf; buf = nl; } break; case CSTATE_TJ: if (*buf == '(') *st = CSTATE_TJ_PAROPEN; break; case CSTATE_TJ_PAROPEN: if (*buf == ')') { *st = CSTATE_TJ; } else { if (text_normalize_buffer(s, (const unsigned char *)buf, 1) != 1) { cli_writen(fout, s->out, s->out_pos); text_normalize_reset(s); } } break; } buf++; length--; } while (length > 0); } static int pdf_scan_contents(int fd, struct pdf_struct *pdf) { struct text_norm_state s; char fullname[1024]; char outbuff[BUFSIZ]; char inbuf[BUFSIZ]; int fout, n, rc; enum cstate st = CSTATE_NONE; snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u_c", pdf->dir, (pdf->files-1)); fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); if (fout < 0) { char err[128]; cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err))); return CL_ETMPFILE; } text_normalize_init(&s, (unsigned char *)outbuff, sizeof(outbuff)); while (1) { n = cli_readn(fd, inbuf, sizeof(inbuf)); if (n <= 0) break; process(&s, &st, inbuf, n, fout); } cli_writen(fout, s.out, s.out_pos); lseek(fout, 0, SEEK_SET); rc = cli_magic_scandesc(fout, pdf->ctx); close(fout); if (!pdf->ctx->engine->keeptmp) if (cli_unlink(fullname) && rc != CL_VIRUS) rc = CL_EUNLINK; return rc; } static const char *pdf_getdict(const char *q0, int* len, const char *key); static char *pdf_readval(const char *q, int len, const char *key); static enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def); static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape); int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) { char fullname[NAME_MAX + 1]; int fout; off_t sum = 0; int rc = CL_SUCCESS; char *ascii_decoded = NULL; char *decrypted = NULL; int dump = 1; cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id>>8, obj->id&0xff); /* TODO: call bytecode hook here, allow override dumpability */ if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) { /* don't dump all streams */ dump = 0; } if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) { /* don't dump / scan non-JPG images */ dump = 0; } if (obj->flags & (1 << OBJ_FORCEDUMP)) { /* bytecode can force dump by setting this flag */ dump = 1; } if (!dump) return CL_CLEAN; cli_dbgmsg("cli_pdf: dumping obj %u %u\n", obj->id>>8, obj->id&0xff); snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++); fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600); if (fout < 0) { char err[128]; cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err))); free(ascii_decoded); return CL_ETMPFILE; } if (!(flags & PDF_EXTRACT_OBJ_SCAN)) obj->path = strdup(fullname); do { if (obj->flags & (1 << OBJ_STREAM)) { const char *start = pdf->map + obj->start; const char *flate_orig; off_t p_stream = 0, p_endstream = 0; off_t length, flate_orig_length; find_stream_bounds(start, pdf->size - obj->start, pdf->size - obj->start, &p_stream, &p_endstream, pdf->enc_method_stream <= ENC_IDENTITY && pdf->enc_method_embeddedfile <= ENC_IDENTITY); if (p_stream && p_endstream) { const char *flate_in; long ascii_decoded_size = 0; size_t size = p_endstream - p_stream; off_t orig_length; length = find_length(pdf, obj, start, p_stream); if (length < 0) length = 0; orig_length = length; if (length > pdf->size || obj->start + p_stream + length > pdf->size) { cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n", p_stream, length, pdf->size); noisy_warnmsg("length out of file, truncated: %ld + %ld > %ld\n", p_stream, length, pdf->size); length = pdf->size - (obj->start + p_stream); } if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) { const char *q = start + p_endstream; length = size; q--; if (*q == '\n') { q--; length--; if (*q == '\r') length--; } else if (*q == '\r') { length--; } if (length < 0) length = 0; cli_dbgmsg("cli_pdf: calculated length %ld\n", length); } else { if (size > (size_t)length+2) { cli_dbgmsg("cli_pdf: calculated length %ld < %ld\n", length, size); length = size; } } if (orig_length && size > (size_t)orig_length + 20) { cli_dbgmsg("cli_pdf: orig length: %ld, length: %ld, size: %ld\n", orig_length, length, size); pdfobj_flag(pdf, obj, BAD_STREAMLEN); } if (!length) { length = size; if (!length) { cli_dbgmsg("pdf_extract_obj: length and size both 0\n"); break; /* Empty stream, nothing to scan */ } } flate_orig = flate_in = start + p_stream; flate_orig_length = length; if (pdf->flags & (1 << DECRYPTABLE_PDF)) { enum enc_method enc = get_enc_method(pdf, obj); if (obj->flags & (1 << OBJ_FILTER_CRYPT)) { int len = p_stream; const char *q = pdf_getdict(start, &len, "/DecodeParams"); enc = ENC_IDENTITY; if (q && pdf->CF) { char *name = pdf_readval(q, len, "/Name"); cli_dbgmsg("cli_pdf: Crypt filter %s\n", name); if (name && strcmp(name, "/Identity")) enc = parse_enc_method(pdf->CF, pdf->CF_n, name, enc); free(name); } } if (cli_memstr(start, p_stream, "/XRef", 5)) { cli_dbgmsg("cli_pdf: cross reference stream, skipping\n"); } else { decrypted = decrypt_any(pdf, obj->id, flate_in, &length, enc); if (decrypted) flate_in = decrypted; } } if (obj->flags & (1 << OBJ_FILTER_AH)) { ascii_decoded = cli_malloc(length/2 + 1); if (!ascii_decoded) { cli_errmsg("Cannot allocate memory for ascii_decoded\n"); rc = CL_EMEM; break; } ascii_decoded_size = asciihexdecode(flate_in, length, ascii_decoded); } else if (obj->flags & (1 << OBJ_FILTER_A85)) { ascii_decoded = cli_malloc(length*5); if (!ascii_decoded) { cli_errmsg("Cannot allocate memory for ascii_decoded\n"); rc = CL_EMEM; break; } ascii_decoded_size = ascii85decode(flate_in, length, (unsigned char*)ascii_decoded); } if (ascii_decoded_size < 0) { /* don't flag for images or truncated objs*/ if (!(obj->flags & ((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED)))) pdfobj_flag(pdf, obj, BAD_ASCIIDECODE); cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff); free(ascii_decoded); ascii_decoded = NULL; /* attempt to directly flatedecode it */ } /* either direct or ascii-decoded input */ if (!ascii_decoded) ascii_decoded_size = length; else flate_in = ascii_decoded; if (obj->flags & (1 << OBJ_FILTER_FLATE)) { cli_dbgmsg("cli_pdf: deflate len %ld (orig %ld)\n", ascii_decoded_size, (long)orig_length); rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum); if (rc == CL_EFORMAT) { if (decrypted) { flate_in = flate_orig; ascii_decoded_size = flate_orig_length; } cli_dbgmsg("cli_pdf: dumping raw stream (probably encrypted)\n"); noisy_warnmsg("cli_pdf: dumping raw stream, probably encrypted and we failed to decrypt'n"); if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size) { cli_errmsg("cli_pdf: failed to write output file\n"); return CL_EWRITE; } } } else { if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size) rc = CL_EWRITE; } } else { noisy_warnmsg("cannot find stream bounds for obj %u %u\n", obj->id>>8, obj->id&0xff); } } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { const char *q2; const char *q = pdf->map+obj->start; /* TODO: get obj-endobj size */ off_t bytesleft = obj_size(pdf, obj, 0); if (bytesleft < 0) break; do { char *js = NULL; off_t js_len = 0; const char *q3; q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); if (!q2) break; bytesleft -= q2 - q + 11; q = q2 + 11; js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1<flags & (1 << DECRYPTABLE_PDF)) { cli_dbgmsg("cli_pdf: encrypted string\n"); decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); if (decrypted) { noisy_msg(pdf, "decrypted Javascript string from obj %u %u\n", obj->id>>8,obj->id&0xff); out = decrypted; } } if (filter_writen(pdf, obj, fout, out, js_len, &sum) != js_len) { rc = CL_EWRITE; free(js); break; } free(js); cli_dbgmsg("bytesleft: %d\n", (int)bytesleft); if (bytesleft > 0) { q2 = pdf_nextobject(q, bytesleft); if (!q2) q2 = q + bytesleft - 1; /* non-conforming PDFs that don't escape ) properly */ q3 = memchr(q, ')', bytesleft); if (q3 && q3 < q2) q2 = q3; while (q2 > q && q2[-1] == ' ') q2--; if (q2 > q) { q--; filter_writen(pdf, obj, fout, q, q2 - q, &sum); q++; } } } } while (bytesleft > 0); } else { off_t bytesleft = obj_size(pdf, obj, 0); if (bytesleft < 0) rc = CL_EFORMAT; else if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft,&sum) != bytesleft) rc = CL_EWRITE; } } while (0); cli_dbgmsg("cli_pdf: extracted %ld bytes %u %u obj\n", sum, obj->id>>8, obj->id&0xff); cli_dbgmsg(" ... to %s\n", fullname); if (flags & PDF_EXTRACT_OBJ_SCAN && sum) { int rc2; cli_updatelimits(pdf->ctx, sum); /* TODO: invoke bytecode on this pdf obj with metainformation associated */ lseek(fout, 0, SEEK_SET); rc2 = cli_magic_scandesc(fout, pdf->ctx); if (rc2 == CL_VIRUS || rc == CL_SUCCESS) rc = rc2; if ((rc == CL_CLEAN) || ((rc == CL_VIRUS) && (pdf->ctx->options & CL_SCAN_ALLMATCHES))) { rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, obj - pdf->objs); if (rc2 == CL_VIRUS) rc = rc2; } if (((rc == CL_CLEAN) || ((rc == CL_VIRUS) && (pdf->ctx->options & CL_SCAN_ALLMATCHES))) && (obj->flags & (1 << OBJ_CONTENTS))) { lseek(fout, 0, SEEK_SET); cli_dbgmsg("cli_pdf: dumping contents %u %u\n", obj->id>>8, obj->id&0xff); rc2 = pdf_scan_contents(fout, pdf); if (rc2 == CL_VIRUS) rc = rc2; noisy_msg(pdf, "extracted text from obj %u %u\n", obj->id>>8, obj->id&0xff); } } close(fout); free(ascii_decoded); free(decrypted); if (flags & PDF_EXTRACT_OBJ_SCAN && !pdf->ctx->engine->keeptmp) if (cli_unlink(fullname) && rc != CL_VIRUS) rc = CL_EUNLINK; return rc; } enum objstate { STATE_NONE, STATE_S, STATE_FILTER, STATE_JAVASCRIPT, STATE_OPENACTION, STATE_LINEARIZED, STATE_LAUNCHACTION, STATE_CONTENTS, STATE_ANY /* for actions table below */ }; #define NAMEFLAG_NONE 0x0 #define NAMEFLAG_HEURISTIC 0x1 struct pdfname_action { const char *pdfname; enum pdf_objflags set_objflag;/* OBJ_DICT is noop */ enum objstate from_state;/* STATE_NONE is noop */ enum objstate to_state; uint32_t nameflags; #if HAVE_JSON void (*pdf_stats_cb)(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); #endif }; #if HAVE_JSON static struct pdfname_action pdfname_actions[] = { {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb}, {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb}, {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, EmbeddedFile_cb}, {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb}, {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb}, {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Image_cb}, {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb}, {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb}, {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb}, {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb}, {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb}, {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb}, {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JBIG2Decode_cb}, {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb}, {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb}, {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JPXDecode_cb}, {"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, Crypt_cb}, {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, Standard_cb}, {"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, Sig_cb}, {"V", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, {"R", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NAMEFLAG_HEURISTIC, NULL}, {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NAMEFLAG_HEURISTIC, NULL}, {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT, NAMEFLAG_HEURISTIC, JavaScript_cb}, {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, {"S", OBJ_DICT, STATE_NONE, STATE_S, NAMEFLAG_HEURISTIC, NULL}, {"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, NAMEFLAG_HEURISTIC, OpenAction_cb}, {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, NAMEFLAG_HEURISTIC, Launch_cb}, {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Page_cb}, {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NAMEFLAG_HEURISTIC, NULL}, {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Author_cb}, {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Producer_cb}, {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, CreationDate_cb}, {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, ModificationDate_cb}, {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Creator_cb}, {"Title", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Title_cb}, {"Keywords", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Keywords_cb}, {"Subject", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Subject_cb}, {"Pages", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Pages_cb}, {"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb}, {"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb}, {"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb}, {"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb} }; #else static struct pdfname_action pdfname_actions[] = { {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC}, {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC}, {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC}, {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC}, {"V", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC}, {"R", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC}, {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NAMEFLAG_HEURISTIC}, {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NAMEFLAG_HEURISTIC}, {"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT, NAMEFLAG_HEURISTIC}, {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC}, {"S", OBJ_DICT, STATE_NONE, STATE_S, NAMEFLAG_HEURISTIC}, {"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC}, {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, NAMEFLAG_HEURISTIC}, {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, NAMEFLAG_HEURISTIC}, {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC}, {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NAMEFLAG_HEURISTIC} }; #endif #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT)) static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const char *pdfname, int escapes, enum objstate *state) { struct pdfname_action *act = NULL; unsigned j; obj->statsflags |= OBJ_FLAG_PDFNAME_DONE; for (j=0;jflags & (1 << OBJ_SIGNED)) && !(obj->flags & KNOWN_FILTERS)) { cli_dbgmsg("cli_pdf: unknown filter %s\n", pdfname); obj->flags |= 1 << OBJ_FILTER_UNKNOWN; } return; } if ((act->nameflags & NAMEFLAG_HEURISTIC) && escapes) { /* if a commonly used PDF name is escaped that is certainly suspicious. */ cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname); pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME); } #if HAVE_JSON if ((act->pdf_stats_cb)) act->pdf_stats_cb(pdf, obj, act); #endif if (act->from_state == *state || act->from_state == STATE_ANY) { *state = act->to_state; if (*state == STATE_FILTER && act->set_objflag !=OBJ_DICT && (obj->flags & (1 << act->set_objflag))) { cli_dbgmsg("cli_pdf: duplicate stream filter %s\n", pdfname); pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS); } obj->flags |= 1 << act->set_objflag; } else { /* auto-reset states */ switch (*state) { case STATE_S: *state = STATE_NONE; break; default: break; } } } static int pdf_readint(const char *q0, int len, const char *key); static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len) { const char *q, *q2; uint32_t objid; if (len >= 16 && !strncmp(enc, "/EncryptMetadata", 16)) { q = cli_memstr(enc+16, len-16, "/Encrypt", 8); if (!q) return; len -= q - enc; enc = q; } q = enc + 8; len -= 8; q2 = pdf_nextobject(q, len); if (!q2 || !isdigit(*q2)) return; objid = atoi(q2) << 8; len -= q2 - q; q = q2; q2 = pdf_nextobject(q, len); if (!q2 || !isdigit(*q2)) return; objid |= atoi(q2) & 0xff; len -= q2 - q; q = q2; q2 = pdf_nextobject(q, len); if (!q2 || *q2 != 'R') return; cli_dbgmsg("cli_pdf: Encrypt dictionary in obj %d %d\n", objid>>8, objid&0xff); pdf->enc_objid = objid; } static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length) { const char *enc; enc = cli_memstr(s, length, "/Encrypt", 8); if (enc) { char *newID; pdf->flags |= 1 << ENCRYPTED_PDF; pdf_parse_encrypt(pdf, enc, s + length - enc); newID = pdf_readstring(s, length, "/ID", &pdf->fileIDlen, NULL, 0); if (newID) { free(pdf->fileID); pdf->fileID = newID; } } } void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) { /* enough to hold common pdf names, we don't need all the names */ char pdfname[64]; const char *q2, *q3; const char *nextobj, *nextopen, *nextclose; const char *q = obj->start + pdf->map; const char *dict, *enddict, *start; off_t dict_length, full_dict_length; off_t objsize = obj_size(pdf, obj, 1); off_t bytesleft; unsigned i, filters=0; unsigned blockopens=0; enum objstate objstate = STATE_NONE; #if HAVE_JSON json_object *pdfobj=NULL, *jsonobj=NULL; #endif if (objsize < 0) return; start = q; bytesleft = objsize; /* find start of dictionary */ do { nextobj = pdf_nextobject(q, bytesleft); bytesleft -= nextobj -q; if (!nextobj || bytesleft < 0) { cli_dbgmsg("cli_pdf: %u %u obj: no dictionary\n", obj->id>>8, obj->id&0xff); #if HAVE_JSON if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; } if (pdfobj) { if (!(jsonobj)) jsonobj = cli_jsonarray(pdfobj, "ObjectsWithoutDictionaries"); if (jsonobj) cli_jsonint_array(jsonobj, obj->id>>8); } #endif return; } q3 = memchr(q-1, '<', nextobj-q+1); nextobj++; bytesleft--; q = nextobj; } while (!q3 || q3[1] != '<'); dict = q3+2; q = dict; blockopens++; bytesleft = objsize - (q - start); enddict = q + bytesleft - 1; /* find end of dictionary block */ if (bytesleft < 0) { cli_dbgmsg("cli_pdf: %u %u obj: broken dictionary\n", obj->id>>8, obj->id&0xff); #if HAVE_JSON if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; } if (pdfobj) { if (!(jsonobj)) jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries"); if (jsonobj) cli_jsonint_array(jsonobj, obj->id>>8); } #endif return; } /* while still looking ... */ while ((q < enddict-1) && (blockopens > 0)) { /* find next close */ nextclose = memchr(q, '>', enddict-q); if (nextclose && (nextclose[1] == '>')) { /* check for nested open */ while ((nextopen = memchr(q-1, '<', nextclose-q+1)) != NULL) { if (nextopen[1] == '<') { /* nested open */ blockopens++; q = nextopen + 2; } else { /* unmatched < before next close */ q = nextopen + 2; } } /* close block */ blockopens--; q = nextclose + 2; } else if (nextclose) { /* found one > but not two */ q = nextclose + 2; } else { /* next closing not found */ break; } } /* Was end of dictionary found? */ if (blockopens) { /* probably truncated */ cli_dbgmsg("cli_pdf: %u %u obj broken dictionary\n", obj->id>>8, obj->id&0xff); #if HAVE_JSON if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; } if (pdfobj) { if (!(jsonobj)) jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries"); if (jsonobj) cli_jsonint_array(jsonobj, obj->id>>8); } #endif return; } enddict = nextclose; obj->flags |= 1 << OBJ_DICT; full_dict_length = dict_length = enddict - dict; /* This code prints the dictionary content. { char * dictionary = malloc(dict_length + 1); if (dictionary) { for (i = 0; i < dict_length; i++) { if (isprint(dict[i]) || isspace(dict[i])) dictionary[i] = dict[i]; else dictionary[i] = '*'; } dictionary[dict_length] = '\0'; cli_dbgmsg("cli_pdf: dictionary is <<%s>>\n", dictionary); free(dictionary); } } */ /* process pdf names */ for (q = dict;dict_length > 0;) { int escapes = 0, breakout=0; q2 = memchr(q, '/', dict_length); if (!q2) break; dict_length -= q2 - q; q = q2; /* normalize PDF names */ for (i = 0;dict_length > 0 && (i < sizeof(pdfname)-1); i++) { q++; dict_length--; if (*q == '#') { if (cli_hex2str_to(q+1, pdfname+i, 2) == -1) break; q += 2; dict_length -= 2; escapes = 1; continue; } switch (*q) { case ' ': case '\t': case '\r': case '\n': case '/': case '>': case '[': case ']': case '<': case '(': breakout = 1; } if (breakout) break; pdfname[i] = *q; } pdfname[i] = '\0'; handle_pdfname(pdf, obj, pdfname, escapes, &objstate); if (objstate == STATE_LINEARIZED) { long trailer_end, trailer; pdfobj_flag(pdf, obj, LINEARIZED_PDF); objstate = STATE_NONE; trailer_end = pdf_readint(dict, full_dict_length, "/H"); if (trailer_end > 0 && trailer_end < pdf->size) { trailer = trailer_end - 1024; if (trailer < 0) trailer = 0; q2 = pdf->map + trailer; cli_dbgmsg("cli_pdf: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end); pdf_parse_trailer(pdf, q2, trailer_end - trailer); if (pdf->fileID) cli_dbgmsg("cli_pdf: found fileID\n"); } } if (objstate == STATE_LAUNCHACTION) pdfobj_flag(pdf, obj, HAS_LAUNCHACTION); if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || objstate == STATE_OPENACTION || objstate == STATE_CONTENTS)) { if (objstate == STATE_OPENACTION) pdfobj_flag(pdf, obj, HAS_OPENACTION); q2 = pdf_nextobject(q, dict_length); if (q2 && isdigit(*q2)) { uint32_t objid = atoi(q2) << 8; while (isdigit(*q2)) q2++; q2 = pdf_nextobject(q2, dict_length); if (q2 && isdigit(*q2)) { objid |= atoi(q2) & 0xff; q2 = pdf_nextobject(q2, dict_length); if (q2 && *q2 == 'R') { struct pdf_obj *obj2; cli_dbgmsg("cli_pdf: found %s stored in indirect object %u %u\n", pdfname, objid >> 8, objid&0xff); obj2 = find_obj(pdf, obj, objid); if (obj2) { enum pdf_objflags flag = objstate == STATE_JAVASCRIPT ? OBJ_JAVASCRIPT : objstate == STATE_OPENACTION ? OBJ_OPENACTION : OBJ_CONTENTS; obj2->flags |= 1 << flag; obj->flags &= ~(1 << flag); } else { pdfobj_flag(pdf, obj, BAD_INDOBJ); } } } } objstate = STATE_NONE; } } for (i=0;iflags & (1 << act->set_objflag)) && act->from_state == STATE_FILTER && act->to_state == STATE_FILTER && act->set_objflag != OBJ_FILTER_CRYPT && act->set_objflag != OBJ_FILTER_STANDARD) { filters++; } } if (filters > 2) { /* more than 2 non-crypt filters */ pdfobj_flag(pdf, obj, MANY_FILTERS); } if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS)) obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN); if (obj->flags & (1 << OBJ_FILTER_UNKNOWN)) pdfobj_flag(pdf, obj, UNKNOWN_FILTER); cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags); } static const char *pdf_getdict(const char *q0, int* len, const char *key) { const char *q; if (*len <= 0) { cli_dbgmsg("cli_pdf: bad length %d\n", *len); return NULL; } if (!q0) return NULL; q = cli_memstr(q0, *len, key, strlen(key)); if (!q) { cli_dbgmsg("cli_pdf: %s not found in dict\n", key); return NULL; } *len -= q - q0; q0 = q; q = pdf_nextobject(q0 + 1, *len - 1); if (!q) { cli_dbgmsg("cli_pdf: %s is invalid in dict\n", key); return NULL; } if (q[-1] == '<') q--; *len -= q - q0; return q; } static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, int noescape) { char *s, *s0; const char *start, *q, *end; if (slen) *slen = 0; if (qend) *qend = q0; q = pdf_getdict(q0, &len, key); if (!q) return NULL; if (*q == '(') { int paren = 1; start = ++q; for (;paren > 0 && len > 0; q++,len--) { switch (*q) { case '(': paren++; break; case ')': paren--; break; case '\\': q++; len--; break; default: break; } } if (qend) *qend = q; q--; len = q - start; s0 = s = cli_malloc(len + 1); if (!s) { cli_errmsg("pdf_readstring: Unable to allocate buffer\n"); return NULL; } end = start + len; if (noescape) { memcpy(s0, start, len); s = s0 + len; } else { for (q = start;q < end;q++) { if (*q != '\\') { *s++ = *q; } else { q++; switch (*q) { case 'n': *s++ = '\n'; break; case 'r': *s++ = '\r'; break; case 't': *s++ = '\t'; break; case 'b': *s++ = '\b'; break; case 'f': *s++ = '\f'; break; case '(':/* fall-through */ case ')':/* fall-through */ case '\\': *s++ = *q; break; case '\n': /* ignore */ break; case '\r': /* ignore */ if (q+1 < end && q[1] == '\n') q++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* octal escape */ if (q+2 < end) q++; *s++ = 64*(q[0] - '0') + 8*(q[1] - '0') + (q[2] - '0'); break; default: /* ignore */ *s++ = '\\'; q--; break; } } } } *s++ = '\0'; if (slen) *slen = s - s0 - 1; return s0; } if (*q == '<') { start = ++q; q = memchr(q+1, '>', len); if (!q) return NULL; if (qend) *qend = q; s = cli_malloc((q - start)/2 + 1); if (s == NULL) { /* oops, couldn't allocate memory */ cli_dbgmsg("cli_pdf: unable to allocate memory...\n"); return NULL; } if (cli_hex2str_to(start, s, q - start)) { cli_dbgmsg("cli_pdf: %s has bad hex value\n", key); free(s); return NULL; } s[(q-start)/2] = '\0'; if (slen) *slen = (q - start)/2; return s; } cli_dbgmsg("cli_pdf: %s is invalid string in dict\n", key); return NULL; } static char *pdf_readval(const char *q, int len, const char *key) { const char *end; char *s; q = pdf_getdict(q, &len, key); if (!q || len <= 0) return NULL; while (len > 0 && *q && *q == ' ') { q++; len--; } if (*q != '/') return NULL; q++; len--; end = q; while (len > 0 && *end && !(*end == '/' || (len > 1 && end[0] == '>' && end[1] == '>'))) { end++; len--; } s = cli_malloc(end - q + 1); if (!s) return NULL; memcpy(s, q, end-q); s[end-q] = '\0'; return s; } static int pdf_readint(const char *q0, int len, const char *key) { const char *q = pdf_getdict(q0, &len, key); return (q != NULL) ? atoi(q) : -1; } static int pdf_readbool(const char *q0, int len, const char *key, int Default) { const char *q = pdf_getdict(q0, &len, key); if (!q || len < 5) return Default; if (!strncmp(q, "true", 4)) return 1; if (!strncmp(q, "false", 5)) return 0; cli_dbgmsg("cli_pdf: invalid value for %s bool\n", key); return Default; } static const char *key_padding = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4e\x56\xff\xfa\x01\x08" "\x2e\x2e\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"; static void dbg_printhex(const char *msg, const char *hex, unsigned len) { if (cli_debug_flag) { char *kh = cli_str2hex(hex, len); cli_dbgmsg("cli_pdf: %s: %s\n", msg, kh); free(kh); } } static void check_user_password(struct pdf_struct *pdf, int R, const char *O, const char *U, int32_t P, int EM, const char *UE, unsigned length, unsigned oulen) { unsigned i; uint8_t result[16]; char data[32]; struct arc4_state arc4; unsigned password_empty = 0; UNUSEDPARAM(oulen); dbg_printhex("U: ", U, 32); dbg_printhex("O: ", O, 32); if (R == 5) { uint8_t result2[32]; /* supplement to ISO3200, 3.5.2 Algorithm 3.11 */ /* user validation salt */ cl_sha256(U+32, 8, result2, NULL); dbg_printhex("Computed U", (const char *)result2, 32); if (!memcmp(result2, U, 32)) { off_t n; /* Algorithm 3.2a could be used to recover encryption key */ password_empty = 1; cl_sha256(U+40, 8, result2, NULL); n = UE ? strlen(UE) : 0; if (n != 32) { cli_dbgmsg("cli_pdf: UE length is not 32: %d\n", (int)n); noisy_warnmsg("cli_pdf: UE length is not 32: %d\n", n); } else { pdf->keylen = 32; pdf->key = cli_malloc(32); if (!pdf->key) { cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n"); return; } aes_decrypt((const unsigned char *)UE, &n, (unsigned char *)(pdf->key), (char *)result2, 32, 0); dbg_printhex("cli_pdf: Candidate encryption key", pdf->key, pdf->keylen); } } } else if ((R >= 2) && (R <= 4)) { unsigned char *d; size_t sz = 68 + pdf->fileIDlen + (R >= 4 && !EM ? 4 : 0); d = calloc(1, sz); if (!(d)) return; memcpy(d, key_padding, 32); memcpy(d+32, O, 32); P = le32_to_host(P); memcpy(d+64, &P, 4); memcpy(d+68, pdf->fileID, pdf->fileIDlen); /* 7.6.3.3 Algorithm 2 */ /* empty password, password == padding */ if (R >= 4 && !EM) { uint32_t v = 0xFFFFFFFF; memcpy(d+68+pdf->fileIDlen, &v, 4); } cl_hash_data("md5", d, sz, result, NULL); free(d); if (length > 128) length = 128; if (R >= 3) { /* Yes, this really is on purpose */ for (i=0;i<50;i++) cl_hash_data("md5", result, length/8, result, NULL); } if (R == 2) length = 40; pdf->keylen = length / 8; pdf->key = cli_malloc(pdf->keylen); if (!pdf->key) return; memcpy(pdf->key, result, pdf->keylen); dbg_printhex("md5", (const char *)result, 16); dbg_printhex("Candidate encryption key", pdf->key, pdf->keylen); /* 7.6.3.3 Algorithm 6 */ if (R == 2) { /* 7.6.3.3 Algorithm 4 */ memcpy(data, key_padding, 32); arc4_init(&arc4, (const uint8_t *)(pdf->key), pdf->keylen); arc4_apply(&arc4, (uint8_t *)data, 32); dbg_printhex("computed U (R2)", data, 32); if (!memcmp(data, U, 32)) password_empty = 1; } else if (R >= 3) { unsigned len = pdf->keylen; unsigned char *d; d = calloc(1, 32 + pdf->fileIDlen); if (!(d)) return; /* 7.6.3.3 Algorithm 5 */ memcpy(d, key_padding, 32); memcpy(d+32, pdf->fileID, pdf->fileIDlen); cl_hash_data("md5", d, 32 + pdf->fileIDlen, result, NULL); memcpy(data, pdf->key, len); arc4_init(&arc4, (const uint8_t *)data, len); arc4_apply(&arc4, result, 16); for (i=1;i<=19;i++) { unsigned j; for (j=0;jkey[j] ^ i; arc4_init(&arc4, (const uint8_t *)data, len); arc4_apply(&arc4, result, 16); } dbg_printhex("fileID", pdf->fileID, pdf->fileIDlen); dbg_printhex("computed U (R>=3)", (const char *)result, 16); if (!memcmp(result, U, 16)) password_empty = 1; free(d); } else { cli_dbgmsg("cli_pdf: invalid revision %d\n", R); noisy_warnmsg("cli_pdf: invalid revision %d\n", R); } } else { /* Supported R is in {2,3,4,5} */ cli_dbgmsg("cli_pdf: R value out of range\n"); noisy_warnmsg("cli_pdf: R value out of range\n"); return; } if (password_empty) { cli_dbgmsg("cli_pdf: user password is empty\n"); noisy_msg(pdf, "cli_pdf: encrypted PDF found, user password is empty, will attempt to decrypt\n"); /* The key we computed above is the key used to encrypt the streams. * We could decrypt it now if we wanted to */ pdf->flags |= 1 << DECRYPTABLE_PDF; } else { /* the key is not valid, we would need the user or the owner password to decrypt */ cli_dbgmsg("cli_pdf: user/owner password would be required for decryption\n"); noisy_warnmsg("cli_pdf: encrypted PDF found, user password is NOT empty, cannot decrypt!\n"); } } static enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def) { const char *q; char *CFM = NULL; enum enc_method ret = ENC_UNKNOWN; if (!key) return def; if (!strcmp(key, "Identity")) return ENC_IDENTITY; q = pdf_getdict(dict, (int *)(&len), key); if (!q) return def; CFM = pdf_readval(q, len, "/CFM"); if (CFM) { cli_dbgmsg("cli_pdf: %s CFM: %s\n", key, CFM); if (!strncmp(CFM,"V2", 2)) ret = ENC_V2; else if (!strncmp(CFM,"AESV2",5)) ret = ENC_AESV2; else if (!strncmp(CFM,"AESV3",5)) ret = ENC_AESV3; else if (!strncmp(CFM,"None",4)) ret = ENC_NONE; free(CFM); } return ret; } void pdf_handle_enc(struct pdf_struct *pdf) { struct pdf_obj *obj; uint32_t len, n, R, P, length, EM = 1, i, oulen; char *O, *U, *UE, *StmF, *StrF, *EFF; const char *q, *q2; if (pdf->enc_objid == ~0u) return; if (!pdf->fileID) { cli_dbgmsg("cli_pdf: pdf_handle_enc no file ID\n"); noisy_warnmsg("cli_pdf: pdf_handle_enc no file ID\n"); return; } obj = find_obj(pdf, pdf->objs, pdf->enc_objid); if (!obj) { cli_dbgmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff); noisy_warnmsg("cli_pdf: can't find encrypted object %d %d\n", pdf->enc_objid>>8, pdf->enc_objid&0xff); return; } len = obj_size(pdf, obj, 1); q = pdf->map + obj->start; O = U = UE = StmF = StrF = EFF = NULL; do { pdf->enc_method_string = ENC_UNKNOWN; pdf->enc_method_stream = ENC_UNKNOWN; pdf->enc_method_embeddedfile = ENC_UNKNOWN; P = pdf_readint(q, len, "/P"); if (P == ~0u) { cli_dbgmsg("cli_pdf: invalid P\n"); noisy_warnmsg("cli_pdf: invalid P\n"); break; } q2 = cli_memstr(q, len, "/Standard", 9); if (!q2) { cli_dbgmsg("cli_pdf: /Standard not found\n"); noisy_warnmsg("cli_pdf: /Standard not found\n"); break; } /* we can have both of these: * /AESV2/Length /Standard/Length * /Length /Standard * make sure we don't mistake AES's length for Standard's */ length = pdf_readint(q2, len - (q2 - q), "/Length"); if (length == ~0u) length = pdf_readint(q, len, "/Length"); if (length < 40) { cli_dbgmsg("cli_pdf: invalid length: %d\n", length); length = 40; } R = pdf_readint(q, len, "/R"); if (R == ~0u) { cli_dbgmsg("cli_pdf: invalid R\n"); noisy_warnmsg("cli_pdf: invalid R\n"); break; } if ((R > 5) || (R < 2)) { cli_dbgmsg("cli_pdf: R value outside supported range [2..5]\n"); noisy_warnmsg("cli_pdf: R value outside supported range [2..5]\n"); break; } if (R < 5) oulen = 32; else oulen = 48; if (R == 2 || R == 3) { pdf->enc_method_stream = ENC_V2; pdf->enc_method_string = ENC_V2; pdf->enc_method_embeddedfile = ENC_V2; } else if (R == 4 || R == 5) { EM = pdf_readbool(q, len, "/EncryptMetadata", 1); StmF = pdf_readval(q, len, "/StmF"); StrF = pdf_readval(q, len, "/StrF"); EFF = pdf_readval(q, len, "/EFF"); n = len; pdf->CF = pdf_getdict(q, (int *)(&n), "/CF"); pdf->CF_n = n; if (StmF) cli_dbgmsg("cli_pdf: StmF: %s\n", StmF); if (StrF) cli_dbgmsg("cli_pdf: StrF: %s\n", StrF); if (EFF) cli_dbgmsg("cli_pdf: EFF: %s\n", EFF); pdf->enc_method_stream = parse_enc_method(pdf->CF, n, StmF, ENC_IDENTITY); pdf->enc_method_string = parse_enc_method(pdf->CF, n, StrF, ENC_IDENTITY); pdf->enc_method_embeddedfile = parse_enc_method(pdf->CF, n, EFF, pdf->enc_method_stream); free(StmF); free(StrF); free(EFF); cli_dbgmsg("cli_pdf: EncryptMetadata: %s\n", EM ? "true" : "false"); if (R == 4) { length = 128; } else { n = 0; UE = pdf_readstring(q, len, "/UE", &n, NULL, 0); length = 256; } } if (length == ~0u) length = 40; n = 0; O = pdf_readstring(q, len, "/O", &n, NULL, 0); if (!O || n < oulen) { cli_dbgmsg("cli_pdf: invalid O: %d\n", n); cli_dbgmsg("cli_pdf: invalid O: %d\n", n); if (O) dbg_printhex("invalid O", O, n); break; } if (n > oulen) { for (i=oulen;i oulen) { for (i=oulen;ifmap; size_t size = map->len - offset; off_t versize = size > 1032 ? 1032 : size; off_t map_off, bytesleft; long xref; const char *pdfver, *start, *eofmap, *q, *eof; int rc, badobjects = 0; unsigned i, alerts = 0; #if HAVE_JSON json_object *pdfobj=NULL; char *begin, *end, *p1; #endif cli_dbgmsg("in cli_pdf(%s)\n", dir); memset(&pdf, 0, sizeof(pdf)); pdf.ctx = ctx; pdf.dir = dir; pdf.enc_objid = ~0u; pdfver = start = fmap_need_off_once(map, offset, versize); /* Check PDF version */ if (!pdfver) { cli_errmsg("cli_pdf: mmap() failed (1)\n"); return CL_EMAP; } #if HAVE_JSON if (ctx->wrkproperty) pdfobj = cli_jsonobj(ctx->wrkproperty, "PDFStats"); #endif /* offset is 0 when coming from filetype2 */ pdfver = cli_memstr(pdfver, versize, "%PDF-", 5); if (!pdfver) { cli_dbgmsg("cli_pdf: no PDF- header found\n"); noisy_warnmsg("cli_pdf: no PDF- header found\n"); #if HAVE_JSON pdf_export_json(&pdf); #endif return CL_SUCCESS; } /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */ if (pdfver[5] != '1' || pdfver[6] != '.' || pdfver[7] < '1' || pdfver[7] > '9') { pdf.flags |= 1 << BAD_PDF_VERSION; cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver); #if HAVE_JSON if (pdfobj) cli_jsonbool(pdfobj, "BadVersion", 1); #endif } else { #if HAVE_JSON if (pdfobj) { begin = (char *)(pdfver+5); end = begin+2; strtoul(end, &end, 10); p1 = cli_calloc((end - begin) + 2, 1); if (p1) { strncpy(p1, begin, end - begin); p1[end - begin] = '\0'; cli_jsonstr(pdfobj, "PDFVersion", p1); free(p1); } } #endif } if (pdfver != start || offset) { pdf.flags |= 1 << BAD_PDF_HEADERPOS; cli_dbgmsg("cli_pdf: PDF header is not at position 0: %ld\n",pdfver-start+offset); #if HAVE_JSON if (pdfobj) cli_jsonbool(pdfobj, "BadVersionLocation", 1); #endif } offset += pdfver - start; /* find trailer and xref, don't fail if not found */ map_off = (off_t)map->len - 2048; if (map_off < 0) map_off = 0; bytesleft = map->len - map_off; eofmap = fmap_need_off_once(map, map_off, bytesleft); if (!eofmap) { cli_errmsg("cli_pdf: mmap() failed (2)\n"); #if HAVE_JSON pdf_export_json(&pdf); #endif return CL_EMAP; } eof = eofmap + bytesleft; for (q=&eofmap[bytesleft-5]; q > eofmap; q--) { if (memcmp(q, "%%EOF", 5) == 0) break; } if (q <= eofmap) { pdf.flags |= 1 << BAD_PDF_TRAILER; cli_dbgmsg("cli_pdf: %%%%EOF not found\n"); #if HAVE_JSON if (pdfobj) cli_jsonbool(pdfobj, "NoEOF", 1); #endif } else { const char *t; /*size = q - eofmap + map_off;*/ q -= 9; for (;q > eofmap;q--) { if (memcmp(q, "startxref", 9) == 0) break; } if (q <= eofmap) { pdf.flags |= 1 << BAD_PDF_TRAILER; cli_dbgmsg("cli_pdf: startxref not found\n"); #if HAVE_JSON if (pdfobj) cli_jsonbool(pdfobj, "NoXREF", 1); #endif } else { for (t=q;t > eofmap; t--) { if (memcmp(t,"trailer",7) == 0) break; } pdf_parse_trailer(&pdf, eofmap, eof - eofmap); q += 9; while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; } xref = atol(q); bytesleft = map->len - offset - xref; if (bytesleft > 4096) bytesleft = 4096; q = fmap_need_off_once(map, offset + xref, bytesleft); if (!q || xrefCheck(q, q+bytesleft) == -1) { cli_dbgmsg("cli_pdf: did not find valid xref\n"); pdf.flags |= 1 << BAD_PDF_TRAILER; } } } size -= offset; pdf.size = size; pdf.map = fmap_need_off(map, offset, size); if (!pdf.map) { cli_errmsg("cli_pdf: mmap() failed (3)\n"); #if HAVE_JSON pdf_export_json(&pdf); #endif return CL_EMAP; } pdf.startoff = offset; rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1, -1); if ((rc == CL_VIRUS) && SCAN_ALL) { cli_dbgmsg("cli_pdf: (pre hooks) returned %d\n", rc); alerts++; rc = CL_CLEAN; } else if (rc) { cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc); #if HAVE_JSON pdf_export_json(&pdf); #endif return rc == CL_BREAK ? CL_CLEAN : rc; } /* parse PDF and find obj offsets */ while ((rc = pdf_findobj(&pdf)) > 0) { struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1]; cli_dbgmsg("cli_pdf: found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset); } if (pdf.nobjs) pdf.nobjs--; if (rc == -1) pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS; /* must parse after finding all objs, so we can flag indirect objects */ for (i=0;idconf->other & OTHER_CONF_PDFNAMEOBJ)) { if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) { /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */ cli_append_virus(ctx, "Heuristics.PDF.ObfuscatedNameObject"); rc = cli_found_possibly_unwanted(ctx); } } #if 0 /* TODO: find both trailers, and /Encrypt settings */ if (pdf.flags & (1 << LINEARIZED_PDF)) pdf.flags &= ~ (1 << BAD_ASCIIDECODE); if (pdf.flags & (1 << MANY_FILTERS)) pdf.flags &= ~ (1 << BAD_ASCIIDECODE); if (!rc && (pdf.flags & ((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) | (1<') break; if (cli_hex2str_to(buf+i, output+j, 2) == -1) { if (len - i < 4) continue; return -1; } j++; i++; } return j; } /* * ascii85 inflation, returns number of bytes in output, -1 for error * * See http://www.piclist.com/techref/method/encode.htm (look for base85) */ static int ascii85decode(const char *buf, off_t len, unsigned char *output) { const char *ptr; uint32_t sum = 0; int quintet = 0; int ret = 0; if(cli_memstr(buf, len, "~>", 2) == NULL) cli_dbgmsg("cli_pdf: ascii85decode: no EOF marker found\n"); ptr = buf; cli_dbgmsg("cli_pdf: ascii85decode %lu bytes\n", (unsigned long)len); while(len > 0) { int byte = (len--) ? (int)*ptr++ : EOF; if((byte == '~') && (len > 0) && (*ptr == '>')) byte = EOF; if(byte >= '!' && byte <= 'u') { sum = (sum * 85) + ((uint32_t)byte - '!'); if(++quintet == 5) { *output++ = (unsigned char)(sum >> 24); *output++ = (unsigned char)((sum >> 16) & 0xFF); *output++ = (unsigned char)((sum >> 8) & 0xFF); *output++ = (unsigned char)(sum & 0xFF); ret += 4; quintet = 0; sum = 0; } } else if(byte == 'z') { if(quintet) { cli_dbgmsg("cli_pdf: ascii85decode: unexpected 'z'\n"); return -1; } *output++ = '\0'; *output++ = '\0'; *output++ = '\0'; *output++ = '\0'; ret += 4; } else if(byte == EOF) { cli_dbgmsg("cli_pdf: ascii85decode: quintet %d\n", quintet); if(quintet) { int i; if(quintet == 1) { cli_dbgmsg("cli_pdf: ascii85Decode: only 1 byte in last quintet\n"); return -1; } for(i = quintet; i < 5; i++) sum *= 85; if(quintet > 1) sum += (0xFFFFFF >> ((quintet - 2) * 8)); ret += quintet-1; for(i = 0; i < quintet - 1; i++) *output++ = (unsigned char)((sum >> (24 - 8 * i)) & 0xFF); } break; } else if(!isspace(byte)) { cli_dbgmsg("cli_pdf: ascii85Decode: invalid character 0x%x, len %lu\n", byte & 0xFF, (unsigned long)len); return -1; } } return ret; } /* * Find the start of the next line */ static const char * pdf_nextlinestart(const char *ptr, size_t len) { while(strchr("\r\n", *ptr) == NULL) { if(--len == 0L) return NULL; ptr++; } while(strchr("\r\n", *ptr) != NULL) { if(--len == 0L) return NULL; ptr++; } return ptr; } /* * Return the start of the next PDF object. * This assumes that we're not in a stream. */ static const char * pdf_nextobject(const char *ptr, size_t len) { const char *p; int inobject = 1; while(len) { switch(*ptr) { case '\n': case '\r': case '%': /* comment */ p = pdf_nextlinestart(ptr, len); if(p == NULL) return NULL; len -= (size_t)(p - ptr); ptr = p; inobject = 0; break; case ' ': case '\t': case '[': /* Start of an array object */ case '\v': case '\f': case '<': /* Start of a dictionary object */ inobject = 0; ptr++; len--; break; case '/': /* Start of a name object */ return ptr; case '(': /* start of JS */ return ptr; default: if(!inobject) { /* TODO: parse and return object type */ return ptr; } ptr++; len--; } } return NULL; } /* PDF statistics */ #if HAVE_JSON static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nasciihexdecode++; } #endif #if HAVE_JSON static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nascii85decode++; } #endif #if HAVE_JSON static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nembeddedfile++; } #endif #if HAVE_JSON static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nflate++; } #endif #if HAVE_JSON static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nimage++; } #endif #if HAVE_JSON static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nlzw++; } #endif #if HAVE_JSON static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nrunlengthdecode++; } #endif #if HAVE_JSON static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nfaxdecode++; } #endif #if HAVE_JSON static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { struct json_object *pdfobj, *jbig2arr; UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->ctx->wrkproperty)) return; pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; jbig2arr = cli_jsonarray(pdfobj, "JBIG2Objects"); if (!(jbig2arr)) return; cli_jsonint_array(jbig2arr, obj->id>>8); pdf->stats.njbig2decode++; } #endif #if HAVE_JSON static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.ndctdecode++; } #endif #if HAVE_JSON static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.njpxdecode++; } #endif #if HAVE_JSON static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.ncrypt++; } #endif #if HAVE_JSON static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nstandard++; } #endif #if HAVE_JSON static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nsigned++; } #endif #if HAVE_JSON static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { struct json_object *pdfobj, *jbig2arr; UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->ctx->wrkproperty)) return; pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; jbig2arr = cli_jsonarray(pdfobj, "JavascriptObjects"); if (!(jbig2arr)) return; cli_jsonint_array(jbig2arr, obj->id>>8); pdf->stats.njs++; } #endif #if HAVE_JSON static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nopenaction++; } #endif #if HAVE_JSON static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nlaunch++; } #endif #if HAVE_JSON static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.npage++; } #endif #if HAVE_JSON static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.author)) { pdf->stats.author = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.author)) return; pdf->stats.author->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Author", NULL, &(pdf->stats.author->meta)); } } #endif #if HAVE_JSON static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.creator)) { pdf->stats.creator = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.creator)) return; pdf->stats.creator->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Creator", NULL, &(pdf->stats.creator->meta)); } } #endif #if HAVE_JSON static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.modificationdate)) { pdf->stats.modificationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.modificationdate)) return; pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); } } #endif #if HAVE_JSON static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.creationdate)) { pdf->stats.creationdate = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.creationdate)) return; pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); } } #endif #if HAVE_JSON static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.producer)) { pdf->stats.producer = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.producer)) return; pdf->stats.producer->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Producer", NULL, &(pdf->stats.producer->meta)); } } #endif #if HAVE_JSON static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.title)) { pdf->stats.title = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.title)) return; pdf->stats.title->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Title", NULL, &(pdf->stats.title->meta)); } } #endif #if HAVE_JSON static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.keywords)) { pdf->stats.keywords = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.keywords)) return; pdf->stats.keywords->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Keywords", NULL, &(pdf->stats.keywords->meta)); } } #endif #if HAVE_JSON static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(act); if (!(pdf)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; if (!(pdf->stats.subject)) { pdf->stats.subject = cli_calloc(1, sizeof(struct pdf_stats_entry)); if (!(pdf->stats.subject)) return; pdf->stats.subject->data = pdf_parse_string(pdf, obj, obj->start + pdf->map, obj_size(pdf, obj, 1), "/Subject", NULL, &(pdf->stats.subject->meta)); } } #endif #if HAVE_JSON static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nrichmedia++; } #endif #if HAVE_JSON static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nacroform++; } #endif #if HAVE_JSON static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { UNUSEDPARAM(obj); UNUSEDPARAM(act); if (!(pdf)) return; pdf->stats.nxfa++; } #endif #if HAVE_JSON static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { struct pdf_array *array; const char *objstart = (const char *)(obj->start + pdf->map); const char *begin; unsigned int objsz; unsigned long npages=0, count; struct pdf_array_node *node; json_object *pdfobj; UNUSEDPARAM(act); if (!(pdf) || !(pdf->ctx->wrkproperty)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; objsz = obj_size(pdf, obj, 1); pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; begin = cli_memstr(objstart, objsz, "/Kids", 5); if (!(begin)) return; begin += 5; array = pdf_parse_array(pdf, obj, objsz, (char *)begin, NULL); if (!(array)) { cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); return; } for (node = array->nodes; node != NULL; node = node->next) if (node->datasz) if (strchr((char *)(node->data), 'R')) npages++; begin = cli_memstr(obj->start + pdf->map, objsz, "/Count", 6); if (!(begin)) { cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); goto cleanup; } begin += 6; while (begin - objstart < objsz && isspace(begin[0])) begin++; if (begin - objstart >= objsz) { goto cleanup; } count = strtoul(begin, NULL, 10); if (count != npages) cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); cleanup: pdf_free_array(array); } #endif #if HAVE_JSON static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) { json_object *colorsobj, *pdfobj; unsigned long ncolors; char *start, *p1; size_t objsz; UNUSEDPARAM(act); if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->wrkproperty)) return; if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES)) return; objsz = obj_size(pdf, obj, 1); start = (char *)(obj->start + pdf->map); p1 = (char *)cli_memstr(start, objsz, "/Colors", 7); if (!(p1)) return; p1 += 7; /* Ensure that we have at least one whitespace character plus at least one number */ if (objsz - (p1 - start) < 2) return; while (p1 - start < objsz && isspace(p1[0])) p1++; if ((size_t)(p1 - start) == objsz) return; ncolors = strtoul(p1, NULL, 10); /* We only care if the number of colors > 2**24 */ if (ncolors < 1<<24) return; pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) return; colorsobj = cli_jsonarray(pdfobj, "BigColors"); if (!(colorsobj)) return; cli_jsonint_array(colorsobj, obj->id>>8); } #endif #if HAVE_JSON static void pdf_export_json(struct pdf_struct *pdf) { json_object *pdfobj; unsigned long i; if (!(pdf)) return; if (!(pdf->ctx)) { goto cleanup; } if (!(pdf->ctx->options & CL_SCAN_FILE_PROPERTIES) || !(pdf->ctx->wrkproperty)) { goto cleanup; } pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); if (!(pdfobj)) { goto cleanup; } if (pdf->stats.author) { if (!pdf->stats.author->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.author->meta.obj, pdf->stats.author->data, pdf->stats.author->meta.length); if (out) { free(pdf->stats.author->data); pdf->stats.author->data = out; pdf->stats.author->meta.length = strlen(out); pdf->stats.author->meta.success = 1; } } if (pdf->stats.author->meta.success && cli_isutf8(pdf->stats.author->data, pdf->stats.author->meta.length)) { cli_jsonstr(pdfobj, "Author", pdf->stats.author->data); } else if (pdf->stats.author->data && pdf->stats.author->meta.length) { char *b64 = cl_base64_encode(pdf->stats.author->data, pdf->stats.author->meta.length); cli_jsonstr(pdfobj, "Author", b64); cli_jsonbool(pdfobj, "Author_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "Author", ""); } } if (pdf->stats.creator) { if (!pdf->stats.creator->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.creator->meta.obj, pdf->stats.creator->data, pdf->stats.creator->meta.length); if (out) { free(pdf->stats.creator->data); pdf->stats.creator->data = out; pdf->stats.creator->meta.length = strlen(out); pdf->stats.creator->meta.success = 1; } } if (pdf->stats.creator->meta.success && cli_isutf8(pdf->stats.creator->data, pdf->stats.creator->meta.length)) { cli_jsonstr(pdfobj, "Creator", pdf->stats.creator->data); } else if (pdf->stats.creator->data && pdf->stats.creator->meta.length) { char *b64 = cl_base64_encode(pdf->stats.creator->data, pdf->stats.creator->meta.length); cli_jsonstr(pdfobj, "Creator", b64); cli_jsonbool(pdfobj, "Creator_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "Creator", ""); } } if (pdf->stats.producer) { if (!pdf->stats.producer->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.producer->meta.obj, pdf->stats.producer->data, pdf->stats.producer->meta.length); if (out) { free(pdf->stats.producer->data); pdf->stats.producer->data = out; pdf->stats.producer->meta.length = strlen(out); pdf->stats.producer->meta.success = 1; } } if (pdf->stats.producer->meta.success && cli_isutf8(pdf->stats.producer->data, pdf->stats.producer->meta.length)) { cli_jsonstr(pdfobj, "Producer", pdf->stats.producer->data); } else if (pdf->stats.producer->data && pdf->stats.producer->meta.length) { char *b64 = cl_base64_encode(pdf->stats.producer->data, pdf->stats.producer->meta.length); cli_jsonstr(pdfobj, "Producer", b64); cli_jsonbool(pdfobj, "Producer_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "Producer", ""); } } if (pdf->stats.modificationdate) { if (!pdf->stats.modificationdate->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.modificationdate->meta.obj, pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length); if (out) { free(pdf->stats.modificationdate->data); pdf->stats.modificationdate->data = out; pdf->stats.modificationdate->meta.length = strlen(out); pdf->stats.modificationdate->meta.success = 1; } } if (pdf->stats.modificationdate->meta.success && cli_isutf8(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length)) { cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate->data); } else if (pdf->stats.modificationdate->data && pdf->stats.modificationdate->meta.length) { char *b64 = cl_base64_encode(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length); cli_jsonstr(pdfobj, "ModificationDate", b64); cli_jsonbool(pdfobj, "ModificationDate_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "ModificationDate", ""); } } if (pdf->stats.creationdate) { if (!pdf->stats.creationdate->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.creationdate->meta.obj, pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length); if (out) { free(pdf->stats.creationdate->data); pdf->stats.creationdate->data = out; pdf->stats.creationdate->meta.length = strlen(out); pdf->stats.creationdate->meta.success = 1; } } if (pdf->stats.creationdate->meta.success && cli_isutf8(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length)) { cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate->data); } else if (pdf->stats.creationdate->data && pdf->stats.creationdate->meta.length) { char *b64 = cl_base64_encode(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length); cli_jsonstr(pdfobj, "CreationDate", b64); cli_jsonbool(pdfobj, "CreationDate_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "CreationDate", ""); } } if (pdf->stats.title) { if (!pdf->stats.title->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.title->meta.obj, pdf->stats.title->data, pdf->stats.title->meta.length); if (out) { free(pdf->stats.title->data); pdf->stats.title->data = out; pdf->stats.title->meta.length = strlen(out); pdf->stats.title->meta.success = 1; } } if (pdf->stats.title->meta.success && cli_isutf8(pdf->stats.title->data, pdf->stats.title->meta.length)) { cli_jsonstr(pdfobj, "Title", pdf->stats.title->data); } else if (pdf->stats.title->data && pdf->stats.title->meta.length) { char *b64 = cl_base64_encode(pdf->stats.title->data, pdf->stats.title->meta.length); cli_jsonstr(pdfobj, "Title", b64); cli_jsonbool(pdfobj, "Title_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "Title", ""); } } if (pdf->stats.subject) { if (!pdf->stats.subject->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.subject->meta.obj, pdf->stats.subject->data, pdf->stats.subject->meta.length); if (out) { free(pdf->stats.subject->data); pdf->stats.subject->data = out; pdf->stats.subject->meta.length = strlen(out); pdf->stats.subject->meta.success = 1; } } if (pdf->stats.subject->meta.success && cli_isutf8(pdf->stats.subject->data, pdf->stats.subject->meta.length)) { cli_jsonstr(pdfobj, "Subject", pdf->stats.subject->data); } else if (pdf->stats.subject->data && pdf->stats.subject->meta.length) { char *b64 = cl_base64_encode(pdf->stats.subject->data, pdf->stats.subject->meta.length); cli_jsonstr(pdfobj, "Subject", b64); cli_jsonbool(pdfobj, "Subject_base64", 1); free(b64); } else { cli_jsonstr(pdfobj, "Subject", ""); } } if (pdf->stats.keywords) { if (!pdf->stats.keywords->meta.success) { char *out = pdf_finalize_string(pdf, pdf->stats.keywords->meta.obj, pdf->stats.keywords->data, pdf->stats.keywords->meta.length); if (out) { free(pdf->stats.keywords->data); pdf->stats.keywords->data = out; pdf->stats.keywords->meta.length = strlen(out); pdf->stats.keywords->meta.success = 1; } } if (pdf->stats.keywords->meta.success && cli_isutf8(pdf->stats.keywords->data, pdf->stats.keywords->meta.length)) { cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords->data); } else if (pdf->stats.keywords->data && pdf->stats.keywords->meta.length) { char *b64 = cl_base64_encode(pdf->stats.keywords->data, pdf->stats.keywords->meta.length); cli_jsonstr(pdfobj, "Keywords", b64); cli_jsonbool(pdfobj, "Keywords_base64", 1); free(b64); } else { cli_jsonbool(pdfobj, "Keywords", ""); } } if (pdf->stats.ninvalidobjs) cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs); if (pdf->stats.njs) cli_jsonint(pdfobj, "JavaScriptObjectCount", pdf->stats.njs); if (pdf->stats.nflate) cli_jsonint(pdfobj, "DeflateObjectCount", pdf->stats.nflate); if (pdf->stats.nactivex) cli_jsonint(pdfobj, "ActiveXObjectCount", pdf->stats.nactivex); if (pdf->stats.nflash) cli_jsonint(pdfobj, "FlashObjectCount", pdf->stats.nflash); if (pdf->stats.ncolors) cli_jsonint(pdfobj, "ColorCount", pdf->stats.ncolors); if (pdf->stats.nasciihexdecode) cli_jsonint(pdfobj, "AsciiHexDecodeObjectCount", pdf->stats.nasciihexdecode); if (pdf->stats.nascii85decode) cli_jsonint(pdfobj, "Ascii85DecodeObjectCount", pdf->stats.nascii85decode); if (pdf->stats.nembeddedfile) cli_jsonint(pdfobj, "EmbeddedFileCount", pdf->stats.nembeddedfile); if (pdf->stats.nimage) cli_jsonint(pdfobj, "ImageCount", pdf->stats.nimage); if (pdf->stats.nlzw) cli_jsonint(pdfobj, "LZWCount", pdf->stats.nlzw); if (pdf->stats.nrunlengthdecode) cli_jsonint(pdfobj, "RunLengthDecodeCount", pdf->stats.nrunlengthdecode); if (pdf->stats.nfaxdecode) cli_jsonint(pdfobj, "FaxDecodeCount", pdf->stats.nfaxdecode); if (pdf->stats.njbig2decode) cli_jsonint(pdfobj, "JBIG2DecodeCount", pdf->stats.njbig2decode); if (pdf->stats.ndctdecode) cli_jsonint(pdfobj, "DCTDecodeCount", pdf->stats.ndctdecode); if (pdf->stats.njpxdecode) cli_jsonint(pdfobj, "JPXDecodeCount", pdf->stats.njpxdecode); if (pdf->stats.ncrypt) cli_jsonint(pdfobj, "CryptCount", pdf->stats.ncrypt); if (pdf->stats.nstandard) cli_jsonint(pdfobj, "StandardCount", pdf->stats.nstandard); if (pdf->stats.nsigned) cli_jsonint(pdfobj, "SignedCount", pdf->stats.nsigned); if (pdf->stats.nopenaction) cli_jsonint(pdfobj, "OpenActionCount", pdf->stats.nopenaction); if (pdf->stats.nlaunch) cli_jsonint(pdfobj, "LaunchCount", pdf->stats.nlaunch); if (pdf->stats.npage) cli_jsonint(pdfobj, "PageCount", pdf->stats.npage); if (pdf->stats.nrichmedia) cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia); if (pdf->stats.nacroform) cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform); if (pdf->stats.nxfa) cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa); if (pdf->flags & (1 << BAD_PDF_VERSION)) cli_jsonbool(pdfobj, "BadVersion", 1); if (pdf->flags & (1 << BAD_PDF_HEADERPOS)) cli_jsonbool(pdfobj, "BadHeaderPosition", 1); if (pdf->flags & (1 << BAD_PDF_TRAILER)) cli_jsonbool(pdfobj, "BadTrailer", 1); if (pdf->flags & (1 << BAD_PDF_TOOMANYOBJS)) cli_jsonbool(pdfobj, "TooManyObjects", 1); if (pdf->flags & (1 << ENCRYPTED_PDF)) { cli_jsonbool(pdfobj, "Encrypted", 1); if (pdf->flags & (1 << DECRYPTABLE_PDF)) cli_jsonbool(pdfobj, "Decryptable", 1); else cli_jsonbool(pdfobj, "Decryptable", 0); } for (i=0; i < pdf->nobjs; i++) { if (pdf->objs[i].flags & (1<objs[i].id>>8); } } cleanup: if ((pdf->stats.author)) { if (pdf->stats.author->data) free(pdf->stats.author->data); free(pdf->stats.author); pdf->stats.author = NULL; } if (pdf->stats.creator) { if (pdf->stats.creator->data) free(pdf->stats.creator->data); free(pdf->stats.creator); pdf->stats.creator = NULL; } if (pdf->stats.producer) { if (pdf->stats.producer->data) free(pdf->stats.producer->data); free(pdf->stats.producer); pdf->stats.producer = NULL; } if (pdf->stats.modificationdate) { if (pdf->stats.modificationdate->data) free(pdf->stats.modificationdate->data); free(pdf->stats.modificationdate); pdf->stats.modificationdate = NULL; } if (pdf->stats.creationdate) { if (pdf->stats.creationdate->data) free(pdf->stats.creationdate->data); free(pdf->stats.creationdate); pdf->stats.creationdate = NULL; } if (pdf->stats.title) { if (pdf->stats.title->data) free(pdf->stats.title->data); free(pdf->stats.title); pdf->stats.title = NULL; } if (pdf->stats.subject) { if (pdf->stats.subject->data) free(pdf->stats.subject->data); free(pdf->stats.subject); pdf->stats.subject = NULL; } if (pdf->stats.keywords) { if (pdf->stats.keywords->data) free(pdf->stats.keywords->data); free(pdf->stats.keywords); pdf->stats.keywords = NULL; } } #endif