/* * Copyright (C) 2002-2006 Nigel Horne * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ static char const rcsid[] = "$Id: mbox.c,v 1.356 2006/10/16 00:33:34 tkojm Exp $"; #ifdef _MSC_VER #include /* only needed in CL_EXPERIMENTAL */ #endif #if HAVE_CONFIG_H #include "clamav-config.h" #endif #ifndef CL_DEBUG #define NDEBUG /* map CLAMAV debug onto standard */ #endif #ifdef CL_THREAD_SAFE #ifndef _REENTRANT #define _REENTRANT /* for Solaris 2.8 */ #endif #endif #include #include #include #include #include #ifdef HAVE_STRINGS_H #include #endif #include #include #include #ifdef HAVE_SYS_PARAM_H #include #endif #include "clamav.h" #ifndef C_WINDOWS #include #endif #include #include #ifdef HAVE_UNISTD_H #include #endif #if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) #include #endif #ifdef CL_THREAD_SAFE #include #endif #include "others.h" #include "defaults.h" #include "str.h" #include "filetypes.h" #include "mbox.h" #ifdef CL_DEBUG #include #if __GLIBC__ == 2 && __GLIBC_MINOR__ >= 1 #define HAVE_BACKTRACE #endif #endif #ifdef HAVE_BACKTRACE #include #include static void sigsegv(int sig); static void print_trace(int use_syslog); #endif #if defined(NO_STRTOK_R) || !defined(CL_THREAD_SAFE) #undef strtok_r #undef __strtok_r #define strtok_r(a,b,c) strtok(a,b) #endif #ifdef C_LINUX /* Others??? Old linux, e.g. 
Red Hat 5.2, doesn't have this */ #include #else #ifdef FALSE typedef unsigned char bool; #else typedef enum { FALSE = 0, TRUE = 1 } bool; #endif #endif #ifndef isblank #define isblank(c) (((c) == ' ') || ((c) == '\t')) #endif #define SAVE_TO_DISC /* multipart/message are saved in a temporary file */ #ifndef CL_EXPERIMENTAL /* * Code does exist to run FOLLOWURLS on systems without libcurl, however that * is not recommended so it is not compiled by default * * On Solaris, when using the GNU C compiler, the clamAV build system uses the * Sun supplied ld instead of the GNU ld causing an error. Therefore you cannot * use WITH_CURL on Solaris with gcc, you must configure with * "--without-libcurl". I don't know if it works with Sun's own compiler * * Fails to link on Solaris 10 with this error: * Undefined first referenced * symbol in file * __floatdidf /opt/sfw/lib/libcurl.s */ #if C_SOLARIS && __GNUC__ #undef WITH_CURL #endif #endif #if defined(WITH_CURL) || defined(CL_EXPERIMENTAL) #define FOLLOWURLS 5 /* * Maximum number of URLs scanned in a message * part. Helps to find Dialer.gen-45. 
If * not defined, don't check any URLs */ #endif #ifdef FOLLOWURLS #include "htmlnorm.h" #endif #ifdef CL_EXPERIMENTAL #include "phishcheck.h" #endif #ifdef FOLLOWURLS #ifndef C_WINDOWS #include #include #include #include #include #endif #ifndef C_WINDOWS #define closesocket(s) close(s) #endif #ifdef CL_EXPERIMENTAL /* dropping curl support */ #include #ifndef C_WINDOWS #include #endif #ifndef HAVE_IN_PORT_T typedef unsigned short in_port_t; #endif #ifndef HAVE_IN_ADDR_T typedef unsigned int in_addr_t; #endif #if (!defined(EALREADY)) && (defined(WSAEALREADY)) #define EALREADY WSAEALREADY #endif #if (!defined(EINPROGRESS)) && (defined(WSAEINPROGRESS)) #define EINPROGRESS WSAEINPROGRESS #endif #if (!defined(EISCONN)) && (defined(WSAEISCONN)) #define EISCONN WSAEISCONN #endif #else #ifdef WITH_CURL /* Set in configure */ /* * To build with WITH_CURL: * LDFLAGS=`curl-config --libs` ./configure ... */ #include /* * Needs curl >= 7.11 (I've heard that 7.9 can cause crashes and I have seen * 7.10 segfault, later versions can be flakey as well) * untested) * * Even 7.15 crashes, valgrind shows this: * ==2835== Warning: client switching stacks? SP change: 0xBEB0FD2C --> 0xD0678F0 * ==2835== to suppress, use: --max-stackframe=1314225092 or greater * ==2835== Invalid write of size 4 * ==2835== at 0x40F67BD: Curl_resolv (in /usr/lib/libcurl.so.3.0.0) * ==2835== Address 0xD0678F4 is on thread 1's stack * ==2835== Can't extend stack to 0xD067390 during signal delivery for thread 1: * ==2835== no stack segment * ==2835== * ==2835== Process terminating with default action of signal 11 (SIGSEGV) * ==2835== Access not within mapped region at address 0xD067390 * ==2835== at 0x40F67BD: Curl_resolv (in /usr/lib/libcurl.so.3.0.0) * * This bug has been reported upstream, however they claim that the bug * does not exist :-(. 
I have received reports that 7.15.5 suffers from the * same problem in Curl_resolv * * TODO: Drop curl and do it ourselves */ #if (LIBCURL_VERSION_NUM < 0x070B00) #undef WITH_CURL /* also undef FOLLOWURLS? */ #endif #else #error "FOLLOWURLS without CURL is no longer supported" #endif /*WITH_CURL*/ #endif /* CL_EXPERIMENTAL */ #else /*!FOLLOWURLS*/ #undef WITH_CURL #endif /*FOLLOWURLS*/ /* * Define this to handle messages covered by section 7.3.2 of RFC1341. * This is experimental code so it is up to YOU to (1) ensure it's secure * (2) periodically trim the directory of old files * * If you use the load balancing feature of clamav-milter to run clamd on * more than one machine you must make sure that .../partial is on a shared * network filesystem */ #ifndef C_WINDOWS /* TODO: when opendir() is done */ #define PARTIAL_DIR #endif /*#define NEW_WORLD*/ /*#define SCAN_UNENCODED_BOUNCES *//* * Slows things down a lot and only catches unencoded copies * of EICAR within bounces, which don't matter */ typedef struct mbox_ctx { const char *dir; const table_t *rfc821Table; const table_t *subtypeTable; cli_ctx *ctx; } mbox_ctx; static int cli_parse_mbox(const char *dir, int desc, cli_ctx *ctx); static message *parseEmailFile(FILE *fin, const table_t *rfc821Table, const char *firstLine, const char *dir); static message *parseEmailHeaders(message *m, const table_t *rfc821Table); static int parseEmailHeader(message *m, const char *line, const table_t *rfc821Table); static int parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx); static int boundaryStart(const char *line, const char *boundary); static int endOfMessage(const char *line, const char *boundary); static int initialiseTables(table_t **rfc821Table, table_t **subtypeTable); static int getTextPart(message *const messages[], size_t size); static size_t strip(char *buf, int len); static int parseMimeHeader(message *m, const char *cmd, const table_t *rfc821Table, const char *arg); static void saveTextPart(message 
*m, const char *dir, int destroy_text); static char *rfc2047(const char *in); static char *rfc822comments(const char *in, char *out); #ifdef PARTIAL_DIR static int rfc1341(message *m, const char *dir); #endif static bool usefulHeader(int commandNumber, const char *cmd); static char *getline_from_mbox(char *buffer, size_t len, FILE *fin); static bool isBounceStart(const char *line); static bool exportBinhexMessage(const char *dir, message *m); static int exportBounceMessage(text *start, const mbox_ctx *ctx); static message *do_multipart(message *mainMessage, message **messages, int i, int *rc, mbox_ctx *mctx, message *messageIn, text **tptr); static int count_quotes(const char *buf); static bool next_is_folded_header(const text *t); static void checkURLs(message *m, mbox_ctx *mctx,int *rc, int is_html); #ifdef CL_EXPERIMENTAL static void do_checkURLs(message *m, const char *dir,tag_arguments_t* hrefs); static blob* getHrefs(message* m,tag_arguments_t* hrefs); static void hrefs_done(blob *b,tag_arguments_t* hrefs); #endif #if defined(FOLLOWURLS) && (FOLLOWURLS > 0) struct arg { #ifdef CL_EXPERIMENTAL char *url; #else CURL *curl; const char *url; #endif const char *dir; char *filename; }; #ifdef CL_THREAD_SAFE static void *getURL(void *a); #else static void *getURL(struct arg *arg); #endif #endif /* Maximum line length according to RFC821 */ #define RFC2821LENGTH 1000 /* Hashcodes for our hash tables */ #define CONTENT_TYPE 1 #define CONTENT_TRANSFER_ENCODING 2 #define CONTENT_DISPOSITION 3 /* Mime sub types */ #define PLAIN 1 #define ENRICHED 2 #define HTML 3 #define RICHTEXT 4 #define MIXED 5 #define ALTERNATIVE 6 /* RFC1521*/ #define DIGEST 7 #define SIGNED 8 #define PARALLEL 9 #define RELATED 10 /* RFC2387 */ #define REPORT 11 /* RFC1892 */ #define APPLEDOUBLE 12 /* Handling of this in only noddy for now */ #define FAX MIXED /* * RFC3458 * Drafts stated to treat is as mixed if it is * not known. 
This disappeared in the final * version (except when talking about * voice-message), but it is good enough for us * since we do no validation of coversheet * presence etc. (which also has disappeared * in the final version) */ #define ENCRYPTED 13 /* * e.g. RFC2015 * Content-Type: multipart/encrypted; * boundary="nextPart1383049.XCRrrar2yq"; * protocol="application/pgp-encrypted" */ #define X_BFILE RELATED /* * BeOS, expert two parts: the file and it's * attributes. The attributes part comes as * Content-Type: application/x-be_attribute * name="foo" * I can't find where it is defined, any * pointers would be appreciated. For now * we treat it as multipart/related */ #define KNOWBOT 14 /* Unknown and undocumented format? */ static const struct tableinit { const char *key; int value; } rfc821headers[] = { /* TODO: make these regular expressions */ { "Content-Type", CONTENT_TYPE }, { "Content-Transfer-Encoding", CONTENT_TRANSFER_ENCODING }, { "Content-Disposition", CONTENT_DISPOSITION }, { NULL, 0 } }, mimeSubtypes[] = { /* see RFC2045 */ /* subtypes of Text */ { "plain", PLAIN }, { "enriched", ENRICHED }, { "html", HTML }, { "richtext", RICHTEXT }, /* subtypes of Multipart */ { "mixed", MIXED }, { "alternative", ALTERNATIVE }, { "digest", DIGEST }, { "signed", SIGNED }, { "parallel", PARALLEL }, { "related", RELATED }, { "report", REPORT }, { "appledouble", APPLEDOUBLE }, { "fax-message", FAX }, { "encrypted", ENCRYPTED }, { "x-bfile", X_BFILE }, /* BeOS */ { "knowbot", KNOWBOT }, /* ??? */ { "knowbot-metadata", KNOWBOT }, /* ??? */ { "knowbot-code", KNOWBOT }, /* ??? */ { "knowbot-state", KNOWBOT }, /* ??? 
*/ { NULL, 0 } }; #ifdef CL_THREAD_SAFE static pthread_mutex_t tables_mutex = PTHREAD_MUTEX_INITIALIZER; #endif #ifndef O_BINARY #define O_BINARY 0 #endif #ifdef NEW_WORLD #include "matcher.h" #undef PARTIAL_DIR #if HAVE_MMAP #if HAVE_SYS_MMAN_H #include #else /* HAVE_SYS_MMAN_H */ #undef HAVE_MMAP #endif #else /*HAVE_MMAP*/ #undef NEW_WORLD #endif #endif #ifdef NEW_WORLD /* * Files larger than this are scanned with the old method, should be * StreamMaxLength, I guess * If NW_MAX_FILE_SIZE is not defined, all files go through the * new method. This definition is for machines very tight on RAM, or * with large StreamMaxLength values */ #define MAX_ALLOCATION 134217728 /* see libclamav/others.c */ #define NW_MAX_FILE_SIZE MAX_ALLOCATION struct scanlist { const char *start; size_t size; encoding_type decoder; /* only BASE64 and QUOTEDPRINTABLE for now */ struct scanlist *next; }; static struct map { const char *offset; /* sorted */ const char *word; struct map *next; } *map, *tail; static int save_text(cli_ctx *ctx, const char *dir, const char *start, size_t len); static void create_map(const char *begin, const char *end); static void add_to_map(const char *offset, const char *word); static const char *find_in_map(const char *offset, const char *word); static void free_map(void); /* * This could be the future. Instead of parsing and decoding it just decodes. * * USE IT AT YOUR PERIL, a large number of viruses are not detected with this * method, possibly because the decoded files must be exact and not have * extra data at the start or end, which this code will produce. * * Currently only supports base64 and quoted-printable * * You may also see a lot of warnings. For the moment it falls back to old * world mode if it doesn't know what to do - that'll be removed. * The code is untidy... * * FIXME: Some mailbox scans are slower with this method. 
I suspect that it's * because the scan can proceed to the end of the file rather than the end * of the attachment which can mean than later emails are scanned many times * * FIXME: quoted printable doesn't know when to stop, so size related virus * matching breaks * * TODO: Fall through to cli_parse_mbox() too often * * TODO: Add support for systems without mmap() * * TODO: partial_dir fall through * * FIXME: Some EICAR gets through */ int cli_mbox(const char *dir, int desc, cli_ctx *ctx) { char *start, *ptr, *line; const char *last, *p, *q; size_t size; struct stat statb; message *m; fileblob *fb; int ret = CL_CLEAN; int wasAlloced; struct scanlist *scanlist, *scanelem; if(dir == NULL) { cli_warnmsg("cli_mbox called with NULL dir\n"); return CL_ENULLARG; } if(fstat(desc, &statb) < 0) return CL_EOPEN; size = statb.st_size; if(size == 0) return CL_CLEAN; #ifdef NW_MAX_FILE_SIZE if(size > NW_MAX_FILE_SIZE) return cli_parse_mbox(dir, desc, ctx); #endif /*cli_warnmsg("NEW_WORLD is new code - use at your own risk.\n");*/ #ifdef PARTIAL_DIR cli_warnmsg("PARTIAL_DIR doesn't work in the NEW_WORLD yet\n"); #endif start = mmap(NULL, size, PROT_READ, MAP_PRIVATE, desc, 0); if(start == MAP_FAILED) return CL_EMEM; cli_dbgmsg("mmap'ed mbox\n"); ptr = cli_malloc(size); if(ptr) { memcpy(ptr, start, size); munmap(start, size); start = ptr; wasAlloced = 1; } else wasAlloced = 0; /* last points to the last *valid* address in the array */ last = &start[size - 1]; create_map(start, last); scanelem = scanlist = NULL; q = start; /* * FIXME: mismatch of const char * and char * here and in later calls * to find_in_map() */ while((p = find_in_map(q, "base64")) != NULL) { cli_dbgmsg("Found base64\n"); if(scanelem) { scanelem->next = cli_malloc(sizeof(struct scanlist)); scanelem = scanelem->next; } else scanlist = scanelem = cli_malloc(sizeof(struct scanlist)); scanelem->next = NULL; scanelem->decoder = BASE64; q = scanelem->start = &p[6]; if(((p = find_in_map(q, "\nFrom ")) != NULL) || ((p = 
find_in_map(q, "base64")) != NULL) || ((p = find_in_map(q, "quoted-printable")) != NULL)) { scanelem->size = (size_t)(p - q); q = p; } else { scanelem->size = (size_t)(last - scanelem->start) + 1; break; } cli_dbgmsg("base64: last %u q %u\n", (unsigned int)last, (unsigned int)q); assert(scanelem->size <= size); } q = start; while((p = find_in_map(q, "quoted-printable")) != NULL) { if(p != q) switch(p[-1]) { case ' ': case ':': case '=': /* wrong but allow it */ break; default: q = &p[16]; cli_dbgmsg("Ignore quoted-printable false positive\n"); continue; /* false positive */ } cli_dbgmsg("Found quoted-printable\n"); #ifdef notdef /* * The problem with quoted printable is recognising when to stop * parsing */ if(scanelem) { scanelem->next = cli_malloc(sizeof(struct scanlist)); scanelem = scanelem->next; } else scanlist = scanelem = cli_malloc(sizeof(struct scanlist)); scanelem->next = NULL; scanelem->decoder = QUOTEDPRINTABLE; q = scanelem->start = &p[16]; cli_dbgmsg("qp: last %u q %u\n", (unsigned int)last, (unsigned int)q); if(((p = find_in_map(q, "\nFrom ")) != NULL) || ((p = find_in_map(q, "quoted-printable")) != NULL) || ((p = find_in_map(q, "base64")) != NULL)) { scanelem->size = (size_t)(p - q); q = p; cli_dbgmsg("qp: scanelem->size = %u\n", scanelem->size); } else { scanelem->size = (size_t)(last - scanelem->start) + 1; break; } assert(scanelem->size <= size); #else if(wasAlloced) free(start); else munmap(start, size); free_map(); return cli_parse_mbox(dir, desc, ctx); #endif } if(scanlist == NULL) { const struct tableinit *tableinit; bool anyHeadersFound = FALSE; bool hasuuencode = FALSE; cli_file_t type; /* FIXME: message: There could of course be no decoder needed... 
*/ for(tableinit = rfc821headers; tableinit->key; tableinit++) if(find_in_map(start, tableinit->key)) { anyHeadersFound = TRUE; break; } if((!anyHeadersFound) && ((p = find_in_map(start, "\nbegin ")) != NULL) && (isuuencodebegin(++p))) /* uuencoded part */ hasuuencode = TRUE; else { cli_dbgmsg("Nothing encoded, looking for a text part to save\n"); ret = save_text(ctx, dir, start, size); if(wasAlloced) free(start); else munmap(start, size); free_map(); if(ret != CL_EFORMAT) return ret; ret = CL_CLEAN; } free_map(); type = cli_filetype(start, size); if((type == CL_TYPE_UNKNOWN_TEXT) && (strncmp(start, "Microsoft Mail Internet Headers", 31) == 0)) type = CL_TYPE_MAIL; if(wasAlloced) free(start); else munmap(start, size); if(anyHeadersFound || hasuuencode) { /* TODO: reduce the number of falls through here */ if(hasuuencode) /* TODO: fast track visa */ cli_warnmsg("New world - fall back to old uudecoder\n"); else cli_warnmsg("cli_mbox: unknown encoder, type %d\n", type); if(type == CL_TYPE_MAIL) return cli_parse_mbox(dir, desc, ctx); cli_dbgmsg("Unknown filetype %d, return CLEAN\n", type); return CL_CLEAN; } #if 0 /* I don't believe this is needed any more */ /* * The message could be a plain text phish * FIXME: Can't get to the option whether we are looking for * phishes or not, so assume we are, this slows things a * lot * Should be * if((type == CL_TYPE_MAIL) && (!(no-phishing)) */ if(type == CL_TYPE_MAIL) return cli_parse_mbox(dir, desc, ctx); #endif cli_dbgmsg("cli_mbox: I believe it's plain text (type == %d) which must be clean\n", type); return CL_CLEAN; } #if 0 if(wasAlloced) { const char *max = NULL; for(scanelem = scanlist; scanelem; scanelem = scanelem->next) { const char *end = &scanelem->start[scanelem->size]; if(end > max) max = end; } if(max < last) printf("could free %d bytes\n", (int)(last - max)); } #endif for(scanelem = scanlist; scanelem; scanelem = scanelem->next) { if(scanelem->decoder == BASE64) { const char *b64start = scanelem->start; size_t 
b64size = scanelem->size; cli_dbgmsg("b64size = %lu\n", b64size); while((*b64start != '\n') && (*b64start != '\r')) { b64start++; b64size--; } /* * Look for the end of the headers */ while(b64start < last) { if(*b64start == ';') { b64start++; b64size--; } else if((memcmp(b64start, "\n\n", 2) == 0) || (memcmp(b64start, "\r\r", 2) == 0)) { b64start += 2; b64size -= 2; break; } else if(memcmp(b64start, "\r\n\r\n", 4) == 0) { b64start += 4; b64size -= 4; break; } else if(memcmp(b64start, "\n \n", 3) == 0) { /* * Some viruses are broken and have * one space character at the end of * the headers */ b64start += 3; b64size -= 3; break; } else if(memcmp(b64start, "\r\n \r\n", 5) == 0) { /* * Some viruses are broken and have * one space character at the end of * the headers */ b64start += 5; b64size -= 5; break; } b64start++; b64size--; } if(b64size > 0L) while((!isalnum(*b64start)) && (*b64start != '/')) { if(b64size-- == 0L) break; b64start++; } if(b64size > 0L) { int lastline; char *tmpfilename; unsigned char *uptr; cli_dbgmsg("cli_mbox: decoding %ld base64 bytes\n", b64size); if((fb = fileblobCreate()) == NULL) { free_map(); if(wasAlloced) free(start); else munmap(start, size); return CL_EMEM; } tmpfilename = cli_gentemp(dir); if(tmpfilename == NULL) { free_map(); if(wasAlloced) free(start); else munmap(start, size); fileblobDestroy(fb); return CL_EMEM; } fileblobSetFilename(fb, dir, tmpfilename); free(tmpfilename); line = NULL; m = messageCreate(); if(m == NULL) { free_map(); if(wasAlloced) free(start); else munmap(start, size); fileblobDestroy(fb); return CL_EMEM; } messageSetEncoding(m, "base64"); messageSetCTX(m, ctx); fileblobSetCTX(fb, ctx); lastline = 0; do { int length = 0, datalen; char *newline, *equal; unsigned char *bigbuf, *data; unsigned char smallbuf[1024]; const char *cptr; /*printf("%ld: ", b64size); fflush(stdout);*/ for(cptr = b64start; b64size && (*cptr != '\n') && (*cptr != '\r'); cptr++) { length++; --b64size; } /*printf("%d: ", length); 
fflush(stdout);*/ newline = cli_realloc(line, length + 1); if(newline == NULL) break; line = newline; memcpy(line, b64start, length); line[length] = '\0'; equal = strchr(line, '='); if(equal) { lastline++; *equal = '\0'; } /*puts(line);*/ #if 0 if(messageAddStr(m, line) < 0) break; #endif if(length >= (int)sizeof(smallbuf)) { datalen = length + 2; data = bigbuf = cli_malloc(datalen); if(data == NULL) break; } else { bigbuf = NULL; data = smallbuf; datalen = sizeof(data) - 1; } uptr = decodeLine(m, BASE64, line, data, datalen); if(uptr == NULL) { if(bigbuf) free(bigbuf); break; } /*cli_dbgmsg("base64: write %u bytes\n", (size_t)(uptr - data));*/ datalen = fileblobAddData(fb, data, (size_t)(uptr - data)); if(bigbuf) free(bigbuf); if(datalen < 0) break; if(fileblobContainsVirus(fb)) break; if((b64size > 0) && (*cptr == '\r')) { b64start = ++cptr; --b64size; } if((b64size > 0) && (*cptr == '\n')) { b64start = ++cptr; --b64size; } if(lastline) break; } while(b64size > 0L); if(m->base64chars) { unsigned char data[4]; uptr = base64Flush(m, data); if(uptr) { /*cli_dbgmsg("base64: flush %u bytes\n", (size_t)(uptr - data));*/ (void)fileblobAddData(fb, data, (size_t)(uptr - data)); } } if(fb) fileblobDestroy(fb); else ret = -1; messageDestroy(m); free(line); } } else if(scanelem->decoder == QUOTEDPRINTABLE) { const char *quotedstart = scanelem->start; size_t quotedsize = scanelem->size; cli_dbgmsg("quotedsize = %lu\n", quotedsize); while(*quotedstart != '\n') { quotedstart++; quotedsize--; } /* * Look for the end of the headers */ while(quotedstart < last) { if(*quotedstart == ';') { quotedstart++; quotedsize--; } else if((*quotedstart == '\n') || (*quotedstart == '\r')) { quotedstart++; quotedsize--; if((*quotedstart == '\n') || (*quotedstart == '\r')) { quotedstart++; quotedsize--; break; } } quotedstart++; quotedsize--; } while(!isalnum(*quotedstart)) { quotedstart++; quotedsize--; } if(quotedsize > 0L) { cli_dbgmsg("cli_mbox: decoding %ld quoted-printable bytes\n", 
quotedsize); m = messageCreate(); if(m == NULL) { free_map(); if(wasAlloced) free(start); else munmap(start, size); return CL_EMEM; } messageSetEncoding(m, "quoted-printable"); messageSetCTX(m, ctx); line = NULL; do { int length = 0; char *newline; const char *cptr; /*printf("%ld: ", quotedsize); fflush(stdout);*/ for(cptr = quotedstart; quotedsize && (*cptr != '\n') && (*cptr != '\r'); cptr++) { length++; --quotedsize; } /*printf("%d: ", length); fflush(stdout);*/ newline = cli_realloc(line, length + 1); if(newline == NULL) break; line = newline; memcpy(line, quotedstart, length); line[length] = '\0'; /*puts(line);*/ if(messageAddStr(m, line) < 0) break; if((quotedsize > 0) && (*cptr == '\r')) { quotedstart = ++cptr; --quotedsize; } if((quotedsize > 0) && (*cptr == '\n')) { quotedstart = ++cptr; --quotedsize; } } while(quotedsize > 0L); free(line); fb = messageToFileblob(m, dir, 1); messageDestroy(m); if(fb) fileblobDestroy(fb); else ret = -1; } } } scanelem = scanlist; /* * There could be a phish in the plain text part, so save that * FIXME: Can't get to the option whether we are looking for * phishes or not, so assume we are, this slows things a * lot * Should be * if((type == CL_TYPE_MAIL) && (!(no-phishing)) */ ret = save_text(ctx, dir, start, size); free_map(); while(scanelem) { struct scanlist *n = scanelem->next; free(scanelem); scanelem = n; } if(wasAlloced) free(start); else munmap(start, size); /* * FIXME: Need to run cl_scandir() here and return that value */ cli_dbgmsg("cli_mbox: ret = %d\n", ret); if(ret != CL_EFORMAT) return ret; cli_warnmsg("New world - don't know what to do - fall back to old world\n"); /* Fall back for now */ lseek(desc, 0L, SEEK_SET); return cli_parse_mbox(dir, desc, ctx); } /* * Save a text part - it could contain phish or jscript */ static int save_text(cli_ctx *ctx, const char *dir, const char *start, size_t len) { const char *p; if((p = find_in_map(start, "\n\n")) || (p = find_in_map(start, "\r\n\r\n"))) { const char *q; 
fileblob *fb; char *tmpfilename; if(((q = find_in_map(start, "base64")) == NULL) && ((q = find_in_map(start, "quoted_printable")) == NULL)) { cli_dbgmsg("It's all plain text!\n"); if(*p == '\r') p += 4; else p += 2; len -= (p - start); } else if(((q = find_in_map(p, "\nFrom ")) == NULL) && ((q = find_in_map(p, "base64")) == NULL) && ((q = find_in_map(p, "quoted-printable")) == NULL)) cli_dbgmsg("Can't find end of plain text - assume it's all\n"); else len = (size_t)(q - p); if(len < 5) { cli_dbgmsg("save_text: Too small\n"); return CL_EFORMAT; } if(ctx->scanned) *ctx->scanned += len / CL_COUNT_PRECISION; /* * This doesn't work, cli_scanbuff isn't designed to be used * in this way. It gets the "filetype" wrong and then * doesn't scan correctly */ if(cli_scanbuff((char *)p, len, ctx->virname, ctx->engine, 0) == CL_VIRUS) { cli_dbgmsg("save_text: found %s\n", *ctx->virname); return CL_VIRUS; } fb = fileblobCreate(); if(fb == NULL) return CL_EMEM; tmpfilename = cli_gentemp(dir); if(tmpfilename == NULL) { fileblobDestroy(fb); return CL_ETMPFILE; } cli_dbgmsg("save plain bit to %s, %u bytes\n", tmpfilename, len); fileblobSetFilename(fb, dir, tmpfilename); free(tmpfilename); (void)fileblobAddData(fb, (const unsigned char *)p, len); fileblobDestroy(fb); return CL_SUCCESS; } cli_dbgmsg("No text part found to save\n"); return CL_EFORMAT; } static void create_map(const char *begin, const char *end) { const struct wordlist { const char *word; int len; } wordlist[] = { { "base64", 6 }, { "quoted-printable", 16 }, { "\nbegin ", 7 }, { "\nFrom ", 6 }, { "\n\n", 2 }, { "\r\n\r\n", 4 }, { NULL, 0 } }; if(map) { cli_warnmsg("create_map called without free_map\n"); free_map(); } while(begin < end) { const struct wordlist *word; for(word = wordlist; word->word; word++) { if((end - begin) < word->len) continue; if(strncasecmp(begin, word->word, word->len) == 0) { add_to_map(begin, word->word); break; } } begin++; } } /* To sort map, assume 'offset' is presented in sorted order */ 
static void add_to_map(const char *offset, const char *word) { if(map) { tail->next = cli_malloc(sizeof(struct map)); /* FIXME: verify */ tail = tail->next; } else map = tail = cli_malloc(sizeof(struct map)); /* FIXME: verify */ tail->offset = offset; tail->word = word; tail->next = NULL; } static const char * find_in_map(const char *offset, const char *word) { const struct map *item; for(item = map; item; item = item->next) if(item->offset >= offset) if(strcasecmp(word, item->word) == 0) return item->offset; return NULL; } static void free_map(void) { while(map) { struct map *next = map->next; free(map); map = next; } map = NULL; } #else /*!NEW_WORLD*/ int cli_mbox(const char *dir, int desc, cli_ctx *ctx) { if(dir == NULL) { cli_warnmsg("cli_mbox called with NULL dir\n"); return CL_ENULLARG; } return cli_parse_mbox(dir, desc, ctx); } #endif /* * TODO: when signal handling is added, need to remove temp files when a * signal is received * TODO: add option to scan in memory not via temp files, perhaps with a * named pipe or memory mapped file, though this won't work on big e-mails * containing many levels of encapsulated messages - it'd just take too much * RAM * TODO: parse .msg format files * TODO: fully handle AppleDouble format, see * http://www.lazerware.com/formats/Specs/AppleSingle_AppleDouble.pdf * TODO: ensure parseEmailHeaders is always called before parseEmailBody * TODO: create parseEmail which calls parseEmailHeaders then parseEmailBody * TODO: Handle unepected NUL bytes in header lines which stop strcmp()s: * e.g. 
\0Content-Type: application/binary; */ static int cli_parse_mbox(const char *dir, int desc, cli_ctx *ctx) { int retcode, i; message *body; FILE *fd; char buffer[RFC2821LENGTH + 1]; mbox_ctx mctx; #ifdef HAVE_BACKTRACE void (*segv)(int); #endif static table_t *rfc821, *subtype; #ifdef CL_DEBUG char tmpfilename[16]; int tmpfd; #endif #if defined(FOLLOWURLS) && (!defined(CL_EXPERIMENTAL)) static int initialised = 0; #ifdef CL_THREAD_SAFE static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER; #endif #endif #ifdef NEW_WORLD cli_dbgmsg("fall back to old world\n"); #else cli_dbgmsg("in mbox()\n"); #endif #if defined(FOLLOWURLS) && (!defined(CL_EXPERIMENTAL)) if(ctx->options&CL_SCAN_MAILURL) { #ifdef CL_THREAD_SAFE pthread_mutex_lock(&init_mutex); #endif if(!initialised) { if(curl_global_init(CURL_GLOBAL_ALL) != 0) { #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&init_mutex); #endif cli_warnmsg("curl_global_init failed, disabling mail-follow-urls"); ctx->options &= ~CL_SCAN_MAILURL; } initialised = 1; } #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&init_mutex); #endif } #endif i = dup(desc); if((fd = fdopen(i, "rb")) == NULL) { cli_errmsg("Can't open descriptor %d\n", desc); close(i); return CL_EOPEN; } #ifdef CL_DEBUG /* * Copy the incoming mail for debugging, so that if it falls over * we have a copy of the offending email. This is debugging code * that you shouldn't of course install in a live environment. I am * not interested in hearing about security issues with this section * of the parser. 
*/ strcpy(tmpfilename, "/tmp/mboxXXXXXX"); tmpfd = mkstemp(tmpfilename); if(tmpfd < 0) { perror(tmpfilename); cli_errmsg("Can't make debugging file\n"); } else { FILE *tmpfp = fdopen(tmpfd, "w"); if(tmpfp) { while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL) fputs(buffer, tmpfp); fclose(tmpfp); rewind(fd); } else cli_errmsg("Can't fdopen debugging file\n"); } #endif if(fgets(buffer, sizeof(buffer) - 1, fd) == NULL) { /* empty message */ fclose(fd); #ifdef CL_DEBUG unlink(tmpfilename); #endif return CL_CLEAN; } #ifdef CL_THREAD_SAFE pthread_mutex_lock(&tables_mutex); #endif if(rfc821 == NULL) { assert(subtype == NULL); if(initialiseTables(&rfc821, &subtype) < 0) { rfc821 = NULL; subtype = NULL; #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&tables_mutex); #endif fclose(fd); #ifdef CL_DEBUG unlink(tmpfilename); #endif return CL_EMEM; } } #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&tables_mutex); #endif #ifdef HAVE_BACKTRACE segv = signal(SIGSEGV, sigsegv); #endif retcode = CL_SUCCESS; body = NULL; mctx.dir = dir; mctx.rfc821Table = rfc821; mctx.subtypeTable = subtype; mctx.ctx = ctx; /* * Is it a UNIX style mbox with more than one * mail message, or just a single mail message? * * TODO: It would be better if we called cli_scandir here rather than * in cli_scanmail. Then we could improve the way mailboxes with more * than one message is handled, e.g. 
stopping parsing when an infected * message is stopped, and giving a better indication of which message * within the mailbox is infected */ /*if((strncmp(buffer, "From ", 5) == 0) && isalnum(buffer[5])) {*/ if(strncmp(buffer, "From ", 5) == 0) { /* * Have been asked to check a UNIX style mbox file, which * may contain more than one e-mail message to decode * * It would be far better for scanners.c to do this splitting * and do this * FOR EACH mail in the mailbox * DO * pass this mail to cli_mbox -- * scan this file * IF this file has a virus quit * THEN * return CL_VIRUS * FI * END * This would remove a problem with this code that it can * fill up the tmp directory before it starts scanning */ bool lastLineWasEmpty; int messagenumber; message *m = messageCreate(); if(m == NULL) { fclose(fd); #ifdef HAVE_BACKTRACE signal(SIGSEGV, segv); #endif #ifdef CL_DEBUG unlink(tmpfilename); #endif return CL_EMEM; } lastLineWasEmpty = FALSE; messagenumber = 1; messageSetCTX(m, ctx); do { cli_chomp(buffer); /*if(lastLineWasEmpty && (strncmp(buffer, "From ", 5) == 0) && isalnum(buffer[5])) {*/ if(lastLineWasEmpty && (strncmp(buffer, "From ", 5) == 0)) { cli_dbgmsg("Deal with email number %d\n", messagenumber++); /* * End of a message in the mail box */ body = parseEmailHeaders(m, rfc821); if(body == NULL) { messageReset(m); continue; } messageSetCTX(body, ctx); messageDestroy(m); if(messageGetBody(body)) { int rc = parseEmailBody(body, NULL, &mctx); if(rc == 0) { messageReset(body); m = body; continue; } else if(rc == 3) { cli_dbgmsg("Message number %d is infected\n", messagenumber); retcode = CL_VIRUS; m = NULL; break; } } /* * Starting a new message, throw away all the * information about the old one. 
It would * be best to be able to scan this message * now, but cli_scanfile needs arguments * that haven't been passed here so it can't be * called */ m = body; messageReset(body); messageSetCTX(body, ctx); cli_dbgmsg("Finished processing message\n"); } else lastLineWasEmpty = (bool)(buffer[0] == '\0'); if(isuuencodebegin(buffer)) { /* * Fast track visa to uudecode. * TODO: binhex, yenc */ if(uudecodeFile(m, buffer, dir, fd) < 0) if(messageAddStr(m, buffer) < 0) break; } else if(messageAddStr(m, buffer) < 0) break; } while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL); fclose(fd); if(retcode == CL_SUCCESS) { cli_dbgmsg("Extract attachments from email %d\n", messagenumber); body = parseEmailHeaders(m, rfc821); } if(m) messageDestroy(m); } else { /* * It's a single message, parse the headers then the body */ if(strncmp(buffer, "P I ", 4) == 0) /* * CommuniGate Pro format: ignore headers until * blank line */ while((fgets(buffer, sizeof(buffer) - 1, fd) != NULL) && (strchr("\r\n", buffer[0]) == NULL)) ; /* * Ignore any blank lines at the top of the message */ while(strchr("\r\n", buffer[0]) && (getline_from_mbox(buffer, sizeof(buffer) - 1, fd) != NULL)) ; buffer[sizeof(buffer) - 1] = '\0'; body = parseEmailFile(fd, rfc821, buffer, dir); fclose(fd); } if(body) { /* * Write out the last entry in the mailbox */ if((retcode == CL_SUCCESS) && messageGetBody(body)) { messageSetCTX(body, ctx); switch(parseEmailBody(body, NULL, &mctx)) { case 0: retcode = CL_EFORMAT; break; case 3: retcode = CL_VIRUS; break; } } /* * Tidy up and quit */ messageDestroy(body); } cli_dbgmsg("cli_mbox returning %d\n", retcode); #ifdef HAVE_BACKTRACE signal(SIGSEGV, segv); #endif #ifdef CL_DEBUG unlink(tmpfilename); #endif return retcode; } /* * Read in an email message from fin, parse it, and return the message * * FIXME: files full of new lines and nothing else are * handled ungracefully... 
*/

/*
 * Parameters:
 *	fin		stream the rest of the message is read from, one line
 *			at a time, using getline_from_mbox()
 *	rfc821		table used (through tableFind) to recognise header names
 *	firstLine	first line of the message, already read by the caller;
 *			it is copied (bounded) into a local buffer
 *	dir		directory handed to uudecodeFile() for uuencoded data
 *
 * Only Content-Type, Content-Disposition and Content-Transfer-Encoding
 * headers are handed to parseEmailHeader(). Any other header is only used
 * (through usefulHeader) to decide whether the input looks like an e-mail.
 *
 * Returns a new message which the caller releases with messageDestroy(),
 * or NULL if the message could not be created or no headers were found.
 */
static message *
parseEmailFile(FILE *fin, const table_t *rfc821, const char *firstLine, const char *dir)
{
	bool inHeader = TRUE;
	bool bodyIsEmpty = TRUE;
	bool lastWasBlank = FALSE, lastBodyLineWasBlank = FALSE;
	message *ret;
	bool anyHeadersFound = FALSE;
	int commandNumber = -1;
	char *fullline = NULL, *boundary = NULL;
	size_t fulllinelength = 0;
	char buffer[RFC2821LENGTH + 1];

	cli_dbgmsg("parseEmailFile\n");

	ret = messageCreate();
	if(ret == NULL)
		return NULL;

	/*
	 * Bounded copy: never trust the caller's line to fit in our buffer
	 */
	strncpy(buffer, firstLine, sizeof(buffer) - 1);
	buffer[sizeof(buffer) - 1] = '\0';

	do {
		const char *line;

		(void)cli_chomp(buffer);

		/* line is NULL for an empty line, otherwise it aliases buffer */
		if(buffer[0] == '\0')
			line = NULL;
		else
			line = buffer;

		/*
		 * Don't blank lines which are only spaces from headers,
		 * otherwise they'll be treated as the end of header marker
		 */
		if(lastWasBlank) {
			lastWasBlank = FALSE;
			if(boundaryStart(buffer, boundary)) {
				cli_dbgmsg("Found a header line with space that should be blank\n");
				inHeader = FALSE;
			}
		}
		if(inHeader) {
			cli_dbgmsg("parseEmailFile: check '%s' fullline %p\n",
				buffer, (void *)fullline);

			/*
			 * The cast is needed because mail data may contain
			 * bytes >= 0x80, which are negative in a signed char
			 */
			if(line && isspace((unsigned char)line[0])) {
				char copy[sizeof(buffer)];

				strcpy(copy, buffer);
				strstrip(copy);
				if(copy[0] == '\0') {
					/*
					 * The header line contains only white
					 * space. This is not the end of the
					 * headers according to RFC2822, but
					 * some MUAs will handle it as though
					 * it were, and virus writers exploit
					 * this bug. We can't just break from
					 * the loop here since that would allow
					 * other exploits such as inserting a
					 * white space line before the
					 * content-type line. So we just have
					 * to make a best guess. Sigh.
					 */
					if(fullline) {
						if(parseEmailHeader(ret, fullline, rfc821) < 0)
							continue;

						free(fullline);
						fullline = NULL;
					}
					if(boundary ||
					   ((boundary = (char *)messageFindArgument(ret, "boundary")) != NULL)) {
						lastWasBlank = TRUE;
						continue;
					}
				}
			}
			if((line == NULL) && (fullline == NULL)) {	/* empty line */
				/*
				 * A blank line signifies the end of
				 * the header and the start of the text
				 */
				if(!anyHeadersFound)
					/* Ignore the junk at the top */
					continue;

				cli_dbgmsg("End of header information\n");
				inHeader = FALSE;
				bodyIsEmpty = TRUE;
			} else {
				char *ptr;
				int lookahead;

				if(fullline == NULL) {
					/* line cannot be NULL here, see the test above */
					char cmd[RFC2821LENGTH + 1], out[RFC2821LENGTH + 1];

					/*
					 * Continuation of line we're ignoring?
					 */
					if(isblank((unsigned char)line[0]))
						continue;

					/*
					 * Is this a header we're interested in?
					 */
					if((strchr(line, ':') == NULL) ||
					   (cli_strtokbuf(line, 0, ":", cmd) == NULL)) {
						if(strncmp(line, "From ", 5) == 0)
							anyHeadersFound = TRUE;
						continue;
					}

					ptr = rfc822comments(cmd, out);
					commandNumber = tableFind(rfc821, ptr ? ptr : cmd);

					switch(commandNumber) {
						case CONTENT_TRANSFER_ENCODING:
						case CONTENT_DISPOSITION:
						case CONTENT_TYPE:
							anyHeadersFound = TRUE;
							break;
						default:
							if(!anyHeadersFound)
								anyHeadersFound = usefulHeader(commandNumber, cmd);
							continue;
					}
					fullline = strdup(line);
					if(fullline == NULL) {
						/*
						 * Out of memory: skip this header,
						 * in the same way as a failed
						 * cli_realloc is handled below
						 */
						continue;
					}
					fulllinelength = strlen(line) + 1;
				} else if(line != NULL) {
					fulllinelength += strlen(line);
					ptr = cli_realloc(fullline, fulllinelength);
					if(ptr == NULL)
						continue;
					fullline = ptr;
					strcat(fullline, line);
				}

				assert(fullline != NULL);

				lookahead = getc(fin);
				if(lookahead != EOF) {
					ungetc(lookahead, fin);

					/*
					 * Section B.2 of RFC822 says TAB or
					 * SPACE means a continuation of the
					 * previous entry.
					 *
					 * Add all the arguments on the line
					 */
					if(isblank(lookahead))
						continue;
				}

				/*
				 * An odd number of quotes means the value
				 * carries on to the next line
				 */
				if(line && (count_quotes(fullline) & 1))
					continue;

				ptr = rfc822comments(fullline, NULL);
				if(ptr) {
					free(fullline);
					fullline = ptr;
				}

				if(parseEmailHeader(ret, fullline, rfc821) < 0)
					continue;

				free(fullline);
				fullline = NULL;
			}
		} else if(line && isuuencodebegin(line)) {
			/*
			 * Fast track visa to uudecode.
			 * TODO: binhex, yenc
			 */
			bodyIsEmpty = FALSE;
			if(uudecodeFile(ret, line, dir, fin) < 0)
				if(messageAddStr(ret, line) < 0)
					break;
		} else {
			if(line == NULL) {
				/*
				 * Although this would save time and RAM, some
				 * phish signatures have been built which need
				 * the blank lines
				 */
				if(lastBodyLineWasBlank &&
				   (messageGetMimeType(ret) != TEXT)) {
					cli_dbgmsg("Ignoring consecutive blank lines in the body\n");
					continue;
				}
				lastBodyLineWasBlank = TRUE;
			} else {
				if(bodyIsEmpty) {
					/*
					 * Broken message: new line in the
					 * middle of the headers, so the first
					 * line of the body is in fact
					 * the last lines of the header
					 */
					if(strncmp(line, "Message-Id: ", 12) == 0)
						continue;
					if(strncmp(line, "Date: ", 6) == 0)
						continue;
				}
				bodyIsEmpty = FALSE;
				lastBodyLineWasBlank = FALSE;
			}

			if(messageAddStr(ret, line) < 0)
				break;
		}
	} while(getline_from_mbox(buffer, sizeof(buffer) - 1, fin) != NULL);

	if(boundary)
		free(boundary);

	if(fullline) {
		/* A header we wanted was still being assembled at end of input */
		if(*fullline) switch(commandNumber) {
			case CONTENT_TRANSFER_ENCODING:
			case CONTENT_DISPOSITION:
			case CONTENT_TYPE:
				cli_dbgmsg("parseEmailFile: Fullline unparsed '%s'\n", fullline);
		}
		free(fullline);
	}

	if(!anyHeadersFound) {
		/*
		 * False positive in believing we have an e-mail when we don't
		 */
		messageDestroy(ret);
		cli_dbgmsg("parseEmailFile: no headers found, assuming it isn't an email\n");
		return NULL;
	}

	messageClean(ret);

	cli_dbgmsg("parseEmailFile: return\n");

	return ret;
}

/*
 * The given message contains a raw e-mail.
* * Returns the message's body with the correct arguments set * * The downside of this approach is that for a short time we have two copies * of the message in memory, the upside is that it makes for easier parsing * of encapsulated messages, and in the long run uses less memory in those * scenarios * * TODO: remove the duplication with parseEmailFile */ static message * parseEmailHeaders(message *m, const table_t *rfc821) { bool inHeader = TRUE; bool bodyIsEmpty = TRUE; const text *t; message *ret; bool anyHeadersFound = FALSE; int commandNumber = -1; char *fullline = NULL; size_t fulllinelength = 0; cli_dbgmsg("parseEmailHeaders\n"); if(m == NULL) return NULL; ret = messageCreate(); for(t = messageGetBody(m); t; t = t->t_next) { const char *buffer; if(t->t_line) buffer = lineGetData(t->t_line); else buffer = NULL; if(inHeader) { cli_dbgmsg("parseEmailHeaders: check '%s'\n", buffer ? buffer : ""); if(buffer == NULL) { /* * A blank line signifies the end of * the header and the start of the text */ cli_dbgmsg("End of header information\n"); if(!anyHeadersFound) { cli_dbgmsg("Nothing interesting in the header\n"); break; } inHeader = FALSE; bodyIsEmpty = TRUE; } else { char *ptr; if(fullline == NULL) { char cmd[RFC2821LENGTH + 1]; /* * Continuation of line we're ignoring? */ if(isblank(buffer[0])) continue; /* * Is this a header we're interested in? */ if((strchr(buffer, ':') == NULL) || (cli_strtokbuf(buffer, 0, ":", cmd) == NULL)) { if(strncmp(buffer, "From ", 5) == 0) anyHeadersFound = TRUE; continue; } ptr = rfc822comments(cmd, NULL); commandNumber = tableFind(rfc821, ptr ? 
ptr : cmd); if(ptr) free(ptr); switch(commandNumber) { case CONTENT_TRANSFER_ENCODING: case CONTENT_DISPOSITION: case CONTENT_TYPE: anyHeadersFound = TRUE; break; default: if(!anyHeadersFound) anyHeadersFound = usefulHeader(commandNumber, cmd); continue; } fullline = strdup(buffer); fulllinelength = strlen(buffer) + 1; } else if(buffer) { fulllinelength += strlen(buffer); ptr = cli_realloc(fullline, fulllinelength); if(ptr == NULL) continue; fullline = ptr; strcat(fullline, buffer); } assert(fullline != NULL); if(next_is_folded_header(t)) /* Add arguments to this line */ continue; if(count_quotes(fullline) & 1) continue; ptr = rfc822comments(fullline, NULL); if(ptr) { free(fullline); fullline = ptr; } if(parseEmailHeader(ret, fullline, rfc821) < 0) continue; free(fullline); fullline = NULL; } } else { if(bodyIsEmpty) { if(buffer == NULL) /* throw away leading blank lines */ continue; cli_dbgmsg("bodyIsEmpty, check \"%s\"\n", buffer); /* * Broken message: new line in the * middle of the headers, so the first * line of the body is in fact * the last lines of the header */ if(strncmp(buffer, "Message-Id: ", 12) == 0) continue; if(strncmp(buffer, "Date: ", 6) == 0) continue; bodyIsEmpty = FALSE; } /*if(t->t_line && isuuencodebegin(t->t_line)) puts("FIXME: add fast visa here");*/ /*cli_dbgmsg("Add line to body '%s'\n", buffer);*/ if(messageAddLine(ret, t->t_line) < 0) break; } } if(fullline) { if(*fullline) switch(commandNumber) { case CONTENT_TRANSFER_ENCODING: case CONTENT_DISPOSITION: case CONTENT_TYPE: cli_dbgmsg("parseEmailHeaders: Fullline unparsed '%s'\n", fullline); } free(fullline); } if(!anyHeadersFound) { /* * False positive in believing we have an e-mail when we don't */ messageDestroy(ret); cli_dbgmsg("parseEmailHeaders: no headers found, assuming it isn't an email\n"); return NULL; } messageClean(ret); cli_dbgmsg("parseEmailHeaders: return\n"); return ret; } /* * Handle a header line of an email message */ static int parseEmailHeader(message *m, const char 
*line, const table_t *rfc821) { char *cmd; int ret = -1; #ifdef CL_THREAD_SAFE char *strptr; #endif const char *separater; char *copy, tokenseparater[2]; cli_dbgmsg("parseEmailHeader '%s'\n", line); /* * In RFC822 the separater between the key a value is a colon, * e.g. Content-Transfer-Encoding: base64 * However some MUA's are lapse about this and virus writers exploit * this hole, so we need to check all known possiblities */ for(separater = ":= "; *separater; separater++) if(strchr(line, *separater) != NULL) break; if(*separater == '\0') return -1; copy = rfc2047(line); if(copy == NULL) /* an RFC checker would return -1 here */ copy = strdup(line); tokenseparater[0] = *separater; tokenseparater[1] = '\0'; #ifdef CL_THREAD_SAFE cmd = strtok_r(copy, tokenseparater, &strptr); #else cmd = strtok(copy, tokenseparater); #endif if(cmd && (strstrip(cmd) > 0)) { #ifdef CL_THREAD_SAFE char *arg = strtok_r(NULL, "", &strptr); #else char *arg = strtok(NULL, ""); #endif if(arg) /* * Found a header such as * Content-Type: multipart/mixed; * set arg to be * "multipart/mixed" and cmd to * be "Content-Type" */ ret = parseMimeHeader(m, cmd, rfc821, arg); } free(copy); return ret; } /* * This is a recursive routine. * FIXME: We are not passed &mrec so we can't check against MAX_MAIL_RECURSION * * This function parses the body of mainMessage and saves its attachments in dir * * mainMessage is the buffer to be parsed, it contains an e-mail's body, without * any headers. First time of calling it'll be * the whole message. 
Later it'll be parts of a multipart message
 * textIn is the plain text message being built up so far
 * mctx supplies the scan context (ctx), the output directory (dir) and the
 *	lookup tables (rfc821Table, subtypeTable) used below
 *
 * messageIn itself is never destroyed here, at most it is reset; any other
 * message that mainMessage comes to point at is destroyed before returning.
 * The text list is only destroyed here when textIn is NULL.
 *
 * Returns:
 *	0 for fail
 *	1 for success, attachments saved
 *	2 for success, attachments not saved
 *	3 for virus found
 */
static int	/* success or fail */
parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx)
{
	int rc = 1;
	text *aText = textIn;
	message *mainMessage = messageIn;
	fileblob *fb;
	bool infected = FALSE;
#ifdef CL_EXPERIMENTAL
	const int doPhishingScan = !(mctx->ctx->options&CL_SCAN_NOPHISHING); /* || (mctx->ctx->options&CL_SCAN_PHISHING_GA_TRAIN) || (mctx->ctx->options&CL_SCAN_PHISHING_GA); kept here for the GA MERGE */
#endif

	cli_dbgmsg("in parseEmailBody\n");

	/* Anything left to be parsed? */
	if(mainMessage && (messageGetBody(mainMessage) != NULL)) {
		mime_type mimeType;
		int subtype, inhead, htmltextPart, inMimeHead, i;
		const char *mimeSubtype, *boundary;
		char *protocol;
		const text *t_line;
		/*bool isAlternative;*/
		message *aMessage;
		int multiparts = 0;
		message **messages = NULL;	/* parts of a multipart message */

		cli_dbgmsg("Parsing mail file\n");

		mimeType = messageGetMimeType(mainMessage);
		mimeSubtype = messageGetMimeSubtype(mainMessage);

		/* pre-process */
		subtype = tableFind(mctx->subtypeTable, mimeSubtype);
		if((mimeType == TEXT) && (subtype == PLAIN)) {
			/*
			 * This is effectively no encoding, notice that we
			 * don't check that charset is us-ascii
			 */
			cli_dbgmsg("assume no encoding\n");
			mimeType = NOMIME;
			messageSetMimeSubtype(mainMessage, "");
		} else if((mimeType == MESSAGE) &&
			  (strcasecmp(mimeSubtype, "rfc822-headers") == 0)) {
			/*
			 * RFC1892/RFC3462: section 2 text/rfc822-headers
			 * incorrectly sent as message/rfc822-headers
			 *
			 * Parse as text/plain, i.e. no mime
			 */
			cli_dbgmsg("Changing message/rfc822-headers to text/rfc822-headers\n");
			mimeType = NOMIME;
			messageSetMimeSubtype(mainMessage, "");
		} else
			cli_dbgmsg("mimeType = %d\n", mimeType);

		/*
		 * Cases that "break" out of this switch carry on to the
		 * bounce/text handling after it; MULTIPART (once a boundary
		 * line has been found) and MESSAGE return from inside it
		 */
		switch(mimeType) {
			case NOMIME:
				cli_dbgmsg("Not a mime encoded message\n");
				aText = textAddMessage(aText, mainMessage);
#ifdef CL_EXPERIMENTAL
				if(!doPhishingScan)
					break;
				/*
				 * Fall through: some phishing mails claim they are
				 * text/plain, when they are in fact html
				 */
#else
				break;
#endif
			case TEXT:
				/* text/plain has been preprocessed as no encoding */
#ifdef CL_EXPERIMENTAL
				if((subtype == HTML) || doPhishingScan) {
#else
				if((mctx->ctx->options&CL_SCAN_MAILURL) && (subtype == HTML))
#endif
					/*
					 * It would be better to save and scan the
					 * file and only checkURLs if it's found to be
					 * clean
					 */
					checkURLs(mainMessage, mctx, &rc, (subtype == HTML));
#ifdef CL_EXPERIMENTAL
					/*
					 * There might be html sent without subtype
					 * html too, so scan them for phishing
					 */
					if(rc == 3)
						infected = TRUE;
				}
#endif
				break;
			case MULTIPART:
				cli_dbgmsg("Content-type 'multipart' handler\n");
				/*
				 * boundary is freed with free() on every path
				 * below, so messageFindArgument presumably
				 * returns allocated memory - TODO confirm
				 */
				boundary = messageFindArgument(mainMessage, "boundary");

				if(boundary == NULL) {
					cli_warnmsg("Multipart/%s MIME message contains no boundary header\n", mimeSubtype);
					/* Broken e-mail message */
					mimeType = NOMIME;
					/*
					 * The break means that we will still
					 * check if the file contains a uuencoded file
					 */
					break;
				}

				/* Perhaps it should assume mixed? */
				if(mimeSubtype[0] == '\0') {
					cli_warnmsg("Multipart has no subtype assuming alternative\n");
					mimeSubtype = "alternative";
					messageSetMimeSubtype(mainMessage, "alternative");
				}

				/*
				 * Get to the start of the first message
				 */
				t_line = messageGetBody(mainMessage);

				if(t_line == NULL) {
					cli_warnmsg("Multipart MIME message has no body\n");
					free((char *)boundary);
					mimeType = NOMIME;
					break;
				}

				do
					if(t_line->t_line) {
						if(boundaryStart(lineGetData(t_line->t_line), boundary))
							break;
						/*
						 * Found a binhex file before
						 *	the first multipart
						 * TODO: check yEnc
						 */
						if(binhexBegin(mainMessage) == t_line) {
							if(exportBinhexMessage(mctx->dir, mainMessage)) {
								/* virus found */
								rc = 3;
								infected = TRUE;
								break;
							}
						} else if(encodingLine(mainMessage) == t_line->t_next) {
							/*
							 * We look for the next line
							 * since later on we'll skip
							 * over the important line when
							 * we think it's a blank line
							 * at the top of the message -
							 * which it would have been in
							 * an RFC compliant world
							 */
							cli_dbgmsg("Found MIME attachment before the first MIME section\n");
							if(messageGetEncoding(mainMessage) == NOENCODING)
								break;
						}
					}
				while((t_line = t_line->t_next) != NULL);

				if(t_line == NULL) {
					cli_dbgmsg("Multipart MIME message contains no boundary lines (%s)\n",
						boundary);
					/*
					 * Free added by Thomas Lamy
					 *
					 */
					free((char *)boundary);
					mimeType = NOMIME;
					/*
					 * The break means that we will still
					 * check if the file contains a yEnc/binhex file
					 */
					break;
				}
				/*
				 * Build up a table of all of the parts of this
				 * multipart message. Remember, each part may itself
				 * be a multipart message.
				 */
				inhead = 1;
				inMimeHead = 0;

				/*
				 * Re-read this variable in case mimeSubtype has changed
				 */
				subtype = tableFind(mctx->subtypeTable, mimeSubtype);

				/*
				 * Parse the mainMessage object and create an array
				 * of objects called messages, one for each of the
				 * multiparts that mainMessage contains.
				 *
				 * This looks like parseEmailHeaders() - maybe there's
				 * some duplication of code to be cleaned up
				 *
				 * We may need to create an array rather than just
				 * save each part as it is found because not all
				 * elements will need scanning, and we don't yet know
				 * which of those elements it will be, except in
				 * the case of mixed, when all parts need to be scanned.
				 */
				for(multiparts = 0; t_line && !infected; multiparts++) {
					int lines = 0;
					message **m;

					m = cli_realloc(messages, ((multiparts + 1) * sizeof(message *)));
					if(m == NULL)
						break;
					messages = m;

					aMessage = messages[multiparts] = messageCreate();
					if(aMessage == NULL) {
						/* cancels the multiparts++ of the for loop */
						multiparts--;
						continue;
					}
					messageSetCTX(aMessage, mctx->ctx);

					cli_dbgmsg("Now read in part %d\n", multiparts);

					/*
					 * Ignore blank lines. There shouldn't be ANY
					 * but some viruses insert them
					 */
					while((t_line = t_line->t_next) != NULL)
						if(t_line->t_line &&
						   /*(cli_chomp(t_line->t_text) > 0))*/
						   (strlen(lineGetData(t_line->t_line)) > 0))
							break;

					if(t_line == NULL) {
						cli_dbgmsg("Empty part\n");
						/*
						 * Remove this part unless there's
						 * a binhex portion somewhere in
						 * the complete message that we may
						 * throw away by mistake if the MIME
						 * encoding information is incorrect
						 */
						if(mainMessage &&
						   (binhexBegin(mainMessage) == NULL)) {
							messageDestroy(aMessage);
							--multiparts;
						}
						continue;
					}

					/*
					 * Read this part up to the next boundary
					 * line, the end of message line or the end
					 * of the text.
					 * inhead: reading the headers of the part
					 * inMimeHead: reading a header continuation
					 */
					do {
						const char *line = lineGetData(t_line->t_line);

						/*cli_dbgmsg("multipart %d: inMimeHead %d inhead %d boundary '%s' line '%s' next '%s'\n",
							multiparts, inMimeHead, inhead, boundary, line,
							t_line->t_next && t_line->t_next->t_line ? lineGetData(t_line->t_next->t_line) : "(null)");*/

						if(inMimeHead) {	/* continuation line */
							if(line == NULL) {
								/*inhead =*/ inMimeHead = 0;
								continue;
							}
							/*
							 * Handle continuation lines
							 * because the previous line
							 * ended with a ; or this line
							 * starts with a white space
							 */
							cli_dbgmsg("Multipart %d: About to add mime Argument '%s'\n",
								multiparts, line);
							/*
							 * Handle the case when it
							 * isn't really a continuation
							 * line:
							 * Content-Type: application/octet-stream;
							 * Content-Transfer-Encoding: base64
							 */
							parseEmailHeader(aMessage, line, mctx->rfc821Table);

							while(isspace((int)*line))
								line++;

							if(*line == '\0') {
								inhead = inMimeHead = 0;
								continue;
							}
							inMimeHead = FALSE;
							messageAddArgument(aMessage, line);
						} else if(inhead) {	/* handling normal headers */
							/*int quotes;*/
							char *fullline, *ptr;

							if(line == NULL) {
								/*
								 * empty line, should the end of the headers,
								 * but some base64 decoders, e.g. uudeview, are broken
								 * and will handle this type of entry, decoding the
								 * base64 content...
								 * Content-Type: application/octet-stream; name=text.zip
								 * Content-Transfer-Encoding: base64
								 * Content-Disposition: attachment; filename="text.zip"
								 *
								 * Content-Disposition: attachment;
								 *	filename=text.zip
								 * Content-Type: application/octet-stream;
								 *	name=text.zip
								 * Content-Transfer-Encoding: base64
								 *
								 * UEsDBAoAAAAAAACgPjJ2RHw676gAAO+oAABEAAAAbWFpbF90ZXh0LWluZm8udHh0ICAgICAgICAg
								 */
								const text *next = t_line->t_next;

								if(next && next->t_line) {
									const char *data = lineGetData(next->t_line);

									if((messageGetEncoding(aMessage) == NOENCODING) &&
									   (messageGetMimeType(aMessage) == APPLICATION) &&
									   strstr(data, "base64")) {
										/*
										 * Handle this nightmare (note the blank
										 * line in the header and the incorrect
										 * content-transfer-encoding header)
										 *
										 * Content-Type: application/octet-stream; name="zipped_files.EXEX-Spanska: Yes
										 *
										 * r-Encoding: base64
										 * Content-Disposition: attachment; filename="zipped_files.EXE"
										 */
										messageSetEncoding(aMessage, "base64");
										cli_dbgmsg("Ignoring fake end of headers\n");
										continue;
									}
									if((strncmp(data, "Content", 7) == 0) ||
									   (strncmp(data, "filename=", 9) == 0)) {
										cli_dbgmsg("Ignoring fake end of headers\n");
										continue;
									}
								}
								cli_dbgmsg("Multipart %d: End of header information\n",
									multiparts);
								inhead = 0;
								continue;
							}
							if(isspace((int)*line)) {
								/*
								 * The first line is
								 * continuation line.
								 * This is tricky
								 * to handle, but
								 * all we can do is our
								 * best
								 */
								cli_dbgmsg("Part %d starts with a continuation line\n",
									multiparts);
								messageAddArgument(aMessage, line);
								/*
								 * Give it a default
								 * MIME type since
								 * that may be the
								 * missing line
								 *
								 * Choose application to
								 * force a save
								 */
								if(messageGetMimeType(aMessage) == NOMIME)
									messageSetMimeType(aMessage, "application");
								continue;
							}

							inMimeHead = FALSE;

							assert(strlen(line) <= RFC2821LENGTH);

							/*
							 * NOTE(review): the result of strdup
							 * is not checked before it is used
							 */
							fullline = rfc822comments(line, NULL);
							if(fullline == NULL)
								fullline = strdup(line);

							/*quotes = count_quotes(fullline);*/

							/*
							 * Fold next lines to the end of this
							 * if they start with a white space
							 * or if this line has an odd number of quotes:
							 * Content-Type: application/octet-stream; name="foo
							 * "
							 */
							while(t_line && next_is_folded_header(t_line)) {
								const char *data;

								t_line = t_line->t_next;

								data = lineGetData(t_line->t_line);

								if(data[1] == '\0') {
									/*
									 * Broken message: the
									 * blank line at the end
									 * of the headers isn't blank -
									 * it contains a space
									 */
									cli_dbgmsg("Multipart %d: headers not terminated by blank line\n",
										multiparts);
									inhead = FALSE;
									break;
								}

								ptr = cli_realloc(fullline,
									strlen(fullline) + strlen(data) + 1);

								if(ptr == NULL)
									break;

								fullline = ptr;
								strcat(fullline, data);

								/*quotes = count_quotes(data);*/
							}

							cli_dbgmsg("Multipart %d: About to parse folded header '%s'\n",
								multiparts, fullline);

							parseEmailHeader(aMessage, fullline, mctx->rfc821Table);
							free(fullline);
						} else if(endOfMessage(line, boundary)) {
							/*
							 * Some viruses put information
							 * *after* the end of message,
							 * which presumably some broken
							 * mail clients find, so we
							 * can't assume that this
							 * is the end of the message
							 */
							/* t_line = NULL;*/
							break;
						} else if(boundaryStart(line, boundary)) {
							/* the next part starts with its headers */
							inhead = 1;
							break;
						} else {
							if(messageAddLine(aMessage, t_line->t_line) < 0)
								break;
							lines++;
						}
					} while((t_line = t_line->t_next) != NULL);

					cli_dbgmsg("Part %d has %d lines\n", multiparts, lines);

					/*
					 * Only save in the array of messages if some
					 * decision will be taken on whether to scan.
					 * If all parts will be scanned then save to
					 * file straight away
					 */
					switch(subtype) {
						case MIXED:
						case ALTERNATIVE:
						case REPORT:
						case DIGEST:
						case APPLEDOUBLE:
						case KNOWBOT:
						case -1:
							mainMessage = do_multipart(mainMessage,
								messages, multiparts,
								&rc, mctx, messageIn,
								&aText);
							/*
							 * Cancels the multiparts++ of
							 * the for loop, so the same
							 * slot of messages is used for
							 * the next part
							 */
							--multiparts;
							if(rc == 3)
								infected = TRUE;
							break;
						default:
							messageClean(aMessage);
					}
				}

				free((char *)boundary);

				/*
				 * Preprocess. Anything special to be done before
				 * we handle the multiparts?
				 */
				switch(subtype) {
					case KNOWBOT:
						/* TODO */
						cli_dbgmsg("multipart/knowbot parsed as multipart/mixed for now\n");
						mimeSubtype = "mixed";
						break;
					case -1:
						/*
						 * According to section 7.2.6 of
						 * RFC1521, unrecognised multiparts
						 * should be treated as multipart/mixed.
						 */
						cli_dbgmsg("Unsupported multipart format `%s', parsed as mixed\n", mimeSubtype);
						mimeSubtype = "mixed";
						break;
				}

				/*
				 * We've finished message we're parsing
				 */
				if(mainMessage && (mainMessage != messageIn)) {
					messageDestroy(mainMessage);
					mainMessage = NULL;
				}

				cli_dbgmsg("The message has %d parts\n", multiparts);

				if(((multiparts == 0) || infected) && (aText == NULL)) {
					if(messages) {
						for(i = 0; i < multiparts; i++)
							if(messages[i])
								messageDestroy(messages[i]);
						free(messages);
					}

					/*
					 * FIXME: we could return 2 here when we have
					 * saved stuff earlier
					 */
					return (rc == 3) ? 3 : 2;	/* Nothing to do */
				}

				cli_dbgmsg("Find out the multipart type (%s)\n", mimeSubtype);

				/*
				 * We now have all the parts of the multipart message
				 * in the messages array:
				 *	message *messages[multiparts]
				 * Let's decide what to do with them all
				 */
				switch(tableFind(mctx->subtypeTable, mimeSubtype)) {
				case RELATED:
					cli_dbgmsg("Multipart related handler\n");
					/*
					 * Have a look to see if there's HTML code
					 * which will need scanning
					 */
					aMessage = NULL;
					assert(multiparts > 0);

					htmltextPart = getTextPart(messages, multiparts);

					if(htmltextPart >= 0)
						aText = textAddMessage(aText, messages[htmltextPart]);
					else
						/*
						 * There isn't an HTML bit. If there's a
						 * multipart bit, it'll may be in there
						 * somewhere
						 */
						for(i = 0; i < multiparts; i++)
							if(messageGetMimeType(messages[i]) == MULTIPART) {
								aMessage = messages[i];
								htmltextPart = i;
								break;
							}

					if(htmltextPart == -1)
						cli_dbgmsg("No HTML code found to be scanned\n");
					else {
						rc = parseEmailBody(aMessage, aText, mctx);
						if(rc == 1) {
							assert(aMessage == messages[htmltextPart]);
							messageDestroy(aMessage);
							messages[htmltextPart] = NULL;
						}
					}

					/*
					 * Fixed based on an idea from Stephen White
					 * The message is confused about the difference
					 * between alternative and related. Badtrans.B
					 * suffers from this problem.
					 *
					 * Fall through in this case:
					 * Content-Type: multipart/related;
					 *	type="multipart/alternative"
					 */
					/*
					 * Changed to always fall through based on
					 * an idea from Michael Dankov
					 * that some viruses are completely confused
					 * about the difference between related
					 * and mixed
					 */
					/*cptr = messageFindArgument(mainMessage, "type");
					if(cptr == NULL)
						break;
					isAlternative = (bool)(strcasecmp(cptr, "multipart/alternative") == 0);
					free((char *)cptr);
					if(!isAlternative)
						break;*/
				case DIGEST:
					/*
					 * According to section 5.1.5 RFC2046, the
					 * default mime type of multipart/digest parts
					 * is message/rfc822
					 *
					 * We consider them as alternative, wrong in
					 * the strictest sense since they aren't
					 * alternatives - all parts a valid - but it's
					 * OK for our needs since it means each part
					 * will be scanned
					 */
				case ALTERNATIVE:
					cli_dbgmsg("Multipart alternative handler\n");

					/*
					 * Fall through - some clients are broken and
					 * say alternative instead of mixed. The Klez
					 * virus is broken that way, and anyway we
					 * wish to scan all of the alternatives
					 */
				case REPORT:
					/*
					 * According to section 1 of RFC1892, the
					 * syntax of multipart/report is the same
					 * as multipart/mixed. There are some required
					 * parameters, but there's no need for us to
					 * verify that they exist
					 */
				case MIXED:
				case APPLEDOUBLE:	/* not really supported */
					/*
					 * Look for attachments
					 *
					 * Not all formats are supported. If an
					 * unsupported format turns out to be
					 * common enough to implement, it is a simple
					 * matter to add it
					 */
					if(aText) {
						if(mainMessage && (mainMessage != messageIn))
							messageDestroy(mainMessage);
						mainMessage = NULL;
					}

					cli_dbgmsg("Mixed message with %d parts\n", multiparts);
					for(i = 0; i < multiparts; i++) {
						mainMessage = do_multipart(mainMessage,
							messages, i, &rc, mctx,
							messageIn, &aText);
						if(rc == 3) {
							infected = TRUE;
							break;
						}
					}

					/* rc = parseEmailBody(NULL, NULL, mctx); */
					break;
				case SIGNED:
				case PARALLEL:
					/*
					 * If we're here it could be because we have a
					 * multipart/mixed message, consisting of a
					 * message followed by an attachment. That
					 * message itself is a multipart/alternative
					 * message and we need to dig out the plain
					 * text part of that alternative
					 */
					/*
					 * NOTE(review): unlike RELATED there is no
					 * check here that multiparts > 0 before
					 * messages[0] is used
					 */
					htmltextPart = getTextPart(messages, multiparts);
					if(htmltextPart == -1)
						htmltextPart = 0;

					rc = parseEmailBody(messages[htmltextPart], aText, mctx);
					break;
				case ENCRYPTED:
					rc = 0;
					protocol = (char *)messageFindArgument(mainMessage, "protocol");
					if(protocol) {
						if(strcasecmp(protocol, "application/pgp-encrypted") == 0) {
							/* RFC2015 */
							cli_warnmsg("PGP encoded attachment not scanned\n");
							rc = 2;
						} else
							cli_warnmsg("Unknown encryption protocol '%s' - if you believe this file contains a virus, submit it to www.clamav.net\n", protocol);
						free(protocol);
					} else
						cli_dbgmsg("Encryption method missing protocol name\n");

					break;
				default:
					assert(0);
				}

				/*
				 * Tidy up after the multipart handling: release
				 * the working message, save and release the text
				 * (only if it isn't the caller's), then release
				 * every part still held in messages
				 */
				if(mainMessage && (mainMessage != messageIn))
					messageDestroy(mainMessage);

				if(aText && (textIn == NULL)) {
					if((!infected) && (fb = fileblobCreate()) != NULL) {
						cli_dbgmsg("Save non mime and/or text/plain part\n");
						fileblobSetFilename(fb, mctx->dir, "textpart");
						/*fileblobAddData(fb, "Received: by clamd (textpart)\n", 30);*/
						fileblobSetCTX(fb, mctx->ctx);
						(void)textToFileblob(aText, fb, 1);

						fileblobDestroy(fb);
					}
					textDestroy(aText);
				}

				for(i = 0; i < multiparts; i++)
					if(messages[i])
						messageDestroy(messages[i]);

				if(messages)
					free(messages);

				return rc;

			case MESSAGE:
				/*
				 * Check for forbidden encodings
				 */
				switch(messageGetEncoding(mainMessage)) {
					case NOENCODING:
					case EIGHTBIT:
					case BINARY:
						break;
					default:
						cli_warnmsg("MIME type 'message' cannot be decoded\n");
						break;
				}
				rc = 0;
				if((strcasecmp(mimeSubtype, "rfc822") == 0) ||
				   (strcasecmp(mimeSubtype, "delivery-status") == 0)) {
					/*
					 * The encapsulated message is parsed into
					 * a message of its own, which is scanned
					 * by a recursive call
					 */
					message *m = parseEmailHeaders(mainMessage, mctx->rfc821Table);
					if(m) {
						cli_dbgmsg("Decode rfc822\n");

						messageSetCTX(m, mctx->ctx);

						if(mainMessage && (mainMessage != messageIn)) {
							messageDestroy(mainMessage);
							mainMessage = NULL;
						} else
							messageReset(mainMessage);
						if(messageGetBody(m))
							rc = parseEmailBody(m, NULL, mctx);

						messageDestroy(m);
					}
					break;
				} else if(strcasecmp(mimeSubtype, "disposition-notification") == 0) {
					/* RFC 2298 - handle like a normal email */
					rc = 1;
					break;
				} else if(strcasecmp(mimeSubtype, "partial") == 0) {
#ifdef	PARTIAL_DIR
					/* RFC1341 message split over many emails */
					if(rfc1341(mainMessage, mctx->dir) >= 0)
						rc = 1;
#else
					cli_warnmsg("Partial message received from MUA/MTA - message cannot be scanned\n");
					rc = 0;
#endif
				} else if(strcasecmp(mimeSubtype, "external-body") == 0)
					/* TODO */
					cli_warnmsg("Attempt to send Content-type message/external-body trapped");
				else
					cli_warnmsg("Unsupported message format `%s' - if you believe this file contains a virus, submit it to www.clamav.net\n", mimeSubtype);

				if(mainMessage && (mainMessage != messageIn))
					messageDestroy(mainMessage);
				if(messages)
					free(messages);
				return rc;

			case APPLICATION:
				/*cptr = messageGetMimeSubtype(mainMessage);

				if((strcasecmp(cptr, "octet-stream") == 0) ||
				   (strcasecmp(cptr, "x-msdownload") == 0)) {*/
				{
					fb = messageToFileblob(mainMessage, mctx->dir, 1);

					if(fb) {
						cli_dbgmsg("Saving main message as attachment\n");
						fileblobDestroy(fb);
						if(mainMessage != messageIn) {
							messageDestroy(mainMessage);
							mainMessage = NULL;
						} else
							messageReset(mainMessage);
					}
				} /*else
					cli_warnmsg("Discarded application not sent as attachment\n");*/
				break;

			case AUDIO:
			case VIDEO:
			case IMAGE:
				break;

			default:
				cli_warnmsg("Message received with unknown mime encoding");
				break;
		}

		if(messages) {
			/* "can't happen" */
			cli_warnmsg("messages != NULL, report to http://bugs.clamav.net\n");
			free(messages);
		}
	}

	if(aText && (textIn == NULL)) {
		/* Look for a bounce in the text (non mime encoded) portion */
		const text *t;

		for(t = aText; t; t = t->t_next) {
			const line_t *l = t->t_line;
			const text *lookahead, *topofbounce;
			const char *s;
			bool inheader;

			if(l == NULL)
				continue;

			if(!isBounceStart(lineGetData(l)))
				continue;

			/*
			 * We've found what looks like the start of a bounce
			 * message. Only bother saving if it really is a bounce
			 * message, this helps to speed up scanning of ping-pong
			 * messages that have lots of bounces within bounces in
			 * them
			 */
			for(lookahead = t->t_next; lookahead; lookahead = lookahead->t_next) {
				l = lookahead->t_line;

				if(l == NULL)
					break;
				s = lineGetData(l);
				if(strncasecmp(s, "Content-Type:", 13) == 0)
					/*
					 * Don't bother with plain/text or
					 * plain/html
					 */
					if(strstr(s, "text/") == NULL)
						/*
						 * Don't bother to save the unuseful
						 * part
						 */
						break;
			}

			if(lookahead && (lookahead->t_line == NULL)) {
				cli_dbgmsg("Non mime part bounce message is not mime encoded, so it will not be scanned\n");
				t = lookahead;
				/* look for next bounce message */
				continue;
			}

			/*
			 * Prescan the bounce message to see if there's likely
			 * to be anything nasty.
			 * This algorithm is hand crafted and may be breakable
			 * so all submissions are welcome. It's best NOT to
			 * remove this however you may be tempted, because it
			 * significantly speeds up the scanning of multiple
			 * bounces (i.e. bounces within many bounces)
			 */
			for(; lookahead; lookahead = lookahead->t_next) {
				l = lookahead->t_line;

				if(l) {
					s = lineGetData(l);
					if((strncasecmp(s, "Content-Type:", 13) == 0) &&
					   (strstr(s, "multipart/") == NULL) &&
					   (strstr(s, "message/rfc822") == NULL) &&
					   (strstr(s, "text/plain") == NULL))
						break;
				}
			}
			if(lookahead == NULL) {
				cli_dbgmsg("cli_mbox: I believe it's plain text which must be clean\n");
				/* nothing here, move along please */
				break;
			}
			if((fb = fileblobCreate()) == NULL)
				break;
			cli_dbgmsg("Save non mime part bounce message\n");
			fileblobSetFilename(fb, mctx->dir, "bounce");
			fileblobAddData(fb, (unsigned char *)"Received: by clamd (bounce)\n", 28);
			fileblobSetCTX(fb, mctx->ctx);

			/*
			 * Copy lines into the fileblob until the text ends,
			 * another bounce starts after the first blank line,
			 * or the fileblob is found to contain a virus.
			 * topofbounce is the first blank line that was seen
			 */
			inheader = TRUE;
			topofbounce = NULL;
			do {
				l = t->t_line;

				if(l == NULL) {
					if(inheader) {
						inheader = FALSE;
						topofbounce = t;
					}
				} else {
					s = lineGetData(l);
					fileblobAddData(fb, (unsigned char *)s, strlen(s));
				}
				fileblobAddData(fb, (unsigned char *)"\n", 1);
				lookahead = t->t_next;
				if(lookahead == NULL)
					break;
				t = lookahead;
				l = t->t_line;
				if((!inheader) && l) {
					s = lineGetData(l);
					if(isBounceStart(s)) {
						cli_dbgmsg("Found the start of another bounce candidate (%s)\n", s);
						break;
					}
				}
			} while(!fileblobContainsVirus(fb));

			fileblobDestroy(fb);
			/* carry on looking from the first blank line of this bounce */
			if(topofbounce)
				t = topofbounce;
			/*
			 * Don't do this - it slows bugs.txt
			 */
			/*if(mainMessage)
				mainMessage->bounce = NULL;*/
		}
		textDestroy(aText);
		aText = NULL;
	}

	/*
	 * No attachments - scan the text portions, often files
	 * are hidden in HTML code
	 */
	if(mainMessage && (rc != 3)) {
		text *t_line;

		/*
		 * Look for uu-encoded main file
		 */
		if((encodingLine(mainMessage) != NULL) &&
		   ((t_line = bounceBegin(mainMessage)) != NULL)) {
			if(exportBounceMessage(t_line, mctx))
				rc = 1;
		} else {
			bool saveIt;

			if(messageGetMimeType(mainMessage) == MESSAGE)
				/*
				 * Quick peek, if the encapsulated
				 * message has no
				 * content encoding statement don't
				 * bother saving to scan, it's safe
				 */
				saveIt = (bool)(encodingLine(mainMessage) != NULL);
			else if((t_line = encodingLine(mainMessage)) != NULL) {
				/*
				 * Some bounces include the message
				 * body without the headers.
				 * FIXME: Unfortunately this generates a
				 * lot of false positives that a bounce
				 * has been found when it hasn't.
				 */
				if((fb = fileblobCreate()) != NULL) {
					cli_dbgmsg("Found a bounce message with no header at '%s'\n",
						lineGetData(t_line->t_line));
					fileblobSetFilename(fb, mctx->dir, "bounce");
					fileblobAddData(fb,
						(const unsigned char *)"Received: by clamd (bounce)\n",
						28);

					/*fileblobSetCTX(fb, ctx);*/
					fileblobDestroy(textToFileblob(t_line, fb, 1));
				}
				saveIt = FALSE;
			} else
				/*
				 * Save the entire text portion,
				 * since it it may be an HTML file with
				 * a JavaScript virus or a phish
				 */
				saveIt = TRUE;

			if(saveIt) {
				cli_dbgmsg("Saving text part to scan\n");
				saveTextPart(mainMessage, mctx->dir, 1);
				if(mainMessage != messageIn) {
					messageDestroy(mainMessage);
					mainMessage = NULL;
				} else
					messageReset(mainMessage);
				rc = 1;
			}
		}
	} else
		rc = 2;	/* nothing saved */

	if(mainMessage && (mainMessage != messageIn))
		messageDestroy(mainMessage);

	/* a virus found earlier takes priority, unless the parsing failed */
	if((rc != 0) && infected)
		rc = 3;

	cli_dbgmsg("parseEmailBody() returning %d\n", rc);

	return rc;
}

/*
 * Is the current line the start of a new section?
 *
 * New sections start with --boundary
 */
static int
boundaryStart(const char *line, const char *boundary)
{
	char *ptr, *out;
	int rc;
	char buf[RFC2821LENGTH + 1];

	if(line == NULL)
		return 0;	/* empty line */
	if(boundary == NULL)
		return 0;

	/*cli_dbgmsg("boundaryStart: line = '%s' boundary = '%s'\n", line, boundary);*/

	if((*line != '-') && (*line != '('))
		return 0;

	if(strchr(line, '-') == NULL)
		return 0;

	if(strlen(line) <= sizeof(buf)) {
		out = NULL;
		ptr = rfc822comments(line, buf);
	} else
		out = ptr = rfc822comments(line, NULL);

	if(ptr == NULL)
		ptr = (char *)line;

	if(*ptr++ != '-') {
		if(out)
			free(out);
		return 0;
	}

	/*
	 * Gibe.B3 is broken, it has:
	 *	boundary="---- =_NextPart_000_01C31177.9DC7C000"
	 * but it's boundaries look like
	 *	------ =_NextPart_000_01C31177.9DC7C000
	 * notice the one too few '-'.
/*
 * Is the current line the end of a multipart message?
 *
 * The message ends with --boundary--
 *
 * line is the NUL terminated line to test and boundary is the boundary
 * string without its leading "--". Either may be NULL, in which case there
 * is no match. The boundary is compared without regard to case.
 *
 * Returns 1 if line is the closing delimiter, 0 otherwise
 */
static int
endOfMessage(const char *line, const char *boundary)
{
	size_t len;

	if(line == NULL)
		return 0;
	if(boundary == NULL)	/* strlen(NULL) below would crash */
		return 0;

	if(*line++ != '-')
		return 0;
	if(*line++ != '-')
		return 0;

	len = strlen(boundary);
	if(strncasecmp(line, boundary, len) != 0)
		return 0;

	/*
	 * The match above proves that line holds at least len characters, so
	 * line[len] is at worst the terminating NUL and the tests below never
	 * read past it.
	 *
	 * Only the two characters straight after the boundary are examined,
	 * rather than insisting that the line stops there, because some
	 * broken mails have white space after the closing "--"
	 */
	line = &line[len];
	if(*line++ != '-')
		return 0;
	return *line == '-';
}
tableCreate(); assert(*subtypeTable != NULL); for(tableinit = mimeSubtypes; tableinit->key; tableinit++) if(tableInsert(*subtypeTable, tableinit->key, tableinit->value) < 0) { tableDestroy(*rfc821Table); tableDestroy(*subtypeTable); *rfc821Table = NULL; *subtypeTable = NULL; return -1; } return 0; } /* * If there's a HTML text version use that, otherwise * use the first text part, otherwise just use the * first one around. HTML text is most likely to include * a scripting worm * * If we can't find one, return -1 */ static int getTextPart(message *const messages[], size_t size) { size_t i; int textpart = -1; for(i = 0; i < size; i++) { assert(messages[i] != NULL); if(messageGetMimeType(messages[i]) == TEXT) { if(strcasecmp(messageGetMimeSubtype(messages[i]), "html") == 0) return (int)i; textpart = (int)i; } } return textpart; } /* * strip - * Remove the trailing spaces from a buffer. Don't call this directly, * always call strstrip() which is a wrapper to this routine to be used with * NUL terminated strings. This code looks a bit strange because of it's * heritage from code that worked on strings that weren't necessarily NUL * terminated. 
/*
 * strip -
 *	Remove the trailing spaces from a buffer. Don't call this directly,
 * always call strstrip() which is a wrapper to this routine to be used with
 * NUL terminated strings. This code looks a bit strange because of its
 * heritage from code that worked on strings that weren't necessarily NUL
 * terminated.
 * TODO: rewrite for clamAV
 *
 * buf is modified in place: the stripped characters are overwritten with NUL.
 * len is sizeof(buf) not strlen(buf), i.e. it counts the terminating NUL. It
 * must be int not size_t because of the >= 0 test. If len is larger than
 * strlen(buf) + 1 nothing is changed and strlen(buf) is returned.
 *
 * Which characters count as trailing space depends on the platform, see the
 * two loops below.
 *
 * Returns the new length (a la strlen), 0 if buf is NULL or len <= 0
 */
static size_t
strip(char *buf, int len)
{
	register char *ptr;
	register size_t i;

	if((buf == NULL) || (len <= 0))
		return 0;

	i = strlen(buf);
	if(len > (int)(i + 1))
		return i;
	ptr = &buf[--len];

	/*
	 * The <ctype.h> functions are only defined for EOF and values that fit
	 * in an unsigned char, so bytes >= 0x80 must not be passed as a
	 * (possibly negative) plain char
	 */
#if	defined(UNIX) || defined(C_LINUX) || defined(C_DARWIN)
	/* watch - it may be in shared text area */
	do
		if(*ptr)
			*ptr = '\0';
	while((--len >= 0) && (!isgraph((unsigned char)*--ptr)) && (*ptr != '\n') && (*ptr != '\r'));
#else	/* more characters can be displayed on DOS */
	do
#ifndef	REAL_MODE_DOS
		if(*ptr)	/* C8.0 puts into a text area */
#endif
			*ptr = '\0';
	while((--len >= 0) && ((*--ptr == '\0') || (isspace((unsigned char)*ptr))));
#endif
	return((size_t)(len + 1));
}

/*
 * strstrip:
 *	Strip the trailing spaces from a given NUL terminated string, in place.
 *
 * Returns the new length of the string, 0 if s is NULL
 */
size_t
strstrip(char *s)
{
	if(s == (char *)NULL)
		return(0);

	return(strip(s, (int)strlen(s) + 1));
}
There are no default subtypes" * * We have to break this and make an assumption * for the subtype because virus writers and * email client writers don't get it right */ cli_warnmsg("Empty content-type received, no subtype specified, assuming text/plain; charset=us-ascii\n"); else if(strchr(ptr, '/') == NULL) /* * Empty field, such as * Content-Type: * which I believe is illegal according to * RFC1521 */ cli_dbgmsg("Invalid content-type '%s' received, no subtype specified, assuming text/plain; charset=us-ascii\n", ptr); else { int i; char *mimeArgs; /* RHS of the ; */ /* * Some clients are broken and * put white space after the ; */ if(*arg == '/') { cli_warnmsg("Content-type '/' received, assuming application/octet-stream\n"); messageSetMimeType(m, "application"); messageSetMimeSubtype(m, "octet-stream"); } else { /* * The content type could be in quotes: * Content-Type: "multipart/mixed" * FIXME: this is a hack in that ignores * the quotes, it doesn't handle * them properly */ while(isspace(*ptr)) ptr++; if(ptr[0] == '\"') ptr++; if(ptr[0] != '/') { char *s; char *mimeType; /* LHS of the ; */ #ifdef CL_THREAD_SAFE char *strptr = NULL; #endif s = mimeType = cli_strtok(ptr, 0, ";"); /* * Handle * Content-Type: foo/bar multipart/mixed * and * Content-Type: multipart/mixed foo/bar */ if(s && *s) for(;;) { #ifdef CL_THREAD_SAFE int set = messageSetMimeType(m, strtok_r(s, "/", &strptr)); #else int set = messageSetMimeType(m, strtok(s, "/")); #endif /* * Stephen White * Some clients put space after * the mime type but before * the ; */ #ifdef CL_THREAD_SAFE s = strtok_r(NULL, ";", &strptr); #else s = strtok(NULL, ";"); #endif if(s == NULL) break; if(set) { size_t len = strstrip(s) - 1; if(s[len] == '\"') { s[len] = '\0'; len = strstrip(s); } if(len) { if(strchr(s, ' ')) { char *t = cli_strtok(s, 0, " "); messageSetMimeSubtype(m, t); free(t); } else messageSetMimeSubtype(m, s); } } while(*s && !isspace(*s)) s++; if(*s++ == '\0') break; if(*s == '\0') break; } if(mimeType) 
free(mimeType); } } /* * Add in all rest of the the arguments. * e.g. if the header is this: * Content-Type:', arg='multipart/mixed; boundary=foo * we find the boundary argument set it */ i = 1; while((mimeArgs = cli_strtok(ptr, i++, ";")) != NULL) { cli_dbgmsg("mimeArgs = '%s'\n", mimeArgs); messageAddArguments(m, mimeArgs); free(mimeArgs); } } break; case CONTENT_TRANSFER_ENCODING: messageSetEncoding(m, ptr); break; case CONTENT_DISPOSITION: p = cli_strtok(ptr, 0, ";"); if(p) { if(*p) { messageSetDispositionType(m, p); free(p); p = cli_strtok(ptr, 1, ";"); messageAddArgument(m, p); } free(p); } if((p = (char *)messageFindArgument(m, "filename")) == NULL) /* * Handle this type of header, without * a filename (e.g. some Worm.Torvil.D) * Content-ID: * Content-Transfer-Encoding: base64 * Content-Disposition: attachment */ messageAddArgument(m, "filename=unknown"); else free(p); } if(copy) free(copy); return 0; } /* * Save the text portion of the message */ static void saveTextPart(message *m, const char *dir, int destroy_text) { fileblob *fb; messageAddArgument(m, "filename=textportion"); if((fb = messageToFileblob(m, dir, destroy_text)) != NULL) { /* * Save main part to scan that */ cli_dbgmsg("Saving main message\n"); fileblobDestroy(fb); } } /* * Handle RFC822 comments in headers. * If out == NULL, return a buffer without the comments, the caller must free * the returned buffer * Return NULL on error or if the input * has no comments. 
/*
 * Handle RFC822 comments, i.e. text in parentheses, in headers.
 * See section 3.4.3 of RFC822
 *
 * If out is NULL the result goes into a buffer of strlen(in) + 1 bytes
 * obtained from cli_malloc(), which the caller must free. Otherwise it is
 * written to out, which must not be the same buffer as in.
 *
 * Comments may nest. Parentheses between double quotes are copied, not
 * treated as a comment. A backslash is dropped and the character after it is
 * copied as it stands when outside a comment; a lone backslash at the very
 * end of the input is kept.
 *
 * Returns the buffer holding the text without the comments, or NULL if in is
 * NULL, if in contains no '(' at all, or if memory can't be allocated.
 *
 * TODO: handle comments that go on to more than one line
 */
static char *
rfc822comments(const char *in, char *out)
{
	const char *src;
	char *dst;
	int escaped = 0;	/* previous character was an unescaped backslash */
	int quoted = 0;		/* between double quotes */
	int depth = 0;		/* how many comments deep we are */

	if((in == NULL) || (strchr(in, '(') == NULL))
		return NULL;

	assert(out != in);

	if(out == NULL) {
		out = cli_malloc(strlen(in) + 1);
		if(out == NULL)
			return NULL;
	}

	dst = out;

	cli_dbgmsg("rfc822comments: contains a comment\n");

	for(src = in; *src != '\0'; src++) {
		const char c = *src;

		if(escaped) {
			if(depth == 0)
				*dst++ = c;
			escaped = 0;
			continue;
		}
		if(c == '\\') {
			escaped = 1;
			continue;
		}
		if(c == '\"') {
			/* quotes are always copied, even inside a comment */
			*dst++ = c;
			quoted = !quoted;
			continue;
		}
		if(c == '(') {
			if(quoted)
				*dst++ = c;
			else
				depth++;
			continue;
		}
		if(c == ')') {
			if(quoted)
				*dst++ = c;
			else if(depth > 0)
				depth--;
			continue;
		}
		if(depth == 0)
			*dst++ = c;
	}

	if(escaped)	/* last character was a single backslash */
		*dst++ = '\\';
	*dst = '\0';

	cli_dbgmsg("rfc822comments '%s'=>'%s'\n", in, out);

	return out;
}
Returns a malloc'd buffer that the caller must * free, or NULL on error */ static char * rfc2047(const char *in) { char *out, *pout; size_t len; if((strstr(in, "=?") == NULL) || (strstr(in, "?=") == NULL)) return strdup(in); cli_dbgmsg("rfc2047 '%s'\n", in); out = cli_malloc(strlen(in) + 1); if(out == NULL) return NULL; pout = out; /* For each RFC2047 string */ while(*in) { char encoding, *ptr, *enctext; message *m; blob *b; /* Find next RFC2047 string */ while(*in) { if((*in == '=') && (in[1] == '?')) { in += 2; break; } *pout++ = *in++; } /* Skip over charset, find encoding */ while((*in != '?') && *in) in++; if(*in == '\0') break; encoding = *++in; encoding = tolower(encoding); if((encoding != 'q') && (encoding != 'b')) { cli_warnmsg("Unsupported RFC2047 encoding type '%c' - if you believe this file contains a virus, submit it to www.clamav.net\n", encoding); free(out); out = NULL; break; } /* Skip to encoded text */ if(*++in != '?') break; if(*++in == '\0') break; enctext = strdup(in); if(enctext == NULL) { free(out); out = NULL; break; } in = strstr(in, "?="); if(in == NULL) { free(enctext); break; } in += 2; ptr = strstr(enctext, "?="); assert(ptr != NULL); *ptr = '\0'; /*cli_dbgmsg("Need to decode '%s' with method '%c'\n", enctext, encoding);*/ m = messageCreate(); if(m == NULL) break; messageAddStr(m, enctext); free(enctext); switch(encoding) { case 'q': messageSetEncoding(m, "quoted-printable"); break; case 'b': messageSetEncoding(m, "base64"); break; } b = messageToBlob(m, 1); len = blobGetDataSize(b); cli_dbgmsg("Decoded as '%*.*s'\n", len, len, blobGetData(b)); memcpy(pout, blobGetData(b), len); blobDestroy(b); messageDestroy(m); if(pout[len - 1] == '\n') pout += len - 1; else pout += len; } if(out == NULL) return NULL; *pout = '\0'; cli_dbgmsg("rfc2047 returns '%s'\n", out); return out; } #ifdef PARTIAL_DIR /* * Handle partial messages */ static int rfc1341(message *m, const char *dir) { fileblob *fb; char *arg, *id, *number, *total, *oldfilename; 
const char *tmpdir; char pdir[NAME_MAX + 1]; id = (char *)messageFindArgument(m, "id"); if(id == NULL) return -1; #ifdef C_CYGWIN if((tmpdir = getenv("TEMP")) == (char *)NULL) if((tmpdir = getenv("TMP")) == (char *)NULL) if((tmpdir = getenv("TMPDIR")) == (char *)NULL) tmpdir = "C:\\"; #else if((tmpdir = getenv("TMPDIR")) == (char *)NULL) if((tmpdir = getenv("TMP")) == (char *)NULL) if((tmpdir = getenv("TEMP")) == (char *)NULL) #ifdef P_tmpdir tmpdir = P_tmpdir; #else tmpdir = "/tmp"; #endif #endif snprintf(pdir, sizeof(pdir) - 1, "%s/clamav-partial", tmpdir); if((mkdir(pdir, 0700) < 0) && (errno != EEXIST)) { cli_errmsg("Can't create the directory '%s'\n", pdir); return -1; } else { struct stat statb; if(stat(pdir, &statb) < 0) { cli_errmsg("Can't stat the directory '%s'\n", pdir); return -1; } if(statb.st_mode & 077) cli_warnmsg("Insecure partial directory %s (mode 0%o)\n", pdir, statb.st_mode & 0777); } number = (char *)messageFindArgument(m, "number"); if(number == NULL) { free(id); return -1; } oldfilename = (char *)messageFindArgument(m, "filename"); if(oldfilename == NULL) oldfilename = (char *)messageFindArgument(m, "name"); arg = cli_malloc(10 + strlen(id) + strlen(number)); if(arg) { sprintf(arg, "filename=%s%s", id, number); messageAddArgument(m, arg); free(arg); } if(oldfilename) { cli_warnmsg("Must reset to %s\n", oldfilename); free(oldfilename); } if((fb = messageToFileblob(m, pdir, 0)) == NULL) { free(id); free(number); return -1; } fileblobDestroy(fb); total = (char *)messageFindArgument(m, "total"); cli_dbgmsg("rfc1341: %s, %s of %s\n", id, number, (total) ? 
total : "?"); if(total) { int n = atoi(number); int t = atoi(total); DIR *dd = NULL; free(total); /* * If it's the last one - reassemble it * FIXME: this assumes that we receive the parts in order */ if((n == t) && ((dd = opendir(pdir)) != NULL)) { FILE *fout; char outname[NAME_MAX + 1]; time_t now; snprintf(outname, sizeof(outname) - 1, "%s/%s", dir, id); cli_dbgmsg("outname: %s\n", outname); fout = fopen(outname, "wb"); if(fout == NULL) { cli_errmsg("Can't open '%s' for writing", outname); free(id); free(number); closedir(dd); return -1; } time(&now); for(n = 1; n <= t; n++) { char filename[NAME_MAX + 1]; const struct dirent *dent; #if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) union { struct dirent d; char b[offsetof(struct dirent, d_name) + NAME_MAX + 1]; } result; #endif snprintf(filename, sizeof(filename), "%s%d", id, n); #ifdef HAVE_READDIR_R_3 while((readdir_r(dd, &result.d, &dent) == 0) && dent) { #elif defined(HAVE_READDIR_R_2) while((dent = (struct dirent *)readdir_r(dd, &result.d))) { #else /*!HAVE_READDIR_R*/ while((dent = readdir(dd))) { #endif FILE *fin; char buffer[BUFSIZ], fullname[NAME_MAX + 1]; int nblanks; extern short cli_leavetemps_flag; struct stat statb; #ifndef C_CYGWIN if(dent->d_ino == 0) continue; #endif snprintf(fullname, sizeof(fullname) - 1, "%s/%s", pdir, dent->d_name); if(strncmp(filename, dent->d_name, strlen(filename)) != 0) { if(!cli_leavetemps_flag) continue; if(stat(fullname, &statb) < 0) continue; if(now - statb.st_mtime > (time_t)(7 * 24 * 3600)) if(unlink(fullname) >= 0) cli_warnmsg("removed old RFC1341 file %s\n", fullname); continue; } fin = fopen(fullname, "rb"); if(fin == NULL) { cli_errmsg("Can't open '%s' for reading", fullname); fclose(fout); unlink(outname); free(id); free(number); closedir(dd); return -1; } nblanks = 0; while(fgets(buffer, sizeof(buffer) - 1, fin) != NULL) /* * Ensure that trailing newlines * aren't copied */ if(buffer[0] == '\n') nblanks++; else { if(nblanks) do putc('\n', fout); 
while(--nblanks > 0); fputs(buffer, fout); } fclose(fin); /* don't unlink if leave temps */ if(!cli_leavetemps_flag) unlink(fullname); break; } rewinddir(dd); } closedir(dd); fclose(fout); } } free(number); free(id); return 0; } #endif #ifdef CL_EXPERIMENTAL static void hrefs_done(blob *b, tag_arguments_t *hrefs) { if(b) blobDestroy(b); html_tag_arg_free(hrefs); } /* * This used to be part of checkURLs, split out, because phishingScan needs it * too, and phishingScan might be used in situations where checkURLs is * disabled (see ifdef) */ static blob * getHrefs(message *m, tag_arguments_t *hrefs) { blob *b = messageToBlob(m, 0); size_t len; if(b == NULL) return NULL; len = blobGetDataSize(b); if(len == 0) { blobDestroy(b); return NULL; } /* TODO: make this size customisable */ if(len > 100*1024) { cli_warnmsg("Viruses pointed to by URLs not scanned in large message\n"); blobDestroy(b); return NULL; } hrefs->count = 0; hrefs->tag = hrefs->value = NULL; hrefs->contents = NULL; cli_dbgmsg("getHrefs: calling html_normalise_mem\n"); if(!html_normalise_mem(blobGetData(b), (off_t)len, NULL, hrefs)) { blobDestroy(b); return NULL; } cli_dbgmsg("getHrefs: html_normalise_mem returned\n"); /* TODO: Do we need to call remove_html_comments? 
*/ return b; } static void checkURLs(message *mainMessage, mbox_ctx *mctx, int *rc, int is_html) { tag_arguments_t hrefs; blob *b; hrefs.scanContents = (!(mctx->ctx->options&CL_SCAN_NOPHISHING)); /* aCaB: stripped GA related stuff */ #if (!defined(FOLLOWURLS)) || (FOLLOWURLS <= 0) if(!hrefs.scanContents) /* * Don't waste time extracting hrefs (parsing html), nobody * will need it */ return; #endif hrefs.count = 0; hrefs.tag = hrefs.value = NULL; hrefs.contents = NULL; b = getHrefs(mainMessage, &hrefs); if(b) { if(!(mctx->ctx->options&CL_SCAN_NOPHISHING)) { if(phishingScan(mainMessage, mctx->dir, mctx->ctx, &hrefs) == CL_VIRUS) { mainMessage->isInfected = TRUE; *rc = 3; cli_dbgmsg("PH:Phishing found\n"); } } if(is_html && (mctx->ctx->options&CL_SCAN_MAILURL) && (*rc != 3)) do_checkURLs(mainMessage, mctx->dir, &hrefs); } hrefs_done(b,&hrefs); } #if defined(FOLLOWURLS) && (FOLLOWURLS > 0) static void do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs) { table_t *t; int i, n; #ifdef CL_THREAD_SAFE pthread_t tid[FOLLOWURLS]; struct arg args[FOLLOWURLS]; #endif t = tableCreate(); if(t == NULL) return; n = 0; for(i = 0; i < hrefs->count; i++) { const char *url = (const char *)hrefs->value[i]; /* * TODO: If it's an image source, it'd be nice to note beacons * where width="0" height="0", which needs support from * the HTML normalise code */ if(strncasecmp("http://", url, 7) == 0) { char *ptr; #ifndef CL_THREAD_SAFE struct arg arg; #endif char name[NAME_MAX + 1]; if(tableFind(t, url) == 1) { cli_dbgmsg("URL %s already downloaded\n", url); continue; } /* * What about foreign character spoofing? * It would be useful be able to check if url * is the same as the text displayed, e.g. 
* www.paypal.com * but that needs support from HTML normalise */ if(strchr(url, '%') && strchr(url, '@')) cli_warnmsg("Possible URL spoofing attempt noticed, but not yet handled (%s)\n", url); if(n == FOLLOWURLS) { cli_warnmsg("URL %s will not be scanned\n", url); break; } (void)tableInsert(t, url, 1); cli_dbgmsg("Downloading URL %s to be scanned\n", url); strncpy(name, url, sizeof(name) - 1); name[sizeof(name) - 1] = '\0'; for(ptr = name; *ptr; ptr++) if(*ptr == '/') *ptr = '_'; #ifdef CL_THREAD_SAFE args[n].dir = dir; args[n].url = strdup(url); args[n].filename = strdup(name); pthread_create(&tid[n], NULL, getURL, &args[n]); #else arg.url = strdup(url); arg.dir = dir; arg.filename = name; getURL(&arg); free(arg.url); #endif ++n; } } tableDestroy(t); #ifdef CL_THREAD_SAFE assert(n <= FOLLOWURLS); cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n); while(--n >= 0) { pthread_join(tid[n], NULL); free(args[n].filename); free(args[n].url); } #endif } #else static void do_checkURLs(message *m, const char *dir, tag_arguments_t *hrefs) { } #endif #else /*!CL_EXPERIMENTAL*/ #if defined(FOLLOWURLS) && (FOLLOWURLS > 0) static void checkURLs(message *m, mbox_ctx *mctx, int *rc, int is_html) { blob *b = messageToBlob(m, 0); size_t len; table_t *t; int i, n; #if defined(WITH_CURL) && defined(CL_THREAD_SAFE) pthread_t tid[FOLLOWURLS]; struct arg args[FOLLOWURLS]; #endif tag_arguments_t hrefs; if(b == NULL) return; len = blobGetDataSize(b); if(len == 0) { blobDestroy(b); return; } /* TODO: make this size customisable */ if(len > 100*1024) { cli_warnmsg("Viruses pointed to by URL not scanned in large message\n"); blobDestroy(b); return; } t = tableCreate(); if(t == NULL) { blobDestroy(b); return; } hrefs.count = 0; hrefs.tag = hrefs.value = NULL; cli_dbgmsg("checkURLs: calling html_normalise_mem\n"); if(!html_normalise_mem(blobGetData(b), len, NULL, &hrefs)) { blobDestroy(b); tableDestroy(t); return; } cli_dbgmsg("checkURLs: html_normalise_mem returned\n"); /* TODO: 
Do we need to call remove_html_comments? */ n = 0; for(i = 0; i < hrefs.count; i++) { const char *url = (const char *)hrefs.value[i]; /* * TODO: If it's an image source, it'd be nice to note beacons * where width="0" height="0", which needs support from * the HTML normalise code */ if(strncasecmp("http://", url, 7) == 0) { char *ptr; #ifdef WITH_CURL #ifndef CL_THREAD_SAFE struct arg arg; #endif #else /*!WITH_CURL*/ #ifdef CL_THREAD_SAFE static pthread_mutex_t system_mutex = PTHREAD_MUTEX_INITIALIZER; #endif struct stat statb; char cmd[512]; #endif /*WITH_CURL*/ char name[NAME_MAX + 1]; if(tableFind(t, url) == 1) { cli_dbgmsg("URL %s already downloaded\n", url); continue; } /* * What about foreign character spoofing? * It would be useful be able to check if url * is the same as the text displayed, e.g. * www.paypal.com * but that needs support from HTML normalise */ if(strchr(url, '%') && strchr(url, '@')) cli_warnmsg("Possible URL spoofing attempt noticed, but not yet handled (%s)\n", url); if(n == FOLLOWURLS) { cli_warnmsg("URL %s will not be scanned\n", url); break; } (void)tableInsert(t, url, 1); cli_dbgmsg("Downloading URL %s to be scanned\n", url); strncpy(name, url, sizeof(name) - 1); name[sizeof(name) - 1] = '\0'; for(ptr = name; *ptr; ptr++) if(*ptr == '/') *ptr = '_'; #ifdef WITH_CURL #ifdef CL_THREAD_SAFE args[n].curl = curl_easy_init(); if(args[n].curl == NULL) { cli_errmsg("curl_easy_init failed\n"); continue; } args[n].dir = mctx->dir; args[n].url = url; args[n].filename = strdup(name); pthread_create(&tid[n], NULL, getURL, &args[n]); #else /* easy isn't the word I'd use... 
*/ arg.curl = curl_easy_init(); if(arg.curl == NULL) { cli_errmsg("curl_easy_init failed\n"); continue; } arg.url = url; arg.dir = mctx->dir; arg.filename = name; getURL(&arg); curl_easy_cleanup(arg.curl); #endif #else /*!WITH_CURL*/ cli_warnmsg("The use of mail-follow-urls without CURL being installed is deprecated\n"); /* * TODO: maximum size and timeouts */ len = sizeof(cmd) - 26 - strlen(mctx->dir) - strlen(name); #ifdef CL_DEBUG snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s", len, url, mctx->dir, name); #else snprintf(cmd, sizeof(cmd) - 1, "GET -t10 \"%.*s\" >%s/%s 2>/dev/null", len, url, mctx->dir, name); #endif cmd[sizeof(cmd) - 1] = '\0'; cli_dbgmsg("%s\n", cmd); #ifdef CL_THREAD_SAFE pthread_mutex_lock(&system_mutex); #endif system(cmd); #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&system_mutex); #endif snprintf(cmd, sizeof(cmd), "%s/%s", mctx->dir, name); if(stat(cmd, &statb) >= 0) if(statb.st_size == 0) { cli_warnmsg("URL %s failed to download\n", url); /* * Don't bother scanning an empty file */ (void)unlink(cmd); } #endif ++n; } } blobDestroy(b); tableDestroy(t); #if defined(WITH_CURL) && defined(CL_THREAD_SAFE) assert(n <= FOLLOWURLS); cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n); while(--n >= 0) { pthread_join(tid[n], NULL); free(args[n].filename); curl_easy_cleanup(args[n].curl); } #endif html_tag_arg_free(&hrefs); } #else static void checkURLs(message *m, mbox_ctx *mctx, int* rc, int is_html) { } #endif #endif /* CL_EXPERIMENTAL */ #if defined(FOLLOWURLS) && (FOLLOWURLS > 0) /* * Includes some Win32 patches by Gianluigi Tiesi * * FIXME: Often WMF exploits work by sending people an email directing them * to a page which displays a picture containing the exploit. This is not * currently found, since only the HTML on the referred page is downloaded. * It would be useful to scan the HTML for references to pictures and * download them for scanning. But that will hit performance so there is * an issue here. 
*/ #if defined(CL_EXPERIMENTAL) || (!defined(WITH_CURL)) /* * Removing the reliance on libcurl * Includes some of the freshclam hacks by Everton da Silva Marques * everton.marques@gmail.com> */ #ifndef timercmp # define timercmp(a, b, cmp) \ (((a)->tv_sec == (b)->tv_sec) ? \ ((a)->tv_usec cmp (b)->tv_usec) : \ ((a)->tv_sec cmp (b)->tv_sec)) #endif /* timercmp */ #ifndef timersub # define timersub(a, b, result) \ do { \ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ if ((result)->tv_usec < 0) { \ --(result)->tv_sec; \ (result)->tv_usec += 1000000; \ } \ } while (0) #endif /* timersub */ static long nonblock_fcntl(int sock); static void restore_fcntl(int sock, long fcntl_flags); static int nonblock_connect(int sock, const struct sockaddr *addr, socklen_t addrlen, int secs); static int connect_error(int sock); static int my_r_gethostbyname(const char *hostname, struct hostent *hp, char *buf, size_t len); #define NONBLOCK_SELECT_MAX_FAILURES 3 #define NONBLOCK_MAX_BOGUS_LOOPS 10 static void * #ifdef CL_THREAD_SAFE getURL(void *a) #else getURL(struct arg *arg) #endif { FILE *fp; #ifdef CL_THREAD_SAFE struct arg *arg = (struct arg *)a; #endif const char *url = arg->url; const char *dir = arg->dir; const char *filename = arg->filename; char fout[NAME_MAX + 1]; #ifdef C_WINDOWS SOCKET sd; #else int sd; #endif int n; struct sockaddr_in server; #ifdef HAVE_IN_ADDR_T in_addr_t ip; #else unsigned int ip; #endif char buf[BUFSIZ]; char site[BUFSIZ]; in_port_t port; static in_port_t default_port; static int tcp; int doingsite, firstpacket; char *ptr; int flags, via_proxy; const char *proxy; if(strlen(url) > (sizeof(site) - 1)) { cli_dbgmsg("Ignoring long URL \"%s\"\n", url); return NULL; } snprintf(fout, sizeof(fout) - 1, "%s/%s", dir, filename); fp = fopen(fout, "wb"); if(fp == NULL) { cli_errmsg("Can't open '%s' for writing", fout); return NULL; } cli_dbgmsg("Saving %s to %s\n", url, fout); if(tcp == 0) { const struct protoent 
*proto = getprotobyname("tcp"); if(proto == NULL) { cli_warnmsg("Unknown prototol tcp, check /etc/protocols\n"); fclose(fp); return NULL; } tcp = proto->p_proto; #ifndef C_WINDOWS endprotoent(); #endif } if(default_port == 0) { const struct servent *servent = getservbyname("http", "tcp"); if(servent) default_port = (in_port_t)ntohs(servent->s_port); else default_port = 80; #ifndef C_WINDOWS endservent(); #endif } port = default_port; doingsite = 1; ptr = site; proxy = getenv("http_proxy"); /* FIXME: handle no_proxy */ via_proxy = (proxy && *proxy); if(via_proxy) { if(strncasecmp(proxy, "http://", 7) != 0) { cli_warnmsg("Unsupported proxy protocol\n"); fclose(fp); return NULL; } cli_dbgmsg("Getting %s via %s\n", url, proxy); proxy += 7; while(*proxy) { if(doingsite && (*proxy == ':')) { port = 0; while(isdigit(*++proxy)) { port *= 10; port += *proxy - '0'; } continue; } if(doingsite && (*proxy == '/')) { proxy++; break; } *ptr++ = *proxy++; } } else { cli_dbgmsg("Getting %s\n", url); if(strncasecmp(url, "http://", 7) != 0) { cli_warnmsg("Unsupported protocol\n"); fclose(fp); return NULL; } url += 7; while(*url) { if(doingsite && (*url == ':')) { port = 0; while(isdigit(*++url)) { port *= 10; port += *url - '0'; } continue; } if(doingsite && (*url == '/')) { url++; break; } *ptr++ = *url++; } } *ptr = '\0'; memset((char *)&server, '\0', sizeof(struct sockaddr_in)); server.sin_family = AF_INET; server.sin_port = (in_port_t)htons(port); ip = inet_addr(site); #ifdef INADDR_NONE if(ip == INADDR_NONE) { #else if(ip == (in_addr_t)-1) { #endif struct hostent h; if((my_r_gethostbyname(site, &h, buf, sizeof(buf)) != 0) || (h.h_addr_list == NULL) || (h.h_addr == NULL)) { cli_dbgmsg("Unknown host %s\n", site); fclose(fp); return NULL; } memcpy((char *)&ip, h.h_addr, sizeof(ip)); } server.sin_addr.s_addr = ip; if((sd = socket(AF_INET, SOCK_STREAM, tcp)) < 0) { fclose(fp); return NULL; } flags = nonblock_fcntl(sd); if(nonblock_connect(sd, (struct sockaddr *)&server, sizeof(struct 
sockaddr_in), 5) < 0) { closesocket(sd); fclose(fp); return NULL; } restore_fcntl(sd, flags); /* * TODO: consider HTTP/1.1 */ if(via_proxy) snprintf(buf, sizeof(buf) - 1, "GET %s HTTP/1.0\nUser-Agent: www.clamav.net\n\n", url); else snprintf(buf, sizeof(buf) - 1, "GET /%s HTTP/1.0\nUser-Agent: www.clamav.net\n\n", url); /*cli_dbgmsg("%s", buf);*/ if(send(sd, buf, (int)strlen(buf), 0) < 0) { closesocket(sd); fclose(fp); return NULL; } #ifdef SHUT_WR shutdown(sd, SHUT_WR); #else shutdown(sd, 1); #endif firstpacket = 1; for(;;) { fd_set set; struct timeval tv; FD_ZERO(&set); FD_SET(sd, &set); tv.tv_sec = 30; /* FIXME: make this customisable */ tv.tv_usec = 0; if(select(sd + 1, &set, NULL, NULL, &tv) < 0) { if(errno == EINTR) continue; closesocket(sd); fclose(fp); return NULL; } if(!FD_ISSET(sd, &set)) { fclose(fp); closesocket(sd); return NULL; } n = recv(sd, buf, BUFSIZ, 0); if(n < 0) { fclose(fp); closesocket(sd); return NULL; } if(n == 0) break; /* * FIXME: Handle header in more than one packet */ if(firstpacket) { char *statusptr; buf[n] = '\0'; statusptr = cli_strtok(buf, 1, " "); if(statusptr) { int status = atoi(statusptr); cli_dbgmsg("HTTP status %d\n", status); free(statusptr); if((status == 301) || (status == 302)) { char *location; location = strstr(buf, "\nLocation: "); if(location) { char *end; fclose(fp); closesocket(sd); unlink(fout); location += 11; free(arg->url); end = location; while(*end && (*end != '\n')) end++; *end = '\0'; arg->url = strdup(location); cli_dbgmsg("Redirecting to %s\n", arg->url); return getURL(arg); } } } /* * Don't write the HTTP header */ ptr = strstr(buf, "\n\n"); if(ptr != NULL) { ptr += 2; n -= (int)(ptr - buf); } else ptr = buf; firstpacket = 0; } else ptr = buf; if(fwrite(ptr, n, 1, fp) != 1) { cli_warnmsg("Error writing %d bytes to %s\n", n, fout); break; } } fclose(fp); closesocket(sd); return NULL; } /* * Have a copy here because r_gethostbyname is in shared not libclamav :-( */ static int my_r_gethostbyname(const char 
*hostname, struct hostent *hp, char *buf, size_t len) { #if defined(HAVE_GETHOSTBYNAME_R_6) /* e.g. Linux */ struct hostent *hp2; int ret = -1; if((hostname == NULL) || (hp == NULL)) return -1; if(gethostbyname_r(hostname, hp, buf, len, &hp2, &ret) < 0) return ret; #elif defined(HAVE_GETHOSTBYNAME_R_5) /* e.g. BSD, Solaris, Cygwin */ int ret = -1; if((hostname == NULL) || (hp == NULL)) return -1; if(gethostbyname_r(hostname, hp, buf, len, &ret) == NULL) return ret; #elif defined(HAVE_GETHOSTBYNAME_R_3) /* e.g. HP/UX, AIX */ if((hostname == NULL) || (hp == NULL)) return -1; if(gethostbyname_r(hostname, &hp, (struct hostent_data *)buf) < 0) return h_errno; #else /* Single thread the code e.g. VS2005 */ struct hostent *hp2; #ifdef CL_THREAD_SAFE static pthread_mutex_t hostent_mutex = PTHREAD_MUTEX_INITIALIZER; #endif if((hostname == NULL) || (hp == NULL)) return -1; #ifdef CL_THREAD_SAFE pthread_mutex_lock(&hostent_mutex); #endif if((hp2 = gethostbyname(hostname)) == NULL) { #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&hostent_mutex); #endif return h_errno; } memcpy(hp, hp2, sizeof(struct hostent)); #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&hostent_mutex); #endif #endif return 0; } static long nonblock_fcntl(int sock) { #ifdef F_GETFL long fcntl_flags; /* Save fcntl() flags */ fcntl_flags = fcntl(sock, F_GETFL, 0); if(fcntl_flags < 0) cli_warnmsg("nonblock_fcntl: saving: fcntl(%d, F_GETFL): errno=%d: %s\n", sock, errno, strerror(errno)); else if(fcntl(sock, F_SETFL, fcntl_flags | O_NONBLOCK)) cli_warnmsg("nonblock_fcntl: fcntl(%d, F_SETFL, O_NONBLOCK): errno=%d: %s\n", sock, errno, strerror(errno)); return fcntl_flags; #else return 0L; #endif } static void restore_fcntl(int sock, long fcntl_flags) { #ifdef F_SETFL if(fcntl_flags != -1) if(fcntl(sock, F_SETFL, fcntl_flags)) { cli_warnmsg("restore_fcntl: restoring: fcntl(%d, F_SETFL): errno=%d: %s\n", sock, errno, strerror(errno)); } #endif } static int nonblock_connect(int sock, const struct sockaddr *addr, socklen_t 
addrlen, int secs) { /* Max. of unexpected select() failures */ int select_failures = NONBLOCK_SELECT_MAX_FAILURES; /* Max. of useless loops */ int bogus_loops = NONBLOCK_MAX_BOGUS_LOOPS; struct timeval timeout; /* When we should time out */ int numfd; /* Highest fdset fd plus 1 */ /* Calculate into 'timeout' when we should time out */ gettimeofday(&timeout, 0); timeout.tv_sec += secs; /* Launch (possibly) non-blocking connect() request */ if(connect(sock, addr, addrlen)) { int e = errno; cli_dbgmsg("DEBUG nonblock_connect: connect(): fd=%d errno=%d: %s\n", sock, e, strerror(e)); switch (e) { case EALREADY: case EINPROGRESS: break; /* wait for connection */ case EISCONN: return 0; /* connected */ default: cli_warnmsg("nonblock_connect: connect(): fd=%d errno=%d: %s\n", sock, e, strerror(e)); return -1; /* failed */ } } else return connect_error(sock); numfd = sock + 1; /* Highest fdset fd plus 1 */ for (;;) { fd_set fds; struct timeval now; struct timeval wait; int n; /* Force timeout if we ran out of time */ gettimeofday(&now, 0); if (timercmp(&now, &timeout, >)) { cli_warnmsg("connect timing out (%d secs)\n", secs); break; /* failed */ } /* Calculate into 'wait' how long to wait */ timersub(&timeout, &now, &wait); /* wait = timeout - now */ /* Init fds with 'sock' as the only fd */ FD_ZERO(&fds); FD_SET(sock, &fds); n = select(numfd, 0, &fds, 0, &wait); if (n < 0) { cli_warnmsg("nonblock_connect: select() failure %d: errno=%d: %s\n", select_failures, errno, strerror(errno)); if (--select_failures >= 0) continue; /* keep waiting */ break; /* failed */ } cli_dbgmsg("DEBUG nonblock_connect: select = %d\n", n); if(n) return connect_error(sock); /* Select returned, but there is no work to do... 
*/ if (--bogus_loops < 0) { cli_warnmsg("nonblock_connect: giving up due to excessive bogus loops\n"); break; /* failed */ } } /* for loop: keep waiting */ return -1; /* failed */ } static int connect_error(int sock) { int optval; socklen_t optlen; optlen = sizeof(optval); getsockopt(sock, SOL_SOCKET, SO_ERROR, &optval, &optlen); if(optval) cli_warnmsg("connect_error: getsockopt(SO_ERROR): fd=%d error=%d: %s\n", sock, optval, strerror(optval)); return optval ? -1 : 0; } #else static int curl_has_segfaulted; /* * Inspite of numerious bug reports, curl is still buggy :-( * For a fuller explanation, read the long comment at the top, including * the valgrind evidence */ static void curlsegv(int sig) { curl_has_segfaulted = 1; } static void * #ifdef CL_THREAD_SAFE getURL(void *a) #else getURL(struct arg *arg) #endif { FILE *fp; struct curl_slist *headers; #ifdef CL_THREAD_SAFE struct arg *arg = (struct arg *)a; #endif const char *url = arg->url; const char *dir = arg->dir; CURL *curl = arg->curl; const char *filename = arg->filename; char fout[NAME_MAX + 1]; void (*oldsegv)(int); #ifdef CURLOPT_ERRORBUFFER char errorbuffer[CURL_ERROR_SIZE + 1]; #elif (LIBCURL_VERSION_NUM >= 0x070C00) CURLcode res = CURLE_OK; #endif (void)curl_easy_setopt(curl, CURLOPT_USERAGENT, "www.clamav.net"); if(curl_easy_setopt(curl, CURLOPT_URL, url) != 0) { cli_errmsg("%s: curl_easy_setopt failed\n", url); return NULL; } snprintf(fout, sizeof(fout) - 1, "%s/%s", dir, filename); cli_dbgmsg("Saving %s to %s\n", url, fout); fp = fopen(fout, "wb"); if(fp == NULL) { cli_errmsg("Can't open '%s' for writing", fout); return NULL; } #ifdef CURLOPT_WRITEDATA if(curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp) != 0) { fclose(fp); return NULL; } #else if(curl_easy_setopt(curl, CURLOPT_FILE, fp) != 0) { fclose(fp); return NULL; } #endif /* * If an item is in squid's cache get it from there (TCP_HIT/200) * by default curl doesn't (TCP_CLIENT_REFRESH_MISS/200) */ headers = curl_slist_append(NULL, "Pragma:"); 
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); /* These should be customisable */ curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10); #ifdef CURLOPT_MAXFILESIZE curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 50*1024); #endif #ifdef CL_THREAD_SAFE #ifdef CURLOPT_DNS_USE_GLOBAL_CACHE /* Apparently this is depracated */ /*curl_easy_setopt(curl, CURLOPT_DNS_USE_GLOBAL_CACHE, 0);*/ #endif #endif #ifdef CL_THREAD_SAFE #ifdef CURLOPT_NOSIGNAL curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); #endif #endif /* * Prevent password: prompting with older versions * FIXME: a better username? */ curl_easy_setopt(curl, CURLOPT_USERPWD, "username:password"); /* * FIXME: valgrind reports "pthread_mutex_unlock: mutex is not locked" * from gethostbyaddr_r within this. It may be a bug in libcurl * rather than this code, but I need to check, see Curl_resolv() * If pushed really hard it will sometimes say * Conditional jump or move depends on uninitialised value(s) and * quit. But the program seems to work OK without valgrind... * Perhaps Curl_resolv() isn't thread safe? * * I have seen segfaults in version 7.12.3. Version 7.14 seems OK. 
*/ /* * On some C libraries (notably with FC3, glibc-2.3.3-74) you get a * memory leak here in getaddrinfo(), see * https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=139559 */ curl_has_segfaulted = 0; oldsegv = signal(SIGSEGV, curlsegv); #ifdef CURLOPT_ERRORBUFFER curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errorbuffer); if(curl_easy_perform(curl) != CURLE_OK) cli_warnmsg("URL %s failed to download: %s\n", url, errorbuffer); #elif (LIBCURL_VERSION_NUM >= 0x070C00) if((res = curl_easy_perform(curl)) != CURLE_OK) cli_warnmsg("URL %s failed to download: %s\n", url, curl_easy_strerror(res)); #else if(curl_easy_perform(curl) != CURLE_OK) cli_warnmsg("URL %s failed to download\n", url); #endif fclose(fp); curl_slist_free_all(headers); if(curl_has_segfaulted) cli_warnmsg("Libcurl has segfaulted on '%s'\n", url); signal(SIGSEGV, oldsegv); return NULL; } #endif #endif #ifdef HAVE_BACKTRACE static void sigsegv(int sig) { signal(SIGSEGV, SIG_DFL); print_trace(1); exit(SIGSEGV); } static void print_trace(int use_syslog) { void *array[10]; size_t size; char **strings; size_t i; pid_t pid = getpid(); size = backtrace(array, 10); strings = backtrace_symbols(array, size); if(use_syslog == 0) cli_dbgmsg("Backtrace of pid %d:\n", pid); else syslog(LOG_ERR, "Backtrace of pid %d:", pid); for(i = 0; i < size; i++) if(use_syslog) syslog(LOG_ERR, "bt[%u]: %s", i, strings[i]); else cli_dbgmsg("%s\n", strings[i]); /* TODO: dump the current email */ free(strings); } #endif /* See also clamav-milter */ static bool usefulHeader(int commandNumber, const char *cmd) { switch(commandNumber) { case CONTENT_TRANSFER_ENCODING: case CONTENT_DISPOSITION: case CONTENT_TYPE: return TRUE; default: if(strcasecmp(cmd, "From") == 0) return TRUE; if(strcasecmp(cmd, "Received") == 0) return TRUE; if(strcasecmp(cmd, "De") == 0) return TRUE; } return FALSE; } /* * Like fgets but cope with end of line by "\n", "\r\n", "\n\r", "\r" */ static char * getline_from_mbox(char *buffer, size_t len, FILE *fin) { char 
*ret; if(feof(fin)) return NULL; if((len == 0) || (buffer == NULL)) { cli_errmsg("Invalid call to getline_from_mbox(). Refer to http://www.clamav.net/bugs.html#pagestart\n"); return NULL; } ret = buffer; do { int c = getc(fin); if(ferror(fin)) return NULL; switch(c) { case '\n': *buffer++ = '\n'; c = getc(fin); if((c != '\r') && !feof(fin)) ungetc(c, fin); break; default: *buffer++ = (char)c; continue; case EOF: break; case '\r': *buffer++ = '\n'; c = getc(fin); if((c != '\n') && !feof(fin)) ungetc(c, fin); break; } break; } while(--len > 1); if(len == 0) { /* the email probably breaks RFC821 */ cli_warnmsg("getline_from_mbox: buffer overflow stopped, line lost\n"); return NULL; } *buffer = '\0'; if(len == 1) /* overflows will have appeared on separate lines */ cli_dbgmsg("getline_from_mbox: buffer overflow stopped, line recovered\n"); return ret; } /* * Is this line a candidate for the start of a bounce message? */ static bool isBounceStart(const char *line) { if(line == NULL) return FALSE; if(*line == '\0') return FALSE; /*if((strncmp(line, "From ", 5) == 0) && !isalnum(line[5])) return FALSE; if((strncmp(line, ">From ", 6) == 0) && !isalnum(line[6])) return FALSE;*/ if(cli_filetype((const unsigned char *)line, strlen(line)) != CL_TYPE_MAIL) return FALSE; if((strncmp(line, "From ", 5) == 0) || (strncmp(line, ">From ", 6) == 0)) { int numSpaces = 0, numDigits = 0; do if(*line == ' ') numSpaces++; else if(isdigit(*line)) numDigits++; while(*++line != '\0'); if(numSpaces < 6) return FALSE; if(numDigits < 11) return FALSE; } return TRUE; } /* * Extract a binhexEncoded message, return if it's found to be infected as we * extract it */ static bool exportBinhexMessage(const char *dir, message *m) { bool infected = FALSE; fileblob *fb; if(messageGetEncoding(m) == NOENCODING) messageSetEncoding(m, "x-binhex"); fb = messageToFileblob(m, dir, 0); if(fb) { if(fileblobContainsVirus(fb)) infected = TRUE; cli_dbgmsg("Binhex file decoded to %s\n", fileblobGetFilename(fb)); 
fileblobDestroy(fb); } else cli_errmsg("Couldn't decode binhex file to %s\n", dir); return infected; } /* * Locate any bounce message and extract it. Return 1 if anything found */ static int exportBounceMessage(text *start, const mbox_ctx *mctx) { int rc = 0; text *t; fileblob *fb; /* * Attempt to save the original (unbounced) * message - clamscan will find that in the * directory and call us again (with any luck) * having found an e-mail message to handle. * * This finds a lot of false positives, the * search that a content type is in the * bounce (i.e. it's after the bounce header) * helps a bit. * * messageAddLine * optimisation could help here, but needs * careful thought, do it with line numbers * would be best, since the current method in * messageAddLine of checking encoding first * must remain otherwise non bounce messages * won't be scanned */ for(t = start; t; t = t->t_next) { char cmd[RFC2821LENGTH + 1]; const char *txt = lineGetData(t->t_line); if(txt == NULL) continue; if(cli_strtokbuf(txt, 0, ":", cmd) == NULL) continue; switch(tableFind(mctx->rfc821Table, cmd)) { case CONTENT_TRANSFER_ENCODING: if((strstr(txt, "7bit") == NULL) && (strstr(txt, "8bit") == NULL)) break; continue; case CONTENT_DISPOSITION: break; case CONTENT_TYPE: if(strstr(txt, "text/plain") != NULL) t = NULL; break; default: if(strcasecmp(cmd, "From") == 0) start = t; else if(strcasecmp(cmd, "Received") == 0) start = t; continue; } break; } if(t && ((fb = fileblobCreate()) != NULL)) { cli_dbgmsg("Found a bounce message\n"); fileblobSetFilename(fb, mctx->dir, "bounce"); /*fileblobSetCTX(fb, mctx->ctx);*/ if(textToFileblob(start, fb, 1) == NULL) cli_dbgmsg("Nothing new to save in the bounce message\n"); else rc = 1; fileblobDestroy(fb); } else cli_dbgmsg("Not found a bounce message\n"); return rc; } /* * Handle the ith element of a number of multiparts, e.g. 
multipart/alternative */ static message * do_multipart(message *mainMessage, message **messages, int i, int *rc, mbox_ctx *mctx, message *messageIn, text **tptr) { bool addToText = FALSE; const char *dtype; #ifndef SAVE_TO_DISC message *body; #endif message *aMessage = messages[i]; if(aMessage == NULL) return mainMessage; cli_dbgmsg("Mixed message part %d is of type %d\n", i, messageGetMimeType(aMessage)); switch(messageGetMimeType(aMessage)) { case APPLICATION: case AUDIO: case IMAGE: case VIDEO: break; case NOMIME: cli_dbgmsg("No mime headers found in multipart part %d\n", i); if(mainMessage) { if(binhexBegin(aMessage)) { cli_dbgmsg("Found binhex message in multipart/mixed mainMessage\n"); if(exportBinhexMessage(mctx->dir, mainMessage)) *rc = 3; } if(mainMessage != messageIn) messageDestroy(mainMessage); mainMessage = NULL; } else if(aMessage) { if(binhexBegin(aMessage)) { cli_dbgmsg("Found binhex message in multipart/mixed non mime part\n"); if(exportBinhexMessage(mctx->dir, aMessage)) *rc = 3; assert(aMessage == messages[i]); messageReset(messages[i]); } } addToText = TRUE; if(messageGetBody(aMessage) == NULL) /* * No plain text version */ cli_dbgmsg("No plain text alternative\n"); break; case TEXT: dtype = messageGetDispositionType(aMessage); cli_dbgmsg("Mixed message text part disposition \"%s\"\n", dtype); if(strcasecmp(dtype, "attachment") == 0) break; if((*dtype == '\0') || (strcasecmp(dtype, "inline") == 0)) { const char *cptr; if(mainMessage && (mainMessage != messageIn)) messageDestroy(mainMessage); mainMessage = NULL; cptr = messageGetMimeSubtype(aMessage); cli_dbgmsg("Mime subtype \"%s\"\n", cptr); if((tableFind(mctx->subtypeTable, cptr) == PLAIN) && (messageGetEncoding(aMessage) == NOENCODING)) { char *filename; /* * Strictly speaking * a text/plain part is * not an attachment. 
We * pretend it is so that * we can decode and * scan it */ filename = (char *)messageFindArgument(aMessage, "filename"); if(filename == NULL) filename = (char *)messageFindArgument(aMessage, "name"); if(filename == NULL) { cli_dbgmsg("Adding part to main message\n"); addToText = TRUE; } else { cli_dbgmsg("Treating %s as attachment\n", filename); free(filename); } } else { const int is_html = (tableFind(mctx->subtypeTable, cptr) == HTML); if((mctx->ctx->options&CL_SCAN_MAILURL) && is_html) checkURLs(aMessage, mctx, rc, 1); #ifdef CL_EXPERIMENTAL else if(!(mctx->ctx->options&CL_SCAN_NOPHISHING)) checkURLs(aMessage, mctx, rc, is_html); #endif messageAddArgument(aMessage, "filename=mixedtextportion"); } break; } cli_dbgmsg("Text type %s is not supported\n", dtype); return mainMessage; case MESSAGE: /* Content-Type: message/rfc822 */ cli_dbgmsg("Found message inside multipart (encoding type %d)\n", messageGetEncoding(aMessage)); #ifndef SCAN_UNENCODED_BOUNCES switch(messageGetEncoding(aMessage)) { case NOENCODING: case EIGHTBIT: case BINARY: if(encodingLine(aMessage) == NULL) { /* * This means that the message * has no attachments * * The test for * messageGetEncoding is needed * since encodingLine won't have * been set if the message * itself has been encoded */ cli_dbgmsg("Unencoded multipart/message will not be scanned\n"); assert(aMessage == messages[i]); messageDestroy(messages[i]); messages[i] = NULL; return mainMessage; } /* FALLTHROUGH */ default: cli_dbgmsg("Encoded multipart/message will be scanned\n"); } #endif #if 0 messageAddStrAtTop(aMessage, "Received: by clamd (message/rfc822)"); #endif #ifdef SAVE_TO_DISC /* * Save this embedded message * to a temporary file */ saveTextPart(aMessage, mctx->dir, 1); assert(aMessage == messages[i]); messageDestroy(messages[i]); messages[i] = NULL; #else /* * Scan in memory, faster but is open to DoS attacks * when many nested levels are involved. 
*/ body = parseEmailHeaders(aMessage, mctx->rfc821Table, TRUE); /* * We've fininished with the * original copy of the message, * so throw that away and * deal with the encapsulated * message as a message. * This can save a lot of memory */ assert(aMessage == messages[i]); messageDestroy(messages[i]); messages[i] = NULL; if(body) { messageSetCTX(body, ctx); rc = parseEmailBody(body, NULL, mctx); if(messageContainsVirus(body)) *rc = 3; messageDestroy(body); } #endif return mainMessage; case MULTIPART: /* * It's a multi part within a multi part * Run the message parser on this bit, it won't * be an attachment */ cli_dbgmsg("Found multipart inside multipart\n"); if(aMessage) { /* * The headers were parsed when reading in the * whole multipart section */ *rc = parseEmailBody(aMessage, *tptr, mctx); cli_dbgmsg("Finished recursion\n"); assert(aMessage == messages[i]); messageDestroy(messages[i]); messages[i] = NULL; } else { *rc = parseEmailBody(NULL, NULL, mctx); if(mainMessage && (mainMessage != messageIn)) messageDestroy(mainMessage); mainMessage = NULL; } return mainMessage; default: cli_warnmsg("Only text and application attachments are supported, type = %d\n", messageGetMimeType(aMessage)); return mainMessage; } if(addToText) { cli_dbgmsg("Adding to non mime-part\n"); *tptr = textAdd(*tptr, messageGetBody(aMessage)); } else { fileblob *fb = messageToFileblob(aMessage, mctx->dir, 1); if(fb) { if(fileblobContainsVirus(fb)) *rc = 3; fileblobDestroy(fb); } } if(messageContainsVirus(aMessage)) *rc = 3; messageDestroy(aMessage); messages[i] = NULL; return mainMessage; } /* * Returns the number of quote characters in the given string */ static int count_quotes(const char *buf) { int quotes = 0; while(*buf) if(*buf++ == '\"') quotes++; return quotes; } /* * Will the next line be a folded header? 
See RFC2822 section 2.2.3 */
/*
 * t is the header line being looked at, the decision is made about
 * t->t_next.
 * Returns TRUE if that line continues the header in t, either because it
 * starts with white space, or because it contains an '=' and t ends with
 * a ';'
 */
static bool
next_is_folded_header(const text *t)
{
	const text *next;
	const char *data;
	size_t i;

	if(t == NULL)
		return FALSE;

	next = t->t_next;

	if(next == NULL)
		return FALSE;

	if(next->t_line == NULL)
		return FALSE;

	data = lineGetData(next->t_line);

	if(data == NULL)
		return FALSE;

	/*
	 * Section B.2 of RFC822 says TAB or SPACE means a continuation of the
	 * previous entry.
	 *
	 * The cast stops 8 bit characters from being seen as negative
	 */
	if(isblank((unsigned char)data[0]))
		return TRUE;

	if(strchr(data, '=') == NULL)
		/*
		 * Avoid false positives with
		 *	Content-Type: text/html;
		 *	Content-Transfer-Encoding: quoted-printable
		 */
		return FALSE;

	/*
	 * Some are broken and don't fold headers lines
	 * correctly as per section 2.2.3 of RFC2822.
	 * Generally they miss the white space at
	 * the start of the fold line:
	 *	Content-Type: multipart/related;
	 *	type="multipart/alternative";
	 *	boundary="----=_NextPart_000_006A_01C6AC47.348CB550"
	 * should read:
	 *	Content-Type: multipart/related;
	 *		type="multipart/alternative";
	 *		boundary="----=_NextPart_000_006A_01C6AC47.348CB550"
	 * Since we're a virus checker not an RFC
	 * verifier we need to handle these
	 */
	data = lineGetData(t->t_line);

	if(data == NULL)
		/* t->t_line has not been checked, unlike next->t_line */
		return FALSE;

	/*
	 * Look backwards for the last character that isn't white space.
	 * The first character of the line is never looked at.
	 * An index is used so that no pointer is formed before the start of
	 * the string
	 */
	for(i = strlen(data); i > 1; )
		switch(data[--i]) {
			case ';':
				return TRUE;
			case '\n':
			case ' ':
			case '\r':
			case '\t':
				continue;	/* white space at end of line */
			default:
				return FALSE;
		}

	return FALSE;
}