libclamav/filetypes.c
888f5794
 /*
  *  Copyright (C) 2007-2008 Sourcefire, Inc.
  *
  *  Authors: Tomasz Kojm
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  */
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
ad3c01bf
 #include <sys/types.h>
4e9ab8ed
 #ifdef	HAVE_UNISTD_H
 #include <unistd.h>
 #endif
888f5794
 
 #include "clamav.h"
 #include "filetypes.h"
8000d078
 #include "others.h"
 #include "readdb.h"
c6fb0b98
 #include "matcher-ac.h"
bd988961
 #include "str.h"
c8f2d060
 #include "textdet.h"
589d8d8e
 #include "default.h"
888f5794
 
3506ac49
 #include "htmlnorm.h"
 #include "entconv.h"
e21657df
 #include "mpool.h"
3506ac49
 
7021b545
 static const struct ftmap_s {
     const char *name;
     cli_file_t code;
 } ftmap[] = {
c8f2d060
     { "CL_TYPE_TEXT_ASCII",	CL_TYPE_TEXT_ASCII	},
     { "CL_TYPE_TEXT_UTF8",	CL_TYPE_TEXT_UTF8	},
     { "CL_TYPE_TEXT_UTF16LE",	CL_TYPE_TEXT_UTF16LE	},
     { "CL_TYPE_TEXT_UTF16BE",	CL_TYPE_TEXT_UTF16BE	},
     { "CL_TYPE_BINARY_DATA",	CL_TYPE_BINARY_DATA	},
7021b545
     { "CL_TYPE_IGNORED",	CL_TYPE_IGNORED		},
55094a9c
     { "CL_TYPE_ANY",		CL_TYPE_ANY		},
7021b545
     { "CL_TYPE_MSEXE",		CL_TYPE_MSEXE		},
     { "CL_TYPE_ELF",		CL_TYPE_ELF		},
89c14869
     { "CL_TYPE_MACHO",		CL_TYPE_MACHO		},
3222a096
     { "CL_TYPE_MACHO_UNIBIN",	CL_TYPE_MACHO_UNIBIN	},
7021b545
     { "CL_TYPE_POSIX_TAR",	CL_TYPE_POSIX_TAR	},
     { "CL_TYPE_OLD_TAR",	CL_TYPE_OLD_TAR		},
75e46945
     { "CL_TYPE_CPIO_OLD",	CL_TYPE_CPIO_OLD	},
     { "CL_TYPE_CPIO_ODC",	CL_TYPE_CPIO_ODC	},
     { "CL_TYPE_CPIO_NEWC",	CL_TYPE_CPIO_NEWC	},
     { "CL_TYPE_CPIO_CRC",	CL_TYPE_CPIO_CRC	},
7021b545
     { "CL_TYPE_GZ",		CL_TYPE_GZ		},
     { "CL_TYPE_ZIP",		CL_TYPE_ZIP		},
     { "CL_TYPE_BZ",		CL_TYPE_BZ		},
     { "CL_TYPE_RAR",		CL_TYPE_RAR		},
     { "CL_TYPE_ARJ",		CL_TYPE_ARJ		},
     { "CL_TYPE_MSSZDD",		CL_TYPE_MSSZDD		},
     { "CL_TYPE_MSOLE2",		CL_TYPE_MSOLE2		},
     { "CL_TYPE_MSCAB",		CL_TYPE_MSCAB		},
     { "CL_TYPE_MSCHM",		CL_TYPE_MSCHM		},
     { "CL_TYPE_SIS",		CL_TYPE_SIS		},
     { "CL_TYPE_SCRENC",		CL_TYPE_SCRENC		},
     { "CL_TYPE_GRAPHICS",	CL_TYPE_GRAPHICS	},
     { "CL_TYPE_RIFF",		CL_TYPE_RIFF		},
     { "CL_TYPE_BINHEX",		CL_TYPE_BINHEX		},
     { "CL_TYPE_TNEF",		CL_TYPE_TNEF		},
     { "CL_TYPE_CRYPTFF",	CL_TYPE_CRYPTFF		},
     { "CL_TYPE_PDF",		CL_TYPE_PDF		},
     { "CL_TYPE_UUENCODED",	CL_TYPE_UUENCODED	},
     { "CL_TYPE_HTML_UTF16",	CL_TYPE_HTML_UTF16	},
015ce4a8
     { "CL_TYPE_SCRIPT",         CL_TYPE_SCRIPT          },
7021b545
     { "CL_TYPE_RTF",		CL_TYPE_RTF		},
     { "CL_TYPE_HTML",		CL_TYPE_HTML		},
     { "CL_TYPE_MAIL",		CL_TYPE_MAIL		},
     { "CL_TYPE_SFX",		CL_TYPE_SFX		},
     { "CL_TYPE_ZIPSFX",		CL_TYPE_ZIPSFX		},
     { "CL_TYPE_RARSFX",		CL_TYPE_RARSFX		},
     { "CL_TYPE_CABSFX",		CL_TYPE_CABSFX		},
     { "CL_TYPE_ARJSFX",		CL_TYPE_ARJSFX		},
     { "CL_TYPE_NULSFT",		CL_TYPE_NULSFT		},
     { "CL_TYPE_AUTOIT",		CL_TYPE_AUTOIT		},
cadaa703
     { "CL_TYPE_ISHIELD_MSI",	CL_TYPE_ISHIELD_MSI	},
81fded11
     { "CL_TYPE_7Z",		CL_TYPE_7Z		},
c8f2d060
     { NULL,			CL_TYPE_IGNORED		}
888f5794
 };
 
7021b545
 cli_file_t cli_ftcode(const char *name)
 {
 	unsigned int i;
888f5794
 
7021b545
     for(i = 0; ftmap[i].name; i++)
 	if(!strcmp(ftmap[i].name, name))
 	    return ftmap[i].code;
888f5794
 
7021b545
     return CL_TYPE_ERROR;
 }
888f5794
 
0d9dbdef
 void cli_ftfree(const struct cl_engine *engine)
7021b545
 {
0d9dbdef
 	struct cli_ftype *ftypes=engine->ftypes, *pt;
7021b545
 
     while(ftypes) {
 	pt = ftypes;
 	ftypes = ftypes->next;
47d40feb
 	mpool_free(engine->mempool, pt->magic);
 	mpool_free(engine->mempool, pt->tname);
 	mpool_free(engine->mempool, pt);
7021b545
     }
 }
e88f97f3
 
7021b545
 cli_file_t cli_filetype(const unsigned char *buf, size_t buflen, const struct cl_engine *engine)
888f5794
 {
7021b545
 	struct cli_ftype *ftype = engine->ftypes;
216a697f
 
888f5794
 
7021b545
     while(ftype) {
 	if(ftype->offset + ftype->length <= buflen) {
 	    if(!memcmp(buf + ftype->offset, ftype->magic, ftype->length)) {
 		cli_dbgmsg("Recognized %s file\n", ftype->tname);
 		return ftype->type;
888f5794
 	    }
 	}
7021b545
 	ftype = ftype->next;
888f5794
     }
 
c8f2d060
     return cli_texttype(buf, buflen);
888f5794
 }
 
e12c29d2
 /* Tar detector; not defined in this part of the file.  cli_filetype2()
  * treats a return value of 1 as an old-style tar archive and 2 as a POSIX
  * tar archive; any other value is ignored there. */
 int is_tar(unsigned char *buf, unsigned int nbytes);
a7f5fd00
 
49cc1e3c
 /*
  * Determine the file type of the data behind a file map.
  *
  * At most MAGIC_BUFFER_SIZE bytes from the start of the map are examined:
  *   1. cli_filetype() is tried first.
  *   2. A CL_TYPE_BINARY_DATA verdict is re-checked with is_tar().
  *   3. For verdicts between CL_TYPE_TEXT_ASCII and CL_TYPE_BINARY_DATA the
  *      buffer is scanned with engine->root[0] in AC_SCAN_FT mode.  If that
  *      yields no type, the scan is repeated on a cli_utf16toascii() copy
  *      (an HTML hit gives CL_TYPE_HTML_UTF16) and, when
  *      PHISHING_CONF_ENTCONV is enabled and a BOM identifies the encoding,
  *      on an ASCII-normalized copy (an HTML hit gives CL_TYPE_HTML).
  *
  * Returns CL_TYPE_ERROR when engine or map is NULL or when the leading
  * bytes cannot be obtained from the map.  If setting up the matcher data
  * fails, the verdict reached so far is returned.
  */
 cli_file_t cli_filetype2(fmap_t *map, const struct cl_engine *engine)
 {
     unsigned char *buff, *decoded;
     int bread, sret;
     cli_file_t ret = CL_TYPE_BINARY_DATA;
     struct cli_matcher *root;
     struct cli_ac_data mdata;

     if(!engine) {
         cli_errmsg("cli_filetype2: engine == NULL\n");
         return CL_TYPE_ERROR;
     }

     /* validate map before reading map->len */
     if(!map) {
         cli_errmsg("cli_filetype2: map == NULL\n");
         return CL_TYPE_ERROR;
     }

     bread = MIN(map->len, MAGIC_BUFFER_SIZE);

     buff = fmap_need_off_once(map, 0, bread);
     if(!buff)
         return CL_TYPE_ERROR;

     ret = cli_filetype(buff, bread, engine);

     if(ret == CL_TYPE_BINARY_DATA) {
         switch(is_tar(buff, bread)) {
             case 1:
                 cli_dbgmsg("Recognized old fashioned tar file\n");
                 return CL_TYPE_OLD_TAR;
             case 2:
                 cli_dbgmsg("Recognized POSIX tar file\n");
                 return CL_TYPE_POSIX_TAR;
         }
     }

     if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
         /* HTML files may contain special characters and could be
          * misidentified as BINARY_DATA by cli_filetype()
          */
         root = engine->root[0];
         if(!root)
             return ret;

         if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
             return ret;

         sret = cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, root, &mdata, 0, ret, NULL, AC_SCAN_FT, NULL);

         cli_ac_freedata(&mdata);

         if(sret >= CL_TYPENO) {
             /* NOTE(review): results >= CL_TYPENO are presumably file type
              * codes reported by the matcher - confirm against matcher-ac */
             ret = sret;
         } else {
             if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                 return ret;

             decoded = (unsigned char *) cli_utf16toascii((char *) buff, bread);
             if(decoded) {
                 /* the decoded copy is scanned with half the input length */
                 sret = cli_ac_scanbuff(decoded, bread / 2, NULL, NULL, NULL, root, &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL);
                 free(decoded);
                 if(sret == CL_TYPE_HTML)
                     ret = CL_TYPE_HTML_UTF16;
             }
             cli_ac_freedata(&mdata);

             if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
                 const char* encoding;

                 /* check if we can autodetect this encoding.
                  * If we can't don't try to detect HTML sig, since
                  * we just tried that above, and failed */
                 if((encoding = encoding_detect_bom(buff, bread))) {
                     unsigned char decodedbuff[(MAGIC_BUFFER_SIZE+1)*2];
                     m_area_t in_area, out_area;

                     memset(decodedbuff, 0, sizeof(decodedbuff));

                     in_area.buffer = (unsigned char *) buff;
                     in_area.length = bread;
                     in_area.offset = 0;
                     out_area.buffer = decodedbuff;
                     out_area.length = sizeof(decodedbuff);
                     out_area.offset = 0;

                     /* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode
                      * (multibyte characters will not be exactly handled, but that is not a problem).
                      * However when detecting whether a file is HTML or not, we need exact conversion.
                      * (just eliminating zeros and matching would introduce false positives) */
                     if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
                         if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                             return ret;

                         sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, root, &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
                         if(sret == CL_TYPE_HTML) {
                             cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
                             /* htmlnorm is able to handle any unicode now, since it skips null chars */
                             ret = CL_TYPE_HTML;
                         }

                         cli_ac_freedata(&mdata);
                     }
                 }
             }
         }
     }

     return ret;
 }