libclamav/filetypes.c
888f5794
 /*
fd08b8fa
  *  Copyright (C) 2002 - 2005 Tomasz Kojm <tkojm@clamav.net>
888f5794
  *  With enhancements from Thomas Lamy <Thomas.Lamy@in-online.net>
  *
  *  This program is free software; you can redistribute it and/or modify
bb34cb31
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
888f5794
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
48b7b4a7
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
888f5794
  */
 
 #if HAVE_CONFIG_H
 #include "clamav-config.h"
 #endif
 
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
216a697f
 #include <ctype.h>
ad3c01bf
 #include <sys/types.h>
4e9ab8ed
 #ifdef	HAVE_UNISTD_H
 #include <unistd.h>
 #endif
888f5794
 
 #include "clamav.h"
 #include "filetypes.h"
8000d078
 #include "others.h"
 #include "readdb.h"
c6fb0b98
 #include "matcher-ac.h"
bd988961
 #include "str.h"
888f5794
 
3506ac49
 #include "htmlnorm.h"
 #include "entconv.h"
 
888f5794
 struct cli_magic_s {
a85bd2ac
     size_t offset;
888f5794
     const char *magic;
     size_t length;
     const char *descr;
     cli_file_t type;
 };
 
 struct cli_smagic_s {
     const char *sig;
     const char *descr;
     cli_file_t type;
 };
 
 static const struct cli_magic_s cli_magic[] = {
 
     /* Executables */
 
3805ebcb
     {0,  "MZ",				2,  "DOS/W32 executable/library/driver", CL_TYPE_MSEXE},
094116b2
     {0,	 "\177ELF",			4,  "ELF",		CL_TYPE_ELF},
2c6f9d57
 
888f5794
     /* Archives */
 
3805ebcb
     {0,	    "Rar!",			4,  "RAR",		CL_TYPE_RAR},
     {0,	    "PK\003\004",		4,  "ZIP",		CL_TYPE_ZIP},
c7da05f9
     {0,	    "PK00PK\003\004",		8,  "ZIP",		CL_TYPE_ZIP},
3805ebcb
     {0,	    "\037\213",			2,  "GZip",		CL_TYPE_GZ},
     {0,	    "BZh",			3,  "BZip",		CL_TYPE_BZ},
     {0,	    "SZDD",			4,  "compress.exe'd",	CL_TYPE_MSSZDD},
     {0,	    "MSCF",			4,  "MS CAB",		CL_TYPE_MSCAB},
     {0,	    "ITSF",			4,  "MS CHM",           CL_TYPE_MSCHM},
bf45bf13
     {8,	    "\x19\x04\x00\x10",		4,  "SIS",		CL_TYPE_SIS},
3805ebcb
     {0,     "#@~^",			4,  "SCRENC",		CL_TYPE_SCRENC},
667ab9c6
     {0,     "(This file must be converted with BinHex 4.0)",
 				       45, "BinHex",		CL_TYPE_BINHEX},
888f5794
 
     /* Mail */
 
3805ebcb
     {0,  "From ",			 5, "MBox",		  CL_TYPE_MAIL},
     {0,  "Received: ",			10, "Raw mail",		  CL_TYPE_MAIL},
     {0,  "Return-Path: ",		13, "Maildir",		  CL_TYPE_MAIL},
     {0,  "Return-path: ",		13, "Maildir",		  CL_TYPE_MAIL},
     {0,  "Delivered-To: ",		14, "Mail",		  CL_TYPE_MAIL},
     {0,  "X-UIDL: ",			 8, "Mail",		  CL_TYPE_MAIL},
     {0,  "X-Apparently-To: ",		17, "Mail",		  CL_TYPE_MAIL},
     {0,  "X-Envelope-From: ",		17, "Mail",		  CL_TYPE_MAIL},
     {0,  "X-Original-To: ",		15, "Mail",		  CL_TYPE_MAIL},
     {0,  "X-Symantec-",			11, "Symantec",		  CL_TYPE_MAIL},
     {0,  "X-EVS",			 5, "EVS mail",		  CL_TYPE_MAIL},
     {0,  "X-Real-To: ",                 11, "Mail",               CL_TYPE_MAIL},
fd08b8fa
     {0,  "X-Sieve: ",			 9, "Mail",		  CL_TYPE_MAIL},
3805ebcb
     {0,  ">From ",			 6, "Mail",		  CL_TYPE_MAIL},
     {0,  "Date: ",			 6, "Mail",		  CL_TYPE_MAIL},
     {0,  "Message-Id: ",		12, "Mail",		  CL_TYPE_MAIL},
     {0,  "Message-ID: ",		12, "Mail",		  CL_TYPE_MAIL},
     {0,  "Envelope-to: ",		13, "Mail",		  CL_TYPE_MAIL},
     {0,  "Delivery-date: ",		15, "Mail",		  CL_TYPE_MAIL},
     {0,  "To: ",			 4, "Mail",		  CL_TYPE_MAIL},
     {0,  "Subject: ",			 9, "Mail",		  CL_TYPE_MAIL},
     {0,  "For: ",			 5, "Eserv mail",	  CL_TYPE_MAIL},
     {0,  "From: ",			 6, "Exim mail",	  CL_TYPE_MAIL},
     {0,  "v:\015\012Received: ",	14, "VPOP3 Mail (DOS)",	  CL_TYPE_MAIL},
     {0,  "v:\012Received: ",		13, "VPOP3 Mail (UNIX)",  CL_TYPE_MAIL},
     {0,  "Hi. This is the qmail-send",  26, "Qmail bounce",	  CL_TYPE_MAIL},
2207a205
     {0,  "\170\237\076\042",		 4, "TNEF",               CL_TYPE_TNEF},
888f5794
 
3953039b
     {0,  "begin ",			6,  "UUencoded",	  CL_TYPE_UUENCODED},
24683a2a
     {0, "\041\102\104\116",		4, "PST",		  CL_TYPE_PST},
3953039b
 
7ec67e94
     /* Graphics (may contain exploits against MS systems) */
 
     {0,  "GIF",				 3, "GIF",	    CL_TYPE_GRAPHICS},
     {0,  "BM",				 2, "BMP",          CL_TYPE_GRAPHICS},
c4f43f56
     {0,  "\377\330\377",		 3, "JPEG",         CL_TYPE_GRAPHICS},
7ec67e94
     {6,  "JFIF",			 4, "JPEG",         CL_TYPE_GRAPHICS},
     {6,  "Exif",			 4, "JPEG",         CL_TYPE_GRAPHICS},
     {0,  "\x89PNG",			 4, "PNG",          CL_TYPE_GRAPHICS},
eb308794
     {0,  "RIFF",                         4, "RIFF",         CL_TYPE_RIFF},
     {0,  "RIFX",                         4, "RIFX",         CL_TYPE_RIFF},
7ec67e94
 
f87214c3
     /* Others */
 
     {0,  "\320\317\021\340\241\261\032\341", 8, "OLE2 container", CL_TYPE_MSOLE2},
bda5598b
     {0,  "%PDF-",			 5, "PDF document", CL_TYPE_PDF},
2c6f9d57
     {0,  "\266\271\254\256\376\377\377\377", 8, "CryptFF", CL_TYPE_CRYPTFF},
52c2a8bd
     {0,  "{\\rtf",                           5, "RTF", CL_TYPE_RTF}, 
f7efc022
 
888f5794
     /* Ignored types */
 
3805ebcb
     {0,  "\000\000\001\263",             4, "MPEG video stream",  CL_TYPE_DATA},
     {0,  "\000\000\001\272",             4, "MPEG sys stream",    CL_TYPE_DATA},
     {0,  "OggS",                         4, "Ogg Stream",         CL_TYPE_DATA},
     {0,  "ID3",				 3, "MP3",		  CL_TYPE_DATA},
     {0,  "\377\373\220",		 3, "MP3",		  CL_TYPE_DATA},
bda5598b
     {0,  "%!PS-Adobe-",			11, "PostScript",	  CL_TYPE_DATA},
3805ebcb
     {0,  "\060\046\262\165\216\146\317", 7, "WMA/WMV/ASF",	  CL_TYPE_DATA},
     {0,  ".RMF" ,			 4, "Real Media File",	  CL_TYPE_DATA},
 
a85bd2ac
     {0, NULL,				 0, NULL,		  CL_TYPE_UNKNOWN_DATA}
888f5794
 };
 
 static const struct cli_smagic_s cli_smagic[] = {
 
8c290294
     /* "\nFrom: " * "\nContent-Type: " */
3805ebcb
     {"0a46726f6d3a20{-2048}0a436f6e74656e742d547970653a20", "Mail file", CL_TYPE_MAIL},
e0549aa0
 
8c290294
     /* "\nReceived: " * "\nContent-Type: " */
3805ebcb
     {"0a52656365697665643a20{-2048}0a436f6e74656e742d547970653a20", "Mail file", CL_TYPE_MAIL},
ece009c0
 
e0549aa0
     /* "\nReceived: " * "\nContent-type: " */
3805ebcb
     {"0a52656365697665643a20{-2048}0a436f6e74656e742d747970653a20", "Mail file", CL_TYPE_MAIL},
e0549aa0
 
2e22ced8
     /* "MIME-Version: " * "\nContent-Type: " */
     {"4d494d452d56657273696f6e3a20{-2048}0a436f6e74656e742d547970653a20", "Mail file", CL_TYPE_MAIL},
 
5ae8bdd5
     /* remember the matcher is case sensitive */
3805ebcb
     {"3c62723e",       "HTML data", CL_TYPE_HTML},	/* <br> */
     {"3c42723e",       "HTML data", CL_TYPE_HTML},	/* <Br> */
     {"3c42523e",       "HTML data", CL_TYPE_HTML},	/* <BR> */
     {"3c703e",	       "HTML data", CL_TYPE_HTML},	/* <p> */
     {"3c503e",	       "HTML data", CL_TYPE_HTML},	/* <P> */
     {"68726566",       "HTML data", CL_TYPE_HTML},	/* href */
     {"48726566",       "HTML data", CL_TYPE_HTML},	/* Href */
     {"48524546",       "HTML data", CL_TYPE_HTML},	/* HREF */
2e16f7df
     {"3c68746d6c3e",   "HTML data", CL_TYPE_HTML},      /* <html> */
     {"3c48544d4c3e",   "HTML data", CL_TYPE_HTML},      /* <HTML> */
     {"3c48746d6c3e",   "HTML data", CL_TYPE_HTML},      /* <Html> */
d5d241bb
     {"3c686561643e",   "HTML data", CL_TYPE_HTML},      /* <head> */
     {"3c484541443e",   "HTML data", CL_TYPE_HTML},      /* <HEAD> */
     {"3c486561643e",   "HTML data", CL_TYPE_HTML},      /* <Head> */
3805ebcb
     {"3c666f6e74",     "HTML data", CL_TYPE_HTML},	/* <font */
     {"3c466f6e74",     "HTML data", CL_TYPE_HTML},	/* <Font */
     {"3c464f4e54",     "HTML data", CL_TYPE_HTML},	/* <FONT */
2e16f7df
     {"3c696d67",       "HTML data", CL_TYPE_HTML},      /* <img */
     {"3c494d47",       "HTML data", CL_TYPE_HTML},      /* <IMG */
     {"3c496d67",       "HTML data", CL_TYPE_HTML},      /* <Img */
3805ebcb
     {"3c736372697074", "HTML data", CL_TYPE_HTML},	/* <script */
     {"3c536372697074", "HTML data", CL_TYPE_HTML},	/* <Script */
     {"3c534352495054", "HTML data", CL_TYPE_HTML},	/* <SCRIPT */
d32309fd
     {"3c6f626a656374", "HTML data", CL_TYPE_HTML},      /* <object */
     {"3c4f626a656374", "HTML data", CL_TYPE_HTML},      /* <Object */
     {"3c4f424a454354", "HTML data", CL_TYPE_HTML},      /* <OBJECT */
     {"3c696672616d65", "HTML data", CL_TYPE_HTML},      /* <iframe */
     {"3c494652414d45", "HTML data", CL_TYPE_HTML},      /* <IFRAME */
188ce080
     {"3c7461626c65",   "HTML data", CL_TYPE_HTML},	/* <table */
     {"3c5441424c45",   "HTML data", CL_TYPE_HTML},	/* <TABLE */
3805ebcb
 
978e3d68
     {"526172211a0700", "RAR-SFX", CL_TYPE_RARSFX},
8bf021e6
     {"504b0304", "ZIP-SFX", CL_TYPE_ZIPSFX},
d8a5c616
     {"4d534346", "CAB-SFX", CL_TYPE_CABSFX},
faaf436a
     {"efbeadde4e756c6c736f6674496e7374", "NSIS", CL_TYPE_NULSFT},
978e3d68
 
ee99255a
     {"4d5a{180-300}50450000", "PE", CL_TYPE_MSEXE},
 
3805ebcb
     {NULL,  NULL,   CL_TYPE_UNKNOWN_DATA}
888f5794
 };
 
e88f97f3
 static char internat[256] = {
     /* TODO: Remember to buy a beer to Joerg Wunsch <joerg@FreeBSD.ORG> */
     0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,  /* 0x0X */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,  /* 0x1X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x2X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x3X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x4X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x5X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x6X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  /* 0x7X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x8X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x9X */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xaX */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xbX */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xcX */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xdX */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xeX */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   /* 0xfX */
 };
 
b58fdfc2
 cli_file_t cli_filetype(const unsigned char *buf, size_t buflen)
888f5794
 {
bd988961
 	int i, text = 1, len;
216a697f
 
888f5794
 
     for(i = 0; cli_magic[i].magic; i++) {
 	if(buflen >= cli_magic[i].offset+cli_magic[i].length) {
 	    if(memcmp(buf+cli_magic[i].offset, cli_magic[i].magic, cli_magic[i].length) == 0) {
 		cli_dbgmsg("Recognized %s file\n", cli_magic[i].descr);
 		return cli_magic[i].type;
 	    }
 	}
     }
 
9b0b9fb9
 /* improve or drop this code
  * https://wwws.clamav.net/bugzilla/show_bug.cgi?id=373
  *
7d668f62
     buflen < 25 ? (len = buflen) : (len = 25);
     for(i = 0; i < len; i++)
e88f97f3
 	if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) {
bd988961
 	    text = 0;
216a697f
 	    break;
 	}
9b0b9fb9
 */
bd988961
     return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
888f5794
 }
 
e12c29d2
 int is_tar(unsigned char *buf, unsigned int nbytes);
a7f5fd00
 
bd988961
 cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
a7f5fd00
 {
4e9ab8ed
 	unsigned char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded, *bigbuff;
bd988961
 	int bread, sret;
a7f5fd00
 	cli_file_t ret = CL_TYPE_UNKNOWN_DATA;
bd988961
 	struct cli_matcher *root;
4e9ab8ed
 	struct cli_ac_data mdata;
a7f5fd00
 
 
     memset(smallbuff, 0, sizeof(smallbuff));
     if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0)
 	ret = cli_filetype(smallbuff, bread);
 
bd988961
     if(engine && ret == CL_TYPE_UNKNOWN_TEXT) {
 	root = engine->root[0];
 	if(!root)
 	    return ret;
 
4e9ab8ed
 	if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
bd988961
 	    return ret;
 
4e9ab8ed
 	sret = cli_ac_scanbuff(smallbuff, bread, NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
 
 	cli_ac_freedata(&mdata);
bd988961
 
 	if(sret >= CL_TYPENO) {
 	    ret = sret;
 	} else {
4e9ab8ed
 	    if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
 		return ret;
 
bda5598b
 	    decoded = (unsigned char *) cli_utf16toascii((char *) smallbuff, bread);
bd988961
 	    if(decoded) {
bda5598b
 		sret = cli_ac_scanbuff(decoded, strlen((char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
bd988961
 		free(decoded);
 		if(sret == CL_TYPE_HTML)
 		    ret = CL_TYPE_HTML_UTF16;
 	    }
4e9ab8ed
 	    cli_ac_freedata(&mdata);
3506ac49
 
692bda68
 	    if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
3506ac49
 		    struct entity_conv conv;
 		    const size_t conv_size = 2*bread < 256 ? 256 : 2*bread;
 
 		if(init_entity_converter(&conv,UNKNOWN,conv_size) == 0) {
 			int end = 0;
 			m_area_t area;
 			area.buffer = (unsigned char *) smallbuff;
 			area.length = bread;
 			area.offset = 0;
 
 		    while(!end) {
 			if(cli_ac_initdata(&mdata, root->ac_partsigs, AC_DEFAULT_TRACKLEN))
 			    return ret;
 
fc83da82
 			decoded =  encoding_norm_readline(&conv, NULL, &area, bread);
3506ac49
 
 			if(decoded) {
fc83da82
 			    sret = cli_ac_scanbuff(decoded, strlen((const char *) decoded), NULL, engine->root[0], &mdata, 1, 0, 0, -1, NULL);
3506ac49
 			    free(decoded);
 			    if(sret == CL_TYPE_HTML) {
 				ret = CL_TYPE_HTML;
 				end = 1;
 			    }
 			} else
 			    end = 1;
 
 			cli_ac_freedata(&mdata);
 		    }
 
 		    entity_norm_done(&conv);
 
 		} else {
 		    cli_warnmsg("cli_filetype2: Error initializing entity converter\n");
 		}
 	    }
bd988961
 	}
     }
 
a7f5fd00
     if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
 
acb87f87
 	if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char))))
a7f5fd00
 	    return ret;
 
 	lseek(desc, 0, SEEK_SET);
acb87f87
 	if((bread = read(desc, bigbuff, 37638)) > 0) {
a7f5fd00
 
 	    bigbuff[bread] = 0;
 
 	    switch(is_tar(bigbuff, bread)) {
 		case 1:
 		    ret = CL_TYPE_OLD_TAR;
 		    cli_dbgmsg("Recognized old fashioned tar file\n");
 		    break;
 		case 2:
 		    ret = CL_TYPE_POSIX_TAR;
 		    cli_dbgmsg("Recognized POSIX tar file\n");
 		    break;
 	    }
 	}
 
acb87f87
 	if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
 
 	    if(!memcmp(bigbuff + 32769, "CD001" , 5) || !memcmp(bigbuff + 37633, "CD001" , 5)) {
 		cli_dbgmsg("Recognized ISO 9660 CD-ROM data\n");
 		ret = CL_TYPE_DATA;
 	    } else if(!memcmp(bigbuff + 32776, "CDROM" , 5)) {
 		cli_dbgmsg("Recognized High Sierra CD-ROM data\n");
 		ret = CL_TYPE_DATA;
 	    }
 	}
 
a7f5fd00
 	free(bigbuff);
     }
 
     return ret;
 }
 
5612732c
 int cli_addtypesigs(struct cl_engine *engine)
888f5794
 {
 	int i, ret;
5612732c
 	struct cli_matcher *root;
 
592d88c0
 
     if(!engine->root[0]) {
 	cli_dbgmsg("cli_addtypesigs: Need to allocate AC trie in engine->root[0]\n");
 	root = engine->root[0] = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher));
 	if(!root) {
 	    cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
 	    return CL_EMEM;
 	}
f7470773
 
fbcef1b0
 	if((ret = cli_ac_init(root, AC_DEFAULT_MIN_DEPTH, AC_DEFAULT_MAX_DEPTH))) {
592d88c0
 	    /* No need to free previously allocated memory here - all engine
 	     * elements will be properly freed by cl_free()
 	     */
fbcef1b0
 	    cli_errmsg("cli_addtypesigs: Can't initialise AC pattern matcher\n");
 	    return ret;
592d88c0
 	}
     } else {
 	root = engine->root[0];
     }
888f5794
 
     for(i = 0; cli_smagic[i].sig; i++) {
b68d11d2
 	if((ret = cli_parse_add(root, cli_smagic[i].descr, cli_smagic[i].sig, cli_smagic[i].type, NULL, 0))) {
592d88c0
 	    cli_errmsg("cli_addtypesigs: Problem adding signature for %s\n", cli_smagic[i].descr);
888f5794
 	    return ret;
 	}
     }
 
     return 0;
 }