888f5794 |
/*
 *  Copyright (C) 2007-2008 Sourcefire, Inc.
 *
 *  Authors: Tomasz Kojm
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h> |
ad3c01bf |
#include <sys/types.h> |
4e9ab8ed |
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif |
888f5794 |
#include "clamav.h"
#include "filetypes.h" |
8000d078 |
#include "others.h"
#include "readdb.h" |
c6fb0b98 |
#include "matcher-ac.h" |
bd988961 |
#include "str.h" |
c8f2d060 |
#include "textdet.h" |
589d8d8e |
#include "default.h" |
888f5794 |
|
3506ac49 |
#include "htmlnorm.h"
#include "entconv.h" |
e21657df |
#include "mpool.h" |
3506ac49 |
|
7021b545 |
static const struct ftmap_s {
const char *name;
cli_file_t code;
} ftmap[] = { |
c8f2d060 |
{ "CL_TYPE_TEXT_ASCII", CL_TYPE_TEXT_ASCII },
{ "CL_TYPE_TEXT_UTF8", CL_TYPE_TEXT_UTF8 },
{ "CL_TYPE_TEXT_UTF16LE", CL_TYPE_TEXT_UTF16LE },
{ "CL_TYPE_TEXT_UTF16BE", CL_TYPE_TEXT_UTF16BE },
{ "CL_TYPE_BINARY_DATA", CL_TYPE_BINARY_DATA }, |
7021b545 |
{ "CL_TYPE_IGNORED", CL_TYPE_IGNORED }, |
6038397e |
{ "CL_TYPE_ANY", 0 }, /* for ft-sigs */ |
7021b545 |
{ "CL_TYPE_MSEXE", CL_TYPE_MSEXE },
{ "CL_TYPE_ELF", CL_TYPE_ELF }, |
89c14869 |
{ "CL_TYPE_MACHO", CL_TYPE_MACHO }, |
3222a096 |
{ "CL_TYPE_MACHO_UNIBIN", CL_TYPE_MACHO_UNIBIN }, |
7021b545 |
{ "CL_TYPE_POSIX_TAR", CL_TYPE_POSIX_TAR },
{ "CL_TYPE_OLD_TAR", CL_TYPE_OLD_TAR }, |
75e46945 |
{ "CL_TYPE_CPIO_OLD", CL_TYPE_CPIO_OLD },
{ "CL_TYPE_CPIO_ODC", CL_TYPE_CPIO_ODC },
{ "CL_TYPE_CPIO_NEWC", CL_TYPE_CPIO_NEWC },
{ "CL_TYPE_CPIO_CRC", CL_TYPE_CPIO_CRC }, |
7021b545 |
{ "CL_TYPE_GZ", CL_TYPE_GZ },
{ "CL_TYPE_ZIP", CL_TYPE_ZIP },
{ "CL_TYPE_BZ", CL_TYPE_BZ },
{ "CL_TYPE_RAR", CL_TYPE_RAR },
{ "CL_TYPE_ARJ", CL_TYPE_ARJ },
{ "CL_TYPE_MSSZDD", CL_TYPE_MSSZDD },
{ "CL_TYPE_MSOLE2", CL_TYPE_MSOLE2 },
{ "CL_TYPE_MSCAB", CL_TYPE_MSCAB },
{ "CL_TYPE_MSCHM", CL_TYPE_MSCHM },
{ "CL_TYPE_SIS", CL_TYPE_SIS },
{ "CL_TYPE_SCRENC", CL_TYPE_SCRENC },
{ "CL_TYPE_GRAPHICS", CL_TYPE_GRAPHICS },
{ "CL_TYPE_RIFF", CL_TYPE_RIFF },
{ "CL_TYPE_BINHEX", CL_TYPE_BINHEX },
{ "CL_TYPE_TNEF", CL_TYPE_TNEF },
{ "CL_TYPE_CRYPTFF", CL_TYPE_CRYPTFF },
{ "CL_TYPE_PDF", CL_TYPE_PDF },
{ "CL_TYPE_UUENCODED", CL_TYPE_UUENCODED },
{ "CL_TYPE_HTML_UTF16", CL_TYPE_HTML_UTF16 }, |
015ce4a8 |
{ "CL_TYPE_SCRIPT", CL_TYPE_SCRIPT }, |
7021b545 |
{ "CL_TYPE_RTF", CL_TYPE_RTF },
{ "CL_TYPE_HTML", CL_TYPE_HTML },
{ "CL_TYPE_MAIL", CL_TYPE_MAIL },
{ "CL_TYPE_SFX", CL_TYPE_SFX },
{ "CL_TYPE_ZIPSFX", CL_TYPE_ZIPSFX },
{ "CL_TYPE_RARSFX", CL_TYPE_RARSFX },
{ "CL_TYPE_CABSFX", CL_TYPE_CABSFX },
{ "CL_TYPE_ARJSFX", CL_TYPE_ARJSFX },
{ "CL_TYPE_NULSFT", CL_TYPE_NULSFT },
{ "CL_TYPE_AUTOIT", CL_TYPE_AUTOIT }, |
cadaa703 |
{ "CL_TYPE_ISHIELD_MSI", CL_TYPE_ISHIELD_MSI }, |
81fded11 |
{ "CL_TYPE_7Z", CL_TYPE_7Z }, |
c8f2d060 |
{ NULL, CL_TYPE_IGNORED } |
888f5794 |
};
|
7021b545 |
cli_file_t cli_ftcode(const char *name)
{
unsigned int i; |
888f5794 |
|
7021b545 |
for(i = 0; ftmap[i].name; i++)
if(!strcmp(ftmap[i].name, name))
return ftmap[i].code; |
888f5794 |
|
7021b545 |
return CL_TYPE_ERROR;
} |
888f5794 |
|
0d9dbdef |
void cli_ftfree(const struct cl_engine *engine) |
7021b545 |
{ |
0d9dbdef |
struct cli_ftype *ftypes=engine->ftypes, *pt; |
7021b545 |
while(ftypes) {
pt = ftypes;
ftypes = ftypes->next; |
47d40feb |
mpool_free(engine->mempool, pt->magic);
mpool_free(engine->mempool, pt->tname);
mpool_free(engine->mempool, pt); |
7021b545 |
}
} |
e88f97f3 |
|
7021b545 |
cli_file_t cli_filetype(const unsigned char *buf, size_t buflen, const struct cl_engine *engine) |
888f5794 |
{ |
7021b545 |
struct cli_ftype *ftype = engine->ftypes; |
216a697f |
|
888f5794 |
|
7021b545 |
while(ftype) {
if(ftype->offset + ftype->length <= buflen) {
if(!memcmp(buf + ftype->offset, ftype->magic, ftype->length)) {
cli_dbgmsg("Recognized %s file\n", ftype->tname);
return ftype->type; |
888f5794 |
}
} |
7021b545 |
ftype = ftype->next; |
888f5794 |
}
|
c8f2d060 |
return cli_texttype(buf, buflen); |
888f5794 |
}
|
e12c29d2 |
int is_tar(unsigned char *buf, unsigned int nbytes); |
a7f5fd00 |
|
bd988961 |
cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
a7f5fd00 |
{ |
72ce4b70 |
unsigned char buff[MAGIC_BUFFER_SIZE + 1], *decoded; |
bd988961 |
int bread, sret; |
c8f2d060 |
cli_file_t ret = CL_TYPE_BINARY_DATA; |
bd988961 |
struct cli_matcher *root; |
4e9ab8ed |
struct cli_ac_data mdata; |
a7f5fd00 |
|
c8f2d060 |
if(!engine) {
cli_errmsg("cli_filetype2: engine == NULL\n");
return CL_TYPE_ERROR;
}
|
72ce4b70 |
memset(buff, 0, sizeof(buff));
bread = cli_readn(desc, buff, MAGIC_BUFFER_SIZE); |
7db77fbf |
if(bread == -1)
return CL_TYPE_ERROR; |
72ce4b70 |
buff[bread] = 0; |
7db77fbf |
|
72ce4b70 |
ret = cli_filetype(buff, bread, engine); |
a7f5fd00 |
|
c8f2d060 |
if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
/* HTML files may contain special characters and could be
* misidentified as BINARY_DATA by cli_filetype()
*/ |
bd988961 |
root = engine->root[0];
if(!root)
return ret;
|
aca9ea82 |
if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN)) |
bd988961 |
return ret;
|
33872a43 |
sret = cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, engine->root[0], &mdata, 0, ret, NULL, AC_SCAN_FT, NULL); |
4e9ab8ed |
cli_ac_freedata(&mdata); |
bd988961 |
if(sret >= CL_TYPENO) {
ret = sret;
} else { |
aca9ea82 |
if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN)) |
4e9ab8ed |
return ret;
|
72ce4b70 |
decoded = (unsigned char *) cli_utf16toascii((char *) buff, bread); |
bd988961 |
if(decoded) { |
33872a43 |
sret = cli_ac_scanbuff(decoded, strlen((char *) decoded), NULL, NULL, NULL, engine->root[0], &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL); |
bd988961 |
free(decoded);
if(sret == CL_TYPE_HTML)
ret = CL_TYPE_HTML_UTF16;
} |
4e9ab8ed |
cli_ac_freedata(&mdata); |
3506ac49 |
|
692bda68 |
if((((struct cli_dconf*) engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) { |
b3fc7f97 |
const char* encoding;
/* check if we can autodetect this encoding.
* If we can't don't try to detect HTML sig, since
* we just tried that above, and failed */ |
72ce4b70 |
if((encoding = encoding_detect_bom(buff, bread))) {
unsigned char decodedbuff[sizeof(buff)*2]; |
b3fc7f97 |
m_area_t in_area, out_area;
|
72ce4b70 |
in_area.buffer = (unsigned char *) buff; |
b3fc7f97 |
in_area.length = bread;
in_area.offset = 0;
out_area.buffer = decodedbuff;
out_area.length = sizeof(decodedbuff);
out_area.offset = 0;
/* in htmlnorm we simply skip over \0 chars, and that allows to parse HTML in any unicode
* (multibyte characters will not be exactly handled, but that is not a problem).
* However when detecting whether a file is HTML or not, we need exact conversion.
* (just eliminating zeros and matching would introduce false positives */
if(encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) { |
aca9ea82 |
if(cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN)) |
b3fc7f97 |
return ret;
if(out_area.length > 0) { |
33872a43 |
sret = cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, engine->root[0], &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */ |
b3fc7f97 |
if(sret == CL_TYPE_HTML) {
cli_dbgmsg("cli_filetype2: detected HTML signature in Unicode file\n");
/* htmlnorm is able to handle any unicode now, since it skips null chars */
ret = CL_TYPE_HTML;
} |
4e1127c5 |
} |
3506ac49 |
|
b3fc7f97 |
cli_ac_freedata(&mdata);
} |
4e1127c5 |
} |
3506ac49 |
} |
bd988961 |
}
}
|
c8f2d060 |
if(ret == CL_TYPE_BINARY_DATA) { |
72ce4b70 |
switch(is_tar(buff, bread)) {
case 1:
ret = CL_TYPE_OLD_TAR;
cli_dbgmsg("Recognized old fashioned tar file\n");
break;
case 2:
ret = CL_TYPE_POSIX_TAR;
cli_dbgmsg("Recognized POSIX tar file\n");
break; |
a7f5fd00 |
}
}
return ret;
} |