git-svn: trunk@3486
Tomasz Kojm authored on 2008/01/07 23:20:381 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,33 @@ |
0 |
+libclamav/textdet.c includes modified code from file-4.23/src/ascmagic.c. |
|
1 |
+The original LEGAL.NOTICE file for file-4.23 is reproduced below. |
|
2 |
+ |
|
3 |
+-------------------------------------------------------------------------- |
|
4 |
+$File: LEGAL.NOTICE,v 1.15 2006/05/03 18:48:33 christos Exp $ |
|
5 |
+Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995. |
|
6 |
+Software written by Ian F. Darwin and others; |
|
7 |
+maintained 1994- Christos Zoulas. |
|
8 |
+ |
|
9 |
+This software is not subject to any export provision of the United States |
|
10 |
+Department of Commerce, and may be exported to any country or planet. |
|
11 |
+ |
|
12 |
+Redistribution and use in source and binary forms, with or without |
|
13 |
+modification, are permitted provided that the following conditions |
|
14 |
+are met: |
|
15 |
+1. Redistributions of source code must retain the above copyright |
|
16 |
+ notice immediately at the beginning of the file, without modification, |
|
17 |
+ this list of conditions, and the following disclaimer. |
|
18 |
+2. Redistributions in binary form must reproduce the above copyright |
|
19 |
+ notice, this list of conditions and the following disclaimer in the |
|
20 |
+ documentation and/or other materials provided with the distribution. |
|
21 |
+ |
|
22 |
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
|
23 |
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
24 |
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
25 |
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR |
|
26 |
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
27 |
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
28 |
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
29 |
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
30 |
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
31 |
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
32 |
+SUCH DAMAGE. |
... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Mon Jan 7 14:50:24 CET 2008 (tk) |
|
2 |
+--------------------------------- |
|
3 |
+ * libclamav/textdet.c: text detection code based on file-4.23 |
|
4 |
+ * libclamav/filetypes.c: re-enable text detection (ASCII, UTF8, UTF16) |
|
5 |
+ |
|
1 | 6 |
Sun Jan 6 19:35:28 EET 2008 (edwin) |
2 | 7 |
------------------------------------ |
3 | 8 |
* build system: improve iconv() detection, by actually trying to link a |
... | ... |
@@ -17,7 +17,7 @@ |
17 | 17 |
# MA 02110-1301, USA. |
18 | 18 |
|
19 | 19 |
SUBDIRS = libclamunrar libclamunrar_iface libclamav clamscan clamd clamdscan freshclam sigtool clamconf database docs etc clamav-milter |
20 |
-EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL |
|
20 |
+EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL COPYING.file |
|
21 | 21 |
|
22 | 22 |
bin_SCRIPTS=clamav-config |
23 | 23 |
|
... | ... |
@@ -236,7 +236,7 @@ target_vendor = @target_vendor@ |
236 | 236 |
top_builddir = @top_builddir@ |
237 | 237 |
top_srcdir = @top_srcdir@ |
238 | 238 |
SUBDIRS = libclamunrar libclamunrar_iface libclamav clamscan clamd clamdscan freshclam sigtool clamconf database docs etc clamav-milter |
239 |
-EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL |
|
239 |
+EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL COPYING.file |
|
240 | 240 |
bin_SCRIPTS = clamav-config |
241 | 241 |
pkgconfigdir = $(libdir)/pkgconfig |
242 | 242 |
pkgconfig_DATA = libclamav.pc |
... | ... |
@@ -78,17 +78,17 @@ LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES) |
78 | 78 |
@ENABLE_UNRAR_TRUE@ $(top_builddir)/libclamunrar_iface/libclamunrar_iface.la |
79 | 79 |
am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \ |
80 | 80 |
md5.lo others.lo readdb.lo cvd.lo dsig.lo str.lo scanners.lo \ |
81 |
- filetypes.lo rtf.lo blob.lo mbox.lo message.lo snprintf.lo \ |
|
82 |
- table.lo text.lo ole2_extract.lo vba_extract.lo msexpand.lo \ |
|
83 |
- pe.lo upx.lo htmlnorm.lo chmunpack.lo rebuildpe.lo petite.lo \ |
|
84 |
- wwunpack.lo unsp.lo aspack.lo packlibs.lo fsg.lo mew.lo \ |
|
85 |
- upack.lo line.lo untar.lo unzip.lo inflate64.lo special.lo \ |
|
86 |
- binhex.lo is_tar.lo tnef.lo autoit.lo strlcpy.lo regcomp.lo \ |
|
87 |
- regerror.lo regexec.lo regfree.lo unarj.lo bzlib.lo nulsft.lo \ |
|
88 |
- pdf.lo spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo \ |
|
89 |
- phishcheck.lo phish_domaincheck_db.lo phish_whitelist.lo \ |
|
90 |
- regex_list.lo mspack.lo cab.lo entconv.lo hashtab.lo dconf.lo \ |
|
91 |
- lzma_iface.lo explode.lo |
|
81 |
+ textdet.lo filetypes.lo rtf.lo blob.lo mbox.lo message.lo \ |
|
82 |
+ snprintf.lo table.lo text.lo ole2_extract.lo vba_extract.lo \ |
|
83 |
+ msexpand.lo pe.lo upx.lo htmlnorm.lo chmunpack.lo rebuildpe.lo \ |
|
84 |
+ petite.lo wwunpack.lo unsp.lo aspack.lo packlibs.lo fsg.lo \ |
|
85 |
+ mew.lo upack.lo line.lo untar.lo unzip.lo inflate64.lo \ |
|
86 |
+ special.lo binhex.lo is_tar.lo tnef.lo autoit.lo strlcpy.lo \ |
|
87 |
+ regcomp.lo regerror.lo regexec.lo regfree.lo unarj.lo bzlib.lo \ |
|
88 |
+ nulsft.lo pdf.lo spin.lo yc.lo elf.lo sis.lo uuencode.lo \ |
|
89 |
+ pst.lo phishcheck.lo phish_domaincheck_db.lo \ |
|
90 |
+ phish_whitelist.lo regex_list.lo mspack.lo cab.lo entconv.lo \ |
|
91 |
+ hashtab.lo dconf.lo lzma_iface.lo explode.lo |
|
92 | 92 |
libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS) |
93 | 93 |
libclamav_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ |
94 | 94 |
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ |
... | ... |
@@ -278,6 +278,8 @@ libclamav_la_SOURCES = \ |
278 | 278 |
str.h \ |
279 | 279 |
scanners.c \ |
280 | 280 |
scanners.h \ |
281 |
+ textdet.c \ |
|
282 |
+ textdet.h \ |
|
281 | 283 |
filetypes.c \ |
282 | 284 |
filetypes.h \ |
283 | 285 |
filetypes_int.h \ |
... | ... |
@@ -555,6 +557,7 @@ distclean-compile: |
555 | 555 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strlcpy.Plo@am__quote@ |
556 | 556 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/table.Plo@am__quote@ |
557 | 557 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/text.Plo@am__quote@ |
558 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textdet.Plo@am__quote@ |
|
558 | 559 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tnef.Plo@am__quote@ |
559 | 560 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unarj.Plo@am__quote@ |
560 | 561 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unsp.Plo@am__quote@ |
... | ... |
@@ -628,7 +628,7 @@ fileblobAddData(fileblob *fb, const unsigned char *data, size_t len) |
628 | 628 |
*ctx->scanned += (unsigned long)len / CL_COUNT_PRECISION; |
629 | 629 |
fb->bytes_scanned += (unsigned long)len; |
630 | 630 |
|
631 |
- if((len > 5) && (cli_scanbuff(data, (unsigned int)len, ctx->virname, ctx->engine, CL_TYPE_UNKNOWN_DATA) == CL_VIRUS)) { |
|
631 |
+ if((len > 5) && (cli_scanbuff(data, (unsigned int)len, ctx->virname, ctx->engine, CL_TYPE_BINARY_DATA) == CL_VIRUS)) { |
|
632 | 632 |
cli_dbgmsg("fileblobAddData: found %s\n", *ctx->virname); |
633 | 633 |
fb->isInfected = 1; |
634 | 634 |
} |
... | ... |
@@ -38,6 +38,7 @@ |
38 | 38 |
#include "readdb.h" |
39 | 39 |
#include "matcher-ac.h" |
40 | 40 |
#include "str.h" |
41 |
+#include "textdet.h" |
|
41 | 42 |
|
42 | 43 |
#include "htmlnorm.h" |
43 | 44 |
#include "entconv.h" |
... | ... |
@@ -46,8 +47,11 @@ static const struct ftmap_s { |
46 | 46 |
const char *name; |
47 | 47 |
cli_file_t code; |
48 | 48 |
} ftmap[] = { |
49 |
- { "CL_TYPE_UNKNOWN_TEXT", CL_TYPE_UNKNOWN_TEXT }, |
|
50 |
- { "CL_TYPE_UNKNOWN_DATA", CL_TYPE_UNKNOWN_DATA }, |
|
49 |
+ { "CL_TYPE_TEXT_ASCII", CL_TYPE_TEXT_ASCII }, |
|
50 |
+ { "CL_TYPE_TEXT_UTF8", CL_TYPE_TEXT_UTF8 }, |
|
51 |
+ { "CL_TYPE_TEXT_UTF16LE", CL_TYPE_TEXT_UTF16LE }, |
|
52 |
+ { "CL_TYPE_TEXT_UTF16BE", CL_TYPE_TEXT_UTF16BE }, |
|
53 |
+ { "CL_TYPE_BINARY_DATA", CL_TYPE_BINARY_DATA }, |
|
51 | 54 |
{ "CL_TYPE_IGNORED", CL_TYPE_IGNORED }, |
52 | 55 |
{ "CL_TYPE_MSEXE", CL_TYPE_MSEXE }, |
53 | 56 |
{ "CL_TYPE_ELF", CL_TYPE_ELF }, |
... | ... |
@@ -83,7 +87,7 @@ static const struct ftmap_s { |
83 | 83 |
{ "CL_TYPE_ARJSFX", CL_TYPE_ARJSFX }, |
84 | 84 |
{ "CL_TYPE_NULSFT", CL_TYPE_NULSFT }, |
85 | 85 |
{ "CL_TYPE_AUTOIT", CL_TYPE_AUTOIT }, |
86 |
- { NULL, CL_TYPE_UNKNOWN_DATA } |
|
86 |
+ { NULL, CL_TYPE_IGNORED } |
|
87 | 87 |
}; |
88 | 88 |
|
89 | 89 |
cli_file_t cli_ftcode(const char *name) |
... | ... |
@@ -125,19 +129,7 @@ cli_file_t cli_filetype(const unsigned char *buf, size_t buflen, const struct cl |
125 | 125 |
ftype = ftype->next; |
126 | 126 |
} |
127 | 127 |
|
128 |
-/* FIXME: improve or drop this code |
|
129 |
- * https://wwws.clamav.net/bugzilla/show_bug.cgi?id=373 |
|
130 |
- * |
|
131 |
- int i, text = 1, len; |
|
132 |
- buflen < 25 ? (len = buflen) : (len = 25); |
|
133 |
- for(i = 0; i < len; i++) |
|
134 |
- if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) { |
|
135 |
- text = 0; |
|
136 |
- break; |
|
137 |
- } |
|
138 |
- return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA; |
|
139 |
-*/ |
|
140 |
- return CL_TYPE_UNKNOWN_TEXT; |
|
128 |
+ return cli_texttype(buf, buflen); |
|
141 | 129 |
} |
142 | 130 |
|
143 | 131 |
int is_tar(unsigned char *buf, unsigned int nbytes); |
... | ... |
@@ -146,16 +138,24 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
146 | 146 |
{ |
147 | 147 |
unsigned char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded, *bigbuff; |
148 | 148 |
int bread, sret; |
149 |
- cli_file_t ret = CL_TYPE_UNKNOWN_DATA; |
|
149 |
+ cli_file_t ret = CL_TYPE_BINARY_DATA; |
|
150 | 150 |
struct cli_matcher *root; |
151 | 151 |
struct cli_ac_data mdata; |
152 | 152 |
|
153 | 153 |
|
154 |
+ if(!engine) { |
|
155 |
+ cli_errmsg("cli_filetype2: engine == NULL\n"); |
|
156 |
+ return CL_TYPE_ERROR; |
|
157 |
+ } |
|
158 |
+ |
|
154 | 159 |
memset(smallbuff, 0, sizeof(smallbuff)); |
155 | 160 |
if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0) |
156 | 161 |
ret = cli_filetype(smallbuff, bread, engine); |
157 | 162 |
|
158 |
- if(engine && ret == CL_TYPE_UNKNOWN_TEXT) { |
|
163 |
+ if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) { |
|
164 |
+ /* HTML files may contain special characters and could be |
|
165 |
+ * misidentified as BINARY_DATA by cli_filetype() |
|
166 |
+ */ |
|
159 | 167 |
root = engine->root[0]; |
160 | 168 |
if(!root) |
161 | 169 |
return ret; |
... | ... |
@@ -221,7 +221,7 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
221 | 221 |
} |
222 | 222 |
} |
223 | 223 |
|
224 |
- if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) { |
|
224 |
+ if(ret == CL_TYPE_BINARY_DATA) { |
|
225 | 225 |
|
226 | 226 |
if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char)))) |
227 | 227 |
return ret; |
... | ... |
@@ -243,8 +243,7 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine) |
243 | 243 |
} |
244 | 244 |
} |
245 | 245 |
|
246 |
- if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) { |
|
247 |
- |
|
246 |
+ if(ret == CL_TYPE_BINARY_DATA) { |
|
248 | 247 |
if(!memcmp(bigbuff + 32769, "CD001" , 5) || !memcmp(bigbuff + 37633, "CD001" , 5)) { |
249 | 248 |
cli_dbgmsg("Recognized ISO 9660 CD-ROM data\n"); |
250 | 249 |
ret = CL_TYPE_IGNORED; |
... | ... |
@@ -33,8 +33,12 @@ |
33 | 33 |
#define MAX_EMBEDDED_OBJ 10 |
34 | 34 |
|
35 | 35 |
typedef enum { |
36 |
- CL_TYPE_UNKNOWN_TEXT = CL_TYPENO, |
|
37 |
- CL_TYPE_UNKNOWN_DATA, |
|
36 |
+ CL_TYPE_TEXT_ASCII = CL_TYPENO, /* X3.4, ISO-8859, non-ISO ext. ASCII */ |
|
37 |
+ CL_TYPE_TEXT_UTF8, |
|
38 |
+ CL_TYPE_TEXT_UTF16LE, |
|
39 |
+ CL_TYPE_TEXT_UTF16BE, |
|
40 |
+ CL_TYPE_BINARY_DATA, |
|
41 |
+ /* Please do not add any new types above this line */ |
|
38 | 42 |
CL_TYPE_IGNORED, |
39 | 43 |
CL_TYPE_ERROR, |
40 | 44 |
CL_TYPE_MSEXE, |
... | ... |
@@ -609,7 +609,7 @@ cli_mbox(const char *dir, int desc, cli_ctx *ctx) |
609 | 609 |
|
610 | 610 |
type = cli_filetype(start, size, ctx->engine); |
611 | 611 |
|
612 |
- if((type == CL_TYPE_UNKNOWN_TEXT) && |
|
612 |
+ if((type == CL_TYPE_TEXT_ASCII) && |
|
613 | 613 |
(strncmp(start, "Microsoft Mail Internet Headers", 31) == 0)) |
614 | 614 |
type = CL_TYPE_MAIL; |
615 | 615 |
|
... | ... |
@@ -1038,7 +1038,7 @@ save_text(cli_ctx *ctx, const char *dir, const char *start, size_t len) |
1038 | 1038 |
* in this way. It gets the "filetype" wrong and then |
1039 | 1039 |
* doesn't scan correctly |
1040 | 1040 |
*/ |
1041 |
- if(cli_scanbuff((char *)p, len, ctx->virname, ctx->engine, CL_TYPE_UNKNOWN_DATA) == CL_VIRUS) { |
|
1041 |
+ if(cli_scanbuff((char *)p, len, ctx->virname, ctx->engine, CL_TYPE_BINARY_DATA) == CL_VIRUS) { |
|
1042 | 1042 |
cli_dbgmsg("save_text: found %s\n", *ctx->virname); |
1043 | 1043 |
return CL_VIRUS; |
1044 | 1044 |
} |
... | ... |
@@ -1602,7 +1602,7 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg) |
1602 | 1602 |
|
1603 | 1603 |
|
1604 | 1604 |
if(typercg) switch(type) { |
1605 |
- case CL_TYPE_UNKNOWN_TEXT: |
|
1605 |
+ case CL_TYPE_TEXT_ASCII: |
|
1606 | 1606 |
case CL_TYPE_MSEXE: |
1607 | 1607 |
case CL_TYPE_ZIP: |
1608 | 1608 |
ftrec = 1; |
... | ... |
@@ -1620,7 +1620,7 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg) |
1620 | 1620 |
|
1621 | 1621 |
if(ret >= CL_TYPENO) { |
1622 | 1622 |
|
1623 |
- if(type == CL_TYPE_UNKNOWN_TEXT) { |
|
1623 |
+ if(type == CL_TYPE_TEXT_ASCII) { |
|
1624 | 1624 |
lseek(desc, 0, SEEK_SET); |
1625 | 1625 |
|
1626 | 1626 |
nret = cli_scandesc(desc, ctx, 0, ret, 1, NULL); |
... | ... |
@@ -1713,12 +1713,12 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg) |
1713 | 1713 |
|
1714 | 1714 |
if(nret != CL_VIRUS) switch(ret) { |
1715 | 1715 |
case CL_TYPE_HTML: |
1716 |
- if(SCAN_HTML && type == CL_TYPE_UNKNOWN_TEXT && (DCONF_DOC & DOC_CONF_HTML)) |
|
1716 |
+ if(SCAN_HTML && type == CL_TYPE_TEXT_ASCII && (DCONF_DOC & DOC_CONF_HTML)) |
|
1717 | 1717 |
nret = cli_scanhtml(desc, ctx); |
1718 | 1718 |
break; |
1719 | 1719 |
|
1720 | 1720 |
case CL_TYPE_MAIL: |
1721 |
- if(SCAN_MAIL && type == CL_TYPE_UNKNOWN_TEXT && (DCONF_MAIL & MAIL_CONF_MBOX)) |
|
1721 |
+ if(SCAN_MAIL && type == CL_TYPE_TEXT_ASCII && (DCONF_MAIL & MAIL_CONF_MBOX)) |
|
1722 | 1722 |
nret = cli_scanmail(desc, ctx); |
1723 | 1723 |
break; |
1724 | 1724 |
|
... | ... |
@@ -1949,7 +1949,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx) |
1949 | 1949 |
ret = cli_scansis(desc, ctx); |
1950 | 1950 |
break; |
1951 | 1951 |
|
1952 |
- case CL_TYPE_UNKNOWN_DATA: |
|
1952 |
+ case CL_TYPE_BINARY_DATA: |
|
1953 | 1953 |
ret = cli_check_mydoom_log(desc, ctx->virname); |
1954 | 1954 |
break; |
1955 | 1955 |
|
1956 | 1956 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,189 @@ |
0 |
+/* |
|
1 |
+ * Text detection based on ascmagic.c from the file(1) utility. |
|
2 |
+ * Portions Copyright (C) 2008 Sourcefire, Inc. |
|
3 |
+ * Maintained by Tomasz Kojm <tkojm@clamav.net> |
|
4 |
+ * |
|
5 |
+ * Copyright (c) Ian F. Darwin 1986-1995. |
|
6 |
+ * Software written by Ian F. Darwin and others; |
|
7 |
+ * maintained 1995-present by Christos Zoulas and others. |
|
8 |
+ * |
|
9 |
+ * Redistribution and use in source and binary forms, with or without |
|
10 |
+ * modification, are permitted provided that the following conditions |
|
11 |
+ * are met: |
|
12 |
+ * 1. Redistributions of source code must retain the above copyright |
|
13 |
+ * notice immediately at the beginning of the file, without modification, |
|
14 |
+ * this list of conditions, and the following disclaimer. |
|
15 |
+ * 2. Redistributions in binary form must reproduce the above copyright |
|
16 |
+ * notice, this list of conditions and the following disclaimer in the |
|
17 |
+ * documentation and/or other materials provided with the distribution. |
|
18 |
+ * |
|
19 |
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
|
20 |
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
21 |
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
22 |
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR |
|
23 |
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
24 |
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
25 |
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
26 |
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
27 |
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
28 |
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
29 |
+ * SUCH DAMAGE. |
|
30 |
+ */ |
|
31 |
+ |
|
32 |
+#if HAVE_CONFIG_H |
|
33 |
+#include "clamav-config.h" |
|
34 |
+#endif |
|
35 |
+ |
|
36 |
+#include <stdio.h> |
|
37 |
+#include <string.h> |
|
38 |
+#include <memory.h> |
|
39 |
+#include <ctype.h> |
|
40 |
+#include <stdlib.h> |
|
41 |
+#ifdef HAVE_UNISTD_H |
|
42 |
+#include <unistd.h> |
|
43 |
+#endif |
|
44 |
+ |
|
45 |
+#include "filetypes.h" |
|
46 |
+#include "textdet.h" |
|
47 |
+ |
|
48 |
/*
 * Character classes stored in text_chars[]: every possible byte value is
 * assigned exactly one of these.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification table indexed by byte value (0x00 - 0xff), one row per
 * high nibble.  It is a pure lookup table that is never written to, so it
 * is declared const to let the compiler reject accidental modification.
 */
static const char text_chars[256] = {
    /*                  BEL BS HT LF    FF CR    */
    F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
    /*                              ESC          */
    F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
    T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
    /*            NEL                            */
    X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
    X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
    I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
|
74 |
+ |
|
75 |
+static int td_isascii(const unsigned char *buf, unsigned int len) |
|
76 |
+{ |
|
77 |
+ unsigned int i; |
|
78 |
+ |
|
79 |
+ for(i = 0; i < len; i++) |
|
80 |
+ if(text_chars[buf[i]] == F) |
|
81 |
+ return 0; |
|
82 |
+ |
|
83 |
+ return 1; |
|
84 |
+} |
|
85 |
+ |
|
86 |
+static int td_isutf8(const unsigned char *buf, unsigned int len) |
|
87 |
+{ |
|
88 |
+ unsigned int i, j, gotone = 0; |
|
89 |
+ |
|
90 |
+ |
|
91 |
+ for(i = 0; i < len; i++) { |
|
92 |
+ if((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ |
|
93 |
+ /* |
|
94 |
+ * Even if the whole file is valid UTF-8 sequences, |
|
95 |
+ * still reject it if it uses weird control characters. |
|
96 |
+ */ |
|
97 |
+ if(text_chars[buf[i]] != T) |
|
98 |
+ return 0; |
|
99 |
+ |
|
100 |
+ } else if((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ |
|
101 |
+ return 0; |
|
102 |
+ } else { /* 11xxxxxx begins UTF-8 */ |
|
103 |
+ unsigned int following; |
|
104 |
+ |
|
105 |
+ if((buf[i] & 0x20) == 0) { /* 110xxxxx */ |
|
106 |
+ /* c = buf[i] & 0x1f; */ |
|
107 |
+ following = 1; |
|
108 |
+ } else if((buf[i] & 0x10) == 0) { /* 1110xxxx */ |
|
109 |
+ /* c = buf[i] & 0x0f; */ |
|
110 |
+ following = 2; |
|
111 |
+ } else if((buf[i] & 0x08) == 0) { /* 11110xxx */ |
|
112 |
+ /* c = buf[i] & 0x07; */ |
|
113 |
+ following = 3; |
|
114 |
+ } else if((buf[i] & 0x04) == 0) { /* 111110xx */ |
|
115 |
+ /* c = buf[i] & 0x03; */ |
|
116 |
+ following = 4; |
|
117 |
+ } else if((buf[i] & 0x02) == 0) { /* 1111110x */ |
|
118 |
+ /* c = buf[i] & 0x01; */ |
|
119 |
+ following = 5; |
|
120 |
+ } else { |
|
121 |
+ return 0; |
|
122 |
+ } |
|
123 |
+ |
|
124 |
+ for(j = 0; j < following; j++) { |
|
125 |
+ if(++i >= len) |
|
126 |
+ return gotone; |
|
127 |
+ |
|
128 |
+ if((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) |
|
129 |
+ return 0; |
|
130 |
+ |
|
131 |
+ /* c = (c << 6) + (buf[i] & 0x3f); */ |
|
132 |
+ } |
|
133 |
+ |
|
134 |
+ gotone = 1; |
|
135 |
+ } |
|
136 |
+ } |
|
137 |
+ |
|
138 |
+ return gotone; |
|
139 |
+} |
|
140 |
+ |
|
141 |
+static int td_isutf16(const unsigned char *buf, unsigned int len) |
|
142 |
+{ |
|
143 |
+ unsigned int be, i, c; |
|
144 |
+ |
|
145 |
+ |
|
146 |
+ if(len < 2) |
|
147 |
+ return 0; |
|
148 |
+ |
|
149 |
+ if(buf[0] == 0xff && buf[1] == 0xfe) |
|
150 |
+ be = 0; |
|
151 |
+ else if(buf[0] == 0xfe && buf[1] == 0xff) |
|
152 |
+ be = 1; |
|
153 |
+ else |
|
154 |
+ return 0; |
|
155 |
+ |
|
156 |
+ for(i = 2; i + 1 < len; i += 2) { |
|
157 |
+ if(be) |
|
158 |
+ c = buf[i + 1] + 256 * buf[i]; |
|
159 |
+ else |
|
160 |
+ c = buf[i] + 256 * buf[i + 1]; |
|
161 |
+ |
|
162 |
+ if(c == 0xfffe) |
|
163 |
+ return 0; |
|
164 |
+ |
|
165 |
+ if(c < 128 && text_chars[c] != T) |
|
166 |
+ return 0; |
|
167 |
+ } |
|
168 |
+ |
|
169 |
+ return 1 + be; |
|
170 |
+} |
|
171 |
+ |
|
172 |
+cli_file_t cli_texttype(const unsigned char *buf, unsigned int len) |
|
173 |
+{ |
|
174 |
+ int ret; |
|
175 |
+ |
|
176 |
+ if(td_isutf8(buf, len)) { |
|
177 |
+ return CL_TYPE_TEXT_UTF8; |
|
178 |
+ } else if((ret = td_isutf16(buf, len))) { |
|
179 |
+ if(ret == 1) |
|
180 |
+ return CL_TYPE_TEXT_UTF16LE; |
|
181 |
+ else |
|
182 |
+ return CL_TYPE_TEXT_UTF16BE; |
|
183 |
+ } else if(td_isascii(buf, len)) { |
|
184 |
+ return CL_TYPE_TEXT_ASCII; |
|
185 |
+ } else { |
|
186 |
+ return CL_TYPE_BINARY_DATA; |
|
187 |
+ } |
|
188 |
+} |
0 | 189 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,26 @@ |
0 |
+/* |
|
1 |
+ * Copyright (C) 2008 Sourcefire, Inc. |
|
2 |
+ * |
|
3 |
+ * This program is free software; you can redistribute it and/or modify |
|
4 |
+ * it under the terms of the GNU General Public License version 2 as |
|
5 |
+ * published by the Free Software Foundation. |
|
6 |
+ * |
|
7 |
+ * This program is distributed in the hope that it will be useful, |
|
8 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
9 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
10 |
+ * GNU General Public License for more details. |
|
11 |
+ * |
|
12 |
+ * You should have received a copy of the GNU General Public License |
|
13 |
+ * along with this program; if not, write to the Free Software |
|
14 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
15 |
+ * MA 02110-1301, USA. |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#ifndef __TEXTDET_H |
|
19 |
+#define __TEXTDET_H |
|
20 |
+ |
|
21 |
+#include "filetypes.h" |
|
22 |
+ |
|
23 |
+cli_file_t cli_texttype(const unsigned char *buf, unsigned int len); |
|
24 |
+ |
|
25 |
+#endif |