Browse code

re-enable text detection (ASCII, UTF8, UTF16)

git-svn: trunk@3486

Tomasz Kojm authored on 2008/01/07 23:20:38
Showing 13 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,33 @@
0
+libclamav/textdet.c includes modified code from file-4.23/src/ascmagic.c.
1
+The original LEGAL.NOTICE file for file-4.23 is reproduced below.
2
+
3
+--------------------------------------------------------------------------
4
+$File: LEGAL.NOTICE,v 1.15 2006/05/03 18:48:33 christos Exp $
5
+Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
6
+Software written by Ian F. Darwin and others;
7
+maintained 1994- Christos Zoulas.
8
+
9
+This software is not subject to any export provision of the United States
10
+Department of Commerce, and may be exported to any country or planet.
11
+
12
+Redistribution and use in source and binary forms, with or without
13
+modification, are permitted provided that the following conditions
14
+are met:
15
+1. Redistributions of source code must retain the above copyright
16
+   notice immediately at the beginning of the file, without modification,
17
+   this list of conditions, and the following disclaimer.
18
+2. Redistributions in binary form must reproduce the above copyright
19
+   notice, this list of conditions and the following disclaimer in the
20
+   documentation and/or other materials provided with the distribution.
21
+ 
22
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
26
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
+SUCH DAMAGE.
... ...
@@ -1,3 +1,8 @@
1
+Mon Jan  7 14:50:24 CET 2008 (tk)
2
+---------------------------------
3
+  * libclamav/textdet.c: text detection code based on file-4.23
4
+  * libclamav/filetypes.c: re-enable text detection (ASCII, UTF8, UTF16)
5
+
1 6
 Sun Jan  6 19:35:28 EET 2008 (edwin)
2 7
 ------------------------------------
3 8
  * build system: improve iconv() detection, by actually trying to link a
... ...
@@ -17,7 +17,7 @@
17 17
 #  MA 02110-1301, USA.
18 18
 
19 19
 SUBDIRS = libclamunrar libclamunrar_iface libclamav clamscan clamd clamdscan freshclam sigtool clamconf database docs etc clamav-milter
20
-EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL
20
+EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL COPYING.file
21 21
 
22 22
 bin_SCRIPTS=clamav-config
23 23
 
... ...
@@ -236,7 +236,7 @@ target_vendor = @target_vendor@
236 236
 top_builddir = @top_builddir@
237 237
 top_srcdir = @top_srcdir@
238 238
 SUBDIRS = libclamunrar libclamunrar_iface libclamav clamscan clamd clamdscan freshclam sigtool clamconf database docs etc clamav-milter
239
-EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL
239
+EXTRA_DIST = FAQ contrib test examples BUGS shared libclamav.pc.in UPGRADE COPYING.bzip2 COPYING.lzma COPYING.unrar COPYING.LGPL COPYING.file
240 240
 bin_SCRIPTS = clamav-config
241 241
 pkgconfigdir = $(libdir)/pkgconfig
242 242
 pkgconfig_DATA = libclamav.pc
... ...
@@ -56,6 +56,8 @@ libclamav_la_SOURCES = \
56 56
 	str.h \
57 57
 	scanners.c \
58 58
 	scanners.h \
59
+	textdet.c \
60
+	textdet.h \
59 61
 	filetypes.c \
60 62
 	filetypes.h \
61 63
 	filetypes_int.h \
... ...
@@ -78,17 +78,17 @@ LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
78 78
 @ENABLE_UNRAR_TRUE@	$(top_builddir)/libclamunrar_iface/libclamunrar_iface.la
79 79
 am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \
80 80
 	md5.lo others.lo readdb.lo cvd.lo dsig.lo str.lo scanners.lo \
81
-	filetypes.lo rtf.lo blob.lo mbox.lo message.lo snprintf.lo \
82
-	table.lo text.lo ole2_extract.lo vba_extract.lo msexpand.lo \
83
-	pe.lo upx.lo htmlnorm.lo chmunpack.lo rebuildpe.lo petite.lo \
84
-	wwunpack.lo unsp.lo aspack.lo packlibs.lo fsg.lo mew.lo \
85
-	upack.lo line.lo untar.lo unzip.lo inflate64.lo special.lo \
86
-	binhex.lo is_tar.lo tnef.lo autoit.lo strlcpy.lo regcomp.lo \
87
-	regerror.lo regexec.lo regfree.lo unarj.lo bzlib.lo nulsft.lo \
88
-	pdf.lo spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo \
89
-	phishcheck.lo phish_domaincheck_db.lo phish_whitelist.lo \
90
-	regex_list.lo mspack.lo cab.lo entconv.lo hashtab.lo dconf.lo \
91
-	lzma_iface.lo explode.lo
81
+	textdet.lo filetypes.lo rtf.lo blob.lo mbox.lo message.lo \
82
+	snprintf.lo table.lo text.lo ole2_extract.lo vba_extract.lo \
83
+	msexpand.lo pe.lo upx.lo htmlnorm.lo chmunpack.lo rebuildpe.lo \
84
+	petite.lo wwunpack.lo unsp.lo aspack.lo packlibs.lo fsg.lo \
85
+	mew.lo upack.lo line.lo untar.lo unzip.lo inflate64.lo \
86
+	special.lo binhex.lo is_tar.lo tnef.lo autoit.lo strlcpy.lo \
87
+	regcomp.lo regerror.lo regexec.lo regfree.lo unarj.lo bzlib.lo \
88
+	nulsft.lo pdf.lo spin.lo yc.lo elf.lo sis.lo uuencode.lo \
89
+	pst.lo phishcheck.lo phish_domaincheck_db.lo \
90
+	phish_whitelist.lo regex_list.lo mspack.lo cab.lo entconv.lo \
91
+	hashtab.lo dconf.lo lzma_iface.lo explode.lo
92 92
 libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
93 93
 libclamav_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
94 94
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
... ...
@@ -278,6 +278,8 @@ libclamav_la_SOURCES = \
278 278
 	str.h \
279 279
 	scanners.c \
280 280
 	scanners.h \
281
+	textdet.c \
282
+	textdet.h \
281 283
 	filetypes.c \
282 284
 	filetypes.h \
283 285
 	filetypes_int.h \
... ...
@@ -555,6 +557,7 @@ distclean-compile:
555 555
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strlcpy.Plo@am__quote@
556 556
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/table.Plo@am__quote@
557 557
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/text.Plo@am__quote@
558
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textdet.Plo@am__quote@
558 559
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tnef.Plo@am__quote@
559 560
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unarj.Plo@am__quote@
560 561
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unsp.Plo@am__quote@
... ...
@@ -628,7 +628,7 @@ fileblobAddData(fileblob *fb, const unsigned char *data, size_t len)
628 628
 					*ctx->scanned += (unsigned long)len / CL_COUNT_PRECISION;
629 629
 				fb->bytes_scanned += (unsigned long)len;
630 630
 
631
-				if((len > 5) && (cli_scanbuff(data, (unsigned int)len, ctx->virname, ctx->engine, CL_TYPE_UNKNOWN_DATA) == CL_VIRUS)) {
631
+				if((len > 5) && (cli_scanbuff(data, (unsigned int)len, ctx->virname, ctx->engine, CL_TYPE_BINARY_DATA) == CL_VIRUS)) {
632 632
 					cli_dbgmsg("fileblobAddData: found %s\n", *ctx->virname);
633 633
 					fb->isInfected = 1;
634 634
 				}
... ...
@@ -38,6 +38,7 @@
38 38
 #include "readdb.h"
39 39
 #include "matcher-ac.h"
40 40
 #include "str.h"
41
+#include "textdet.h"
41 42
 
42 43
 #include "htmlnorm.h"
43 44
 #include "entconv.h"
... ...
@@ -46,8 +47,11 @@ static const struct ftmap_s {
46 46
     const char *name;
47 47
     cli_file_t code;
48 48
 } ftmap[] = {
49
-    { "CL_TYPE_UNKNOWN_TEXT",	CL_TYPE_UNKNOWN_TEXT	},
50
-    { "CL_TYPE_UNKNOWN_DATA",	CL_TYPE_UNKNOWN_DATA	},
49
+    { "CL_TYPE_TEXT_ASCII",	CL_TYPE_TEXT_ASCII	},
50
+    { "CL_TYPE_TEXT_UTF8",	CL_TYPE_TEXT_UTF8	},
51
+    { "CL_TYPE_TEXT_UTF16LE",	CL_TYPE_TEXT_UTF16LE	},
52
+    { "CL_TYPE_TEXT_UTF16BE",	CL_TYPE_TEXT_UTF16BE	},
53
+    { "CL_TYPE_BINARY_DATA",	CL_TYPE_BINARY_DATA	},
51 54
     { "CL_TYPE_IGNORED",	CL_TYPE_IGNORED		},
52 55
     { "CL_TYPE_MSEXE",		CL_TYPE_MSEXE		},
53 56
     { "CL_TYPE_ELF",		CL_TYPE_ELF		},
... ...
@@ -83,7 +87,7 @@ static const struct ftmap_s {
83 83
     { "CL_TYPE_ARJSFX",		CL_TYPE_ARJSFX		},
84 84
     { "CL_TYPE_NULSFT",		CL_TYPE_NULSFT		},
85 85
     { "CL_TYPE_AUTOIT",		CL_TYPE_AUTOIT		},
86
-    { NULL,			CL_TYPE_UNKNOWN_DATA	}
86
+    { NULL,			CL_TYPE_IGNORED		}
87 87
 };
88 88
 
89 89
 cli_file_t cli_ftcode(const char *name)
... ...
@@ -125,19 +129,7 @@ cli_file_t cli_filetype(const unsigned char *buf, size_t buflen, const struct cl
125 125
 	ftype = ftype->next;
126 126
     }
127 127
 
128
-/* FIXME: improve or drop this code
129
- * https://wwws.clamav.net/bugzilla/show_bug.cgi?id=373
130
- *
131
-	int i, text = 1, len;
132
-    buflen < 25 ? (len = buflen) : (len = 25);
133
-    for(i = 0; i < len; i++)
134
-	if(!iscntrl(buf[i]) && !isprint(buf[i]) && !internat[buf[i] & 0xff]) {
135
-	    text = 0;
136
-	    break;
137
-	}
138
-    return text ? CL_TYPE_UNKNOWN_TEXT : CL_TYPE_UNKNOWN_DATA;
139
-*/
140
-    return CL_TYPE_UNKNOWN_TEXT;
128
+    return cli_texttype(buf, buflen);
141 129
 }
142 130
 
143 131
 int is_tar(unsigned char *buf, unsigned int nbytes);
... ...
@@ -146,16 +138,24 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
146 146
 {
147 147
 	unsigned char smallbuff[MAGIC_BUFFER_SIZE + 1], *decoded, *bigbuff;
148 148
 	int bread, sret;
149
-	cli_file_t ret = CL_TYPE_UNKNOWN_DATA;
149
+	cli_file_t ret = CL_TYPE_BINARY_DATA;
150 150
 	struct cli_matcher *root;
151 151
 	struct cli_ac_data mdata;
152 152
 
153 153
 
154
+    if(!engine) {
155
+	cli_errmsg("cli_filetype2: engine == NULL\n");
156
+	return CL_TYPE_ERROR;
157
+    }
158
+
154 159
     memset(smallbuff, 0, sizeof(smallbuff));
155 160
     if((bread = read(desc, smallbuff, MAGIC_BUFFER_SIZE)) > 0)
156 161
 	ret = cli_filetype(smallbuff, bread, engine);
157 162
 
158
-    if(engine && ret == CL_TYPE_UNKNOWN_TEXT) {
163
+    if(ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
164
+	/* HTML files may contain special characters and could be
165
+	 * misidentified as BINARY_DATA by cli_filetype()
166
+	 */
159 167
 	root = engine->root[0];
160 168
 	if(!root)
161 169
 	    return ret;
... ...
@@ -221,7 +221,7 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
221 221
 	}
222 222
     }
223 223
 
224
-    if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
224
+    if(ret == CL_TYPE_BINARY_DATA) {
225 225
 
226 226
 	if(!(bigbuff = (unsigned char *) cli_calloc(37638 + 1, sizeof(unsigned char))))
227 227
 	    return ret;
... ...
@@ -243,8 +243,7 @@ cli_file_t cli_filetype2(int desc, const struct cl_engine *engine)
243 243
 	    }
244 244
 	}
245 245
 
246
-	if(ret == CL_TYPE_UNKNOWN_DATA || ret == CL_TYPE_UNKNOWN_TEXT) {
247
-
246
+	if(ret == CL_TYPE_BINARY_DATA) {
248 247
 	    if(!memcmp(bigbuff + 32769, "CD001" , 5) || !memcmp(bigbuff + 37633, "CD001" , 5)) {
249 248
 		cli_dbgmsg("Recognized ISO 9660 CD-ROM data\n");
250 249
 		ret = CL_TYPE_IGNORED;
... ...
@@ -33,8 +33,12 @@
33 33
 #define MAX_EMBEDDED_OBJ 10
34 34
 
35 35
 typedef enum {
36
-    CL_TYPE_UNKNOWN_TEXT = CL_TYPENO,
37
-    CL_TYPE_UNKNOWN_DATA,
36
+    CL_TYPE_TEXT_ASCII = CL_TYPENO, /* X3.4, ISO-8859, non-ISO ext. ASCII */
37
+    CL_TYPE_TEXT_UTF8,
38
+    CL_TYPE_TEXT_UTF16LE,
39
+    CL_TYPE_TEXT_UTF16BE,
40
+    CL_TYPE_BINARY_DATA,
41
+    /* Please do not add any new types above this line */
38 42
     CL_TYPE_IGNORED,
39 43
     CL_TYPE_ERROR,
40 44
     CL_TYPE_MSEXE,
... ...
@@ -609,7 +609,7 @@ cli_mbox(const char *dir, int desc, cli_ctx *ctx)
609 609
 
610 610
 		type = cli_filetype(start, size, ctx->engine);
611 611
 
612
-		if((type == CL_TYPE_UNKNOWN_TEXT) &&
612
+		if((type == CL_TYPE_TEXT_ASCII) &&
613 613
 		   (strncmp(start, "Microsoft Mail Internet Headers", 31) == 0))
614 614
 			type = CL_TYPE_MAIL;
615 615
 
... ...
@@ -1038,7 +1038,7 @@ save_text(cli_ctx *ctx, const char *dir, const char *start, size_t len)
1038 1038
 		 *	in this way. It gets the "filetype" wrong and then
1039 1039
 		 *	doesn't scan correctly
1040 1040
 		 */
1041
-		if(cli_scanbuff((char *)p, len, ctx->virname, ctx->engine, CL_TYPE_UNKNOWN_DATA) == CL_VIRUS) {
1041
+		if(cli_scanbuff((char *)p, len, ctx->virname, ctx->engine, CL_TYPE_BINARY_DATA) == CL_VIRUS) {
1042 1042
 			cli_dbgmsg("save_text: found %s\n", *ctx->virname);
1043 1043
 			return CL_VIRUS;
1044 1044
 		}
... ...
@@ -1602,7 +1602,7 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg)
1602 1602
 
1603 1603
 
1604 1604
     if(typercg) switch(type) {
1605
-	case CL_TYPE_UNKNOWN_TEXT:
1605
+	case CL_TYPE_TEXT_ASCII:
1606 1606
 	case CL_TYPE_MSEXE:
1607 1607
 	case CL_TYPE_ZIP:
1608 1608
 	    ftrec = 1;
... ...
@@ -1620,7 +1620,7 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg)
1620 1620
 
1621 1621
     if(ret >= CL_TYPENO) {
1622 1622
 
1623
-	if(type == CL_TYPE_UNKNOWN_TEXT) {
1623
+	if(type == CL_TYPE_TEXT_ASCII) {
1624 1624
 	    lseek(desc, 0, SEEK_SET);
1625 1625
 
1626 1626
 	    nret = cli_scandesc(desc, ctx, 0, ret, 1, NULL);
... ...
@@ -1713,12 +1713,12 @@ static int cli_scanraw(int desc, cli_ctx *ctx, cli_file_t type, uint8_t typercg)
1713 1713
 
1714 1714
 	if(nret != CL_VIRUS) switch(ret) {
1715 1715
 	    case CL_TYPE_HTML:
1716
-		if(SCAN_HTML && type == CL_TYPE_UNKNOWN_TEXT && (DCONF_DOC & DOC_CONF_HTML))
1716
+		if(SCAN_HTML && type == CL_TYPE_TEXT_ASCII && (DCONF_DOC & DOC_CONF_HTML))
1717 1717
 		    nret = cli_scanhtml(desc, ctx);
1718 1718
 		break;
1719 1719
 
1720 1720
 	    case CL_TYPE_MAIL:
1721
-		if(SCAN_MAIL && type == CL_TYPE_UNKNOWN_TEXT && (DCONF_MAIL & MAIL_CONF_MBOX))
1721
+		if(SCAN_MAIL && type == CL_TYPE_TEXT_ASCII && (DCONF_MAIL & MAIL_CONF_MBOX))
1722 1722
 		    nret = cli_scanmail(desc, ctx);
1723 1723
 		break;
1724 1724
 
... ...
@@ -1949,7 +1949,7 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1949 1949
 		ret = cli_scansis(desc, ctx);
1950 1950
 	    break;
1951 1951
 
1952
-	case CL_TYPE_UNKNOWN_DATA:
1952
+	case CL_TYPE_BINARY_DATA:
1953 1953
 	    ret = cli_check_mydoom_log(desc, ctx->virname);
1954 1954
 	    break;
1955 1955
 
1956 1956
new file mode 100644
... ...
@@ -0,0 +1,189 @@
0
+/*
1
+ * Text detection based on ascmagic.c from the file(1) utility.
2
+ * Portions Copyright (C) 2008 Sourcefire, Inc.
3
+ * Maintained by Tomasz Kojm <tkojm@clamav.net>
4
+ *
5
+ * Copyright (c) Ian F. Darwin 1986-1995.
6
+ * Software written by Ian F. Darwin and others;
7
+ * maintained 1995-present by Christos Zoulas and others.
8
+ * 
9
+ * Redistribution and use in source and binary forms, with or without
10
+ * modification, are permitted provided that the following conditions
11
+ * are met:
12
+ * 1. Redistributions of source code must retain the above copyright
13
+ *    notice immediately at the beginning of the file, without modification,
14
+ *    this list of conditions, and the following disclaimer.
15
+ * 2. Redistributions in binary form must reproduce the above copyright
16
+ *    notice, this list of conditions and the following disclaimer in the
17
+ *    documentation and/or other materials provided with the distribution.
18
+ *  
19
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
23
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29
+ * SUCH DAMAGE.
30
+ */
31
+
32
+#if HAVE_CONFIG_H
33
+#include "clamav-config.h"
34
+#endif
35
+
36
+#include <stdio.h>
37
+#include <string.h>
38
+#include <memory.h>
39
+#include <ctype.h>
40
+#include <stdlib.h>
41
+#ifdef HAVE_UNISTD_H
42
+#include <unistd.h>
43
+#endif
44
+
45
+#include "filetypes.h"
46
+#include "textdet.h"
47
+
48
+#define F 0   /* character never appears in text */
49
+#define T 1   /* character appears in plain ASCII text */
50
+#define I 2   /* character appears in ISO-8859 text */
51
+#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
52
+
53
+static char text_chars[256] = {
54
+	/*                  BEL BS HT LF    FF CR    */
55
+	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
56
+        /*                              ESC          */
57
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
58
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
59
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
60
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
61
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
62
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
63
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
64
+	/*            NEL                            */
65
+	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
66
+	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
67
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
68
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
69
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
70
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
71
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
72
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
73
+};
74
+
75
+static int td_isascii(const unsigned char *buf, unsigned int len)
76
+{
77
+	unsigned int i;
78
+
79
+    for(i = 0; i < len; i++)
80
+	if(text_chars[buf[i]] == F)
81
+	    return 0;
82
+
83
+    return 1;
84
+}
85
+
86
+static int td_isutf8(const unsigned char *buf, unsigned int len)
87
+{
88
+	unsigned int i, j, gotone = 0;
89
+
90
+
91
+    for(i = 0; i < len; i++) {
92
+	if((buf[i] & 0x80) == 0) {  /* 0xxxxxxx is plain ASCII */
93
+	    /*
94
+	     * Even if the whole file is valid UTF-8 sequences,
95
+	     * still reject it if it uses weird control characters.
96
+	     */
97
+	    if(text_chars[buf[i]] != T)
98
+		return 0;
99
+
100
+	} else if((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
101
+	    return 0;
102
+	} else {			   /* 11xxxxxx begins UTF-8 */
103
+		unsigned int following;
104
+
105
+	    if((buf[i] & 0x20) == 0) {		/* 110xxxxx */
106
+		/* c = buf[i] & 0x1f; */
107
+		following = 1;
108
+	    } else if((buf[i] & 0x10) == 0) {	/* 1110xxxx */
109
+		/* c = buf[i] & 0x0f; */
110
+		following = 2;
111
+	    } else if((buf[i] & 0x08) == 0) {	/* 11110xxx */
112
+		/* c = buf[i] & 0x07; */
113
+		following = 3;
114
+	    } else if((buf[i] & 0x04) == 0) {	/* 111110xx */
115
+		/* c = buf[i] & 0x03; */
116
+		following = 4;
117
+	    } else if((buf[i] & 0x02) == 0) {	/* 1111110x */
118
+		/* c = buf[i] & 0x01; */
119
+		following = 5;
120
+	    } else {
121
+		return 0;
122
+	    }
123
+
124
+	    for(j = 0; j < following; j++) {
125
+		if(++i >= len)
126
+		    return gotone;
127
+
128
+		if((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
129
+		    return 0;
130
+
131
+		/* c = (c << 6) + (buf[i] & 0x3f); */
132
+	    }
133
+
134
+	    gotone = 1;
135
+	}
136
+    }
137
+
138
+    return gotone;
139
+}
140
+
141
+static int td_isutf16(const unsigned char *buf, unsigned int len)
142
+{
143
+	unsigned int be, i, c;
144
+
145
+
146
+    if(len < 2)
147
+	return 0;
148
+
149
+    if(buf[0] == 0xff && buf[1] == 0xfe)
150
+	be = 0;
151
+    else if(buf[0] == 0xfe && buf[1] == 0xff)
152
+	be = 1;
153
+    else
154
+	return 0;
155
+
156
+    for(i = 2; i + 1 < len; i += 2) {
157
+	if(be)
158
+	    c = buf[i + 1] + 256 * buf[i];
159
+	else
160
+	    c = buf[i] + 256 * buf[i + 1];
161
+
162
+	if(c == 0xfffe)
163
+	    return 0;
164
+
165
+	if(c < 128 && text_chars[c] != T)
166
+	    return 0;
167
+    }
168
+
169
+    return 1 + be;
170
+}
171
+
172
+cli_file_t cli_texttype(const unsigned char *buf, unsigned int len)
173
+{
174
+	int ret;
175
+
176
+    if(td_isutf8(buf, len)) {
177
+	return CL_TYPE_TEXT_UTF8;
178
+    } else if((ret = td_isutf16(buf, len))) {
179
+	if(ret == 1)
180
+	    return CL_TYPE_TEXT_UTF16LE;
181
+	else
182
+	    return CL_TYPE_TEXT_UTF16BE;
183
+    } else if(td_isascii(buf, len)) {
184
+	return CL_TYPE_TEXT_ASCII;
185
+    } else {
186
+	return CL_TYPE_BINARY_DATA;
187
+    }
188
+}
0 189
new file mode 100644
... ...
@@ -0,0 +1,26 @@
0
+/*
1
+ *  Copyright (C) 2008 Sourcefire, Inc.
2
+ *
3
+ *  This program is free software; you can redistribute it and/or modify
4
+ *  it under the terms of the GNU General Public License version 2 as
5
+ *  published by the Free Software Foundation.
6
+ *
7
+ *  This program is distributed in the hope that it will be useful,
8
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+ *  GNU General Public License for more details.
11
+ *
12
+ *  You should have received a copy of the GNU General Public License
13
+ *  along with this program; if not, write to the Free Software
14
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
15
+ *  MA 02110-1301, USA.
16
+ */
17
+
18
+#ifndef __TEXTDET_H
19
+#define __TEXTDET_H
20
+
21
+#include "filetypes.h"
22
+
23
+cli_file_t cli_texttype(const unsigned char *buf, unsigned int len);
24
+
25
+#endif