Browse code

libclamav/textdet.c: improve (**or screw up**) UTF-16 detection (bb#1209)

git-svn: trunk@4835

Tomasz Kojm authored on 2009/02/19 06:55:25
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Wed Feb 18 23:12:36 CET 2009 (tk)
2
+---------------------------------
3
+ * libclamav/textdet.c: improve UTF-16 detection (bb#1209)
4
+
1 5
 Thu Feb 19 00:05:28 EET 2009 (edwin)
2 6
 ------------------------------------
3 7
  * clamd/server-th.c: move the command parsing, and stream handling
... ...
@@ -45,6 +45,7 @@
45 45
 
46 46
 #include "filetypes.h"
47 47
 #include "textdet.h"
48
+#include "others.h"
48 49
 
49 50
 #define F 0   /* character never appears in text */
50 51
 #define T 1   /* character appears in plain ASCII text */
... ...
@@ -141,7 +142,7 @@ static int td_isutf8(const unsigned char *buf, unsigned int len)
141 141
 
142 142
 static int td_isutf16(const unsigned char *buf, unsigned int len)
143 143
 {
144
-	unsigned int be, i, c;
144
+	unsigned int be = 1, nobom = 0, i, c, bad = 0;
145 145
 
146 146
 
147 147
     if(len < 2)
... ...
@@ -152,7 +153,7 @@ static int td_isutf16(const unsigned char *buf, unsigned int len)
152 152
     else if(buf[0] == 0xfe && buf[1] == 0xff)
153 153
 	be = 1;
154 154
     else
155
-	return 0;
155
+	nobom = 1;
156 156
 
157 157
     for(i = 2; i + 1 < len; i += 2) {
158 158
 	if(be)
... ...
@@ -163,10 +164,17 @@ static int td_isutf16(const unsigned char *buf, unsigned int len)
163 163
 	if(c == 0xfffe)
164 164
 	    return 0;
165 165
 
166
-	if(c < 128 && text_chars[c] != T)
167
-	    return 0;
166
+	if(c < 128 && text_chars[c] != T) {
167
+	    if(nobom)
168
+		return 0;
169
+	    else
170
+		bad++;
171
+	}
168 172
     }
169 173
 
174
+    if(!nobom && bad >= len / 2)
175
+	return 0;
176
+
170 177
     return 1 + be;
171 178
 }
172 179
 
... ...
@@ -174,16 +182,17 @@ cli_file_t cli_texttype(const unsigned char *buf, unsigned int len)
174 174
 {
175 175
 	int ret;
176 176
 
177
-    if(td_isutf8(buf, len)) {
177
+    if(td_isascii(buf, len)) {
178
+	cli_dbgmsg("Recognized ASCII text\n");
179
+	return CL_TYPE_TEXT_ASCII;
180
+    } else if(td_isutf8(buf, len)) {
181
+	cli_dbgmsg("Recognized UTF-8 character data\n");
178 182
 	return CL_TYPE_TEXT_UTF8;
179 183
     } else if((ret = td_isutf16(buf, len))) {
180
-	if(ret == 1)
181
-	    return CL_TYPE_TEXT_UTF16LE;
182
-	else
183
-	    return CL_TYPE_TEXT_UTF16BE;
184
-    } else if(td_isascii(buf, len)) {
185
-	return CL_TYPE_TEXT_ASCII;
184
+	cli_dbgmsg("Recognized %s character data\n", (ret == 1) ? "UTF-16LE" : "UTF-16BE");
185
+	return (ret == 1) ? CL_TYPE_TEXT_UTF16LE : CL_TYPE_TEXT_UTF16BE;
186 186
     } else {
187
+	cli_dbgmsg("Recognized binary data\n");
187 188
 	return CL_TYPE_BINARY_DATA;
188 189
     }
189 190
 }