git-svn: trunk@4835
Tomasz Kojm authored on 2009/02/19 06:55:25... | ... |
@@ -1,3 +1,7 @@ |
1 |
+Wed Feb 18 23:12:36 CET 2009 (tk) |
|
2 |
+--------------------------------- |
|
3 |
+ * libclamav/textdet.c: improve UTF-16 detection (bb#1209) |
|
4 |
+ |
|
1 | 5 |
Thu Feb 19 00:05:28 EET 2009 (edwin) |
2 | 6 |
------------------------------------ |
3 | 7 |
* clamd/server-th.c: move the command parsing, and stream handling |
... | ... |
@@ -45,6 +45,7 @@ |
45 | 45 |
|
46 | 46 |
#include "filetypes.h" |
47 | 47 |
#include "textdet.h" |
48 |
+#include "others.h" |
|
48 | 49 |
|
49 | 50 |
#define F 0 /* character never appears in text */ |
50 | 51 |
#define T 1 /* character appears in plain ASCII text */ |
... | ... |
@@ -141,7 +142,7 @@ static int td_isutf8(const unsigned char *buf, unsigned int len) |
141 | 141 |
|
142 | 142 |
static int td_isutf16(const unsigned char *buf, unsigned int len) |
143 | 143 |
{ |
144 |
- unsigned int be, i, c; |
|
144 |
+ unsigned int be = 1, nobom = 0, i, c, bad = 0; |
|
145 | 145 |
|
146 | 146 |
|
147 | 147 |
if(len < 2) |
... | ... |
@@ -152,7 +153,7 @@ static int td_isutf16(const unsigned char *buf, unsigned int len) |
152 | 152 |
else if(buf[0] == 0xfe && buf[1] == 0xff) |
153 | 153 |
be = 1; |
154 | 154 |
else |
155 |
- return 0; |
|
155 |
+ nobom = 1; |
|
156 | 156 |
|
157 | 157 |
for(i = 2; i + 1 < len; i += 2) { |
158 | 158 |
if(be) |
... | ... |
@@ -163,10 +164,17 @@ static int td_isutf16(const unsigned char *buf, unsigned int len) |
163 | 163 |
if(c == 0xfffe) |
164 | 164 |
return 0; |
165 | 165 |
|
166 |
- if(c < 128 && text_chars[c] != T) |
|
167 |
- return 0; |
|
166 |
+ if(c < 128 && text_chars[c] != T) { |
|
167 |
+ if(nobom) |
|
168 |
+ return 0; |
|
169 |
+ else |
|
170 |
+ bad++; |
|
171 |
+ } |
|
168 | 172 |
} |
169 | 173 |
|
174 |
+ if(!nobom && bad >= len / 2) |
|
175 |
+ return 0; |
|
176 |
+ |
|
170 | 177 |
return 1 + be; |
171 | 178 |
} |
172 | 179 |
|
... | ... |
@@ -174,16 +182,17 @@ cli_file_t cli_texttype(const unsigned char *buf, unsigned int len) |
174 | 174 |
{ |
175 | 175 |
int ret; |
176 | 176 |
|
177 |
- if(td_isutf8(buf, len)) { |
|
177 |
+ if(td_isascii(buf, len)) { |
|
178 |
+ cli_dbgmsg("Recognized ASCII text\n"); |
|
179 |
+ return CL_TYPE_TEXT_ASCII; |
|
180 |
+ } else if(td_isutf8(buf, len)) { |
|
181 |
+ cli_dbgmsg("Recognized UTF-8 character data\n"); |
|
178 | 182 |
return CL_TYPE_TEXT_UTF8; |
179 | 183 |
} else if((ret = td_isutf16(buf, len))) { |
180 |
- if(ret == 1) |
|
181 |
- return CL_TYPE_TEXT_UTF16LE; |
|
182 |
- else |
|
183 |
- return CL_TYPE_TEXT_UTF16BE; |
|
184 |
- } else if(td_isascii(buf, len)) { |
|
185 |
- return CL_TYPE_TEXT_ASCII; |
|
184 |
+ cli_dbgmsg("Recognized %s character data\n", (ret == 1) ? "UTF-16LE" : "UTF-16BE"); |
|
185 |
+ return (ret == 1) ? CL_TYPE_TEXT_UTF16LE : CL_TYPE_TEXT_UTF16BE; |
|
186 | 186 |
} else { |
187 |
+ cli_dbgmsg("Recognized binary data\n"); |
|
187 | 188 |
return CL_TYPE_BINARY_DATA; |
188 | 189 |
} |
189 | 190 |
} |