... | ... |
@@ -29,6 +29,7 @@ |
29 | 29 |
#include <ctype.h> |
30 | 30 |
#include "clamav.h" |
31 | 31 |
#include "textnorm.h" |
32 |
+#include "bignum_fast.h" |
|
32 | 33 |
|
33 | 34 |
int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len) |
34 | 35 |
{ |
... | ... |
@@ -55,6 +56,7 @@ enum normalize_action { |
55 | 55 |
NORMALIZE_ADD_32 |
56 | 56 |
}; |
57 | 57 |
|
58 |
+ |
|
58 | 59 |
/* use shorter names in the table */ |
59 | 60 |
#define IGN NORMALIZE_SKIP |
60 | 61 |
#define WSP NORMALIZE_AS_WHITESPACE |
... | ... |
@@ -121,3 +123,38 @@ size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char |
121 | 121 |
return i; |
122 | 122 |
} |
123 | 123 |
|
124 |
+/* Normalizes the text in @fmap and stores the result in @state's buffer. |
|
125 |
+ * Returns number of characters written to buffer. */ |
|
126 |
+size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset) |
|
127 |
+{ |
|
128 |
+ const unsigned char *map_loc; |
|
129 |
+ unsigned int map_pgsz; |
|
130 |
+ uint64_t map_len; |
|
131 |
+ size_t buff_len; |
|
132 |
+ size_t acc; |
|
133 |
+ size_t acc_total; |
|
134 |
+ size_t acc_len; |
|
135 |
+ |
|
136 |
+ map_len = map->len; |
|
137 |
+ map_pgsz = map->pgsz; |
|
138 |
+ buff_len = state->out_len; |
|
139 |
+ |
|
140 |
+ acc_total = 0; |
|
141 |
+ acc = 0; |
|
142 |
+ |
|
143 |
+ while (1) { |
|
144 |
+ /* Break out if we've reached the end of the map or our buffer. */ |
|
145 |
+ if(!(acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total))) break; |
|
146 |
+ |
|
147 |
+ /* If map_loc is NULL, then there's nothing left to do but recover. */ |
|
148 |
+ if(!(map_loc = fmap_need_off_once(map, offset, acc_len))) break; |
|
149 |
+ offset += acc_len; |
|
150 |
+ |
|
151 |
+ /* If we didn't normalize anything, no need to update values, just break out. */ |
|
152 |
+ if(!(acc = text_normalize_buffer(state, map_loc, acc_len))) break; |
|
153 |
+ acc_total += acc; |
|
154 |
+ } |
|
155 |
+ |
|
156 |
+ return acc_total; |
|
157 |
+} |
|
158 |
+ |
... | ... |
@@ -20,6 +20,11 @@ |
20 | 20 |
* MA 02110-1301, USA. |
21 | 21 |
*/ |
22 | 22 |
|
23 |
+#ifndef __TEXTNORM_H |
|
24 |
+#define __TEXTNORM_H |
|
25 |
+ |
|
26 |
+#include "fmap.h" |
|
27 |
+ |
|
23 | 28 |
struct text_norm_state { |
24 | 29 |
unsigned char *out; |
25 | 30 |
size_t out_len; |
... | ... |
@@ -27,6 +32,14 @@ struct text_norm_state { |
27 | 27 |
int space_written; |
28 | 28 |
}; |
29 | 29 |
|
30 |
+#define ASCII_FILE_BUFF_LENGTH 131072 |
|
31 |
+#define MAX_ASCII_FILE_SIZE 20000000 |
|
32 |
+ |
|
33 |
+#define MIN_3(x,y,z) ((x)<(y) ? ((x)<(z)?(x):(z)) : ((y)<(z)?(y):(z))) |
|
34 |
+ |
|
30 | 35 |
int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len); |
31 | 36 |
void text_normalize_reset(struct text_norm_state* state); |
32 | 37 |
size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len); |
38 |
+size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset); |
|
39 |
+ |
|
40 |
+#endif |
... | ... |
@@ -110,6 +110,7 @@ const struct clam_option __clam_options[] = { |
110 | 110 |
{ NULL, "mdb", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_SIGTOOL, "", "" }, |
111 | 111 |
{ NULL, "print-certs", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" }, |
112 | 112 |
{ NULL, "html-normalise", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" }, |
113 |
+ { NULL, "ascii-normalise", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" }, |
|
113 | 114 |
{ NULL, "utf16-decode", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" }, |
114 | 115 |
{ NULL, "build", 'b', CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" }, |
115 | 116 |
{ NULL, "max-bad-sigs", 0, CLOPT_TYPE_NUMBER, MATCH_NUMBER, 3000, NULL, 0, OPT_SIGTOOL, "Maximum number of mismatched signatures when building a CVD. Zero disables this limit.", "3000" }, |
... | ... |
@@ -43,6 +43,8 @@ |
43 | 43 |
#include <netinet/in.h> |
44 | 44 |
#include <arpa/inet.h> |
45 | 45 |
#include <sys/wait.h> |
46 |
+#else |
|
47 |
+#include "w32_stat.h" |
|
46 | 48 |
#endif |
47 | 49 |
#include <dirent.h> |
48 | 50 |
#include <ctype.h> |
... | ... |
@@ -66,6 +68,7 @@ |
66 | 66 |
#include "libclamav/str.h" |
67 | 67 |
#include "libclamav/ole2_extract.h" |
68 | 68 |
#include "libclamav/htmlnorm.h" |
69 |
+#include "libclamav/textnorm.h" |
|
69 | 70 |
#include "libclamav/default.h" |
70 | 71 |
#include "libclamav/fmap.h" |
71 | 72 |
#include "libclamav/readdb.h" |
... | ... |
@@ -223,6 +226,79 @@ static int htmlnorm(const struct optstruct *opts) |
223 | 223 |
return 0; |
224 | 224 |
} |
225 | 225 |
|
226 |
+static int asciinorm(const struct optstruct *opts) |
|
227 |
+{ |
|
228 |
+ const char *fname; |
|
229 |
+ unsigned char *norm_buff; |
|
230 |
+ struct text_norm_state state; |
|
231 |
+ size_t map_off; |
|
232 |
+ fmap_t *map; |
|
233 |
+ int fd, ofd; |
|
234 |
+ |
|
235 |
+ fname = optget(opts, "ascii-normalise")->strarg; |
|
236 |
+ fd = open(fname, O_RDONLY); |
|
237 |
+ |
|
238 |
+ if (fd == -1) { |
|
239 |
+ mprintf("!asciinorm: Can't open file %s\n", fname); |
|
240 |
+ return -1; |
|
241 |
+ } |
|
242 |
+ |
|
243 |
+ if(!(norm_buff = malloc(ASCII_FILE_BUFF_LENGTH))) { |
|
244 |
+ mprintf("!asciinorm: Can't allocate memory\n"); |
|
245 |
+ close(fd); |
|
246 |
+ return -1; |
|
247 |
+ } |
|
248 |
+ |
|
249 |
+ if (!(map = fmap(fd, 0, 0))) { |
|
250 |
+ mprintf("!fmap: Could not map fd %d\n", fd); |
|
251 |
+ close(fd); |
|
252 |
+ free(norm_buff); |
|
253 |
+ return -1; |
|
254 |
+ } |
|
255 |
+ |
|
256 |
+ if (map->len > MAX_ASCII_FILE_SIZE) { |
|
257 |
+ mprintf("!asciinorm: File size of %zu too large\n", map->len); |
|
258 |
+ close(fd); |
|
259 |
+ free(norm_buff); |
|
260 |
+ funmap(map); |
|
261 |
+ return -1; |
|
262 |
+ } |
|
263 |
+ |
|
264 |
+ ofd = open("./normalised_text", O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); |
|
265 |
+ if (ofd == -1) { |
|
266 |
+ mprintf("!asciinorm: Can't open file ./normalised_text\n"); |
|
267 |
+ close(fd); |
|
268 |
+ free(norm_buff); |
|
269 |
+ funmap(map); |
|
270 |
+ return -1; |
|
271 |
+ } |
|
272 |
+ |
|
273 |
+ text_normalize_init(&state, norm_buff, ASCII_FILE_BUFF_LENGTH); |
|
274 |
+ |
|
275 |
+ map_off = 0; |
|
276 |
+ while(map_off != map->len) { |
|
277 |
+ size_t written; |
|
278 |
+ if (!(written = text_normalize_map(&state, map, map_off))) break; |
|
279 |
+ map_off += written; |
|
280 |
+ |
|
281 |
+ if (write(ofd, norm_buff, state.out_pos) == -1) { |
|
282 |
+ mprintf("!asciinorm: Can't write to file ./normalised_text\n"); |
|
283 |
+ close(fd); |
|
284 |
+ close(ofd); |
|
285 |
+ free(norm_buff); |
|
286 |
+ funmap(map); |
|
287 |
+ return -1; |
|
288 |
+ } |
|
289 |
+ text_normalize_reset(&state); |
|
290 |
+ } |
|
291 |
+ |
|
292 |
+ close(fd); |
|
293 |
+ close(ofd); |
|
294 |
+ free(norm_buff); |
|
295 |
+ funmap(map); |
|
296 |
+ return 0; |
|
297 |
+} |
|
298 |
+ |
|
226 | 299 |
static int utf16decode(const struct optstruct *opts) |
227 | 300 |
{ |
228 | 301 |
const char *fname; |
... | ... |
@@ -3012,6 +3088,7 @@ static void help(void) |
3012 | 3012 |
mprintf(" or SHA256 sigs for FILES\n"); |
3013 | 3013 |
mprintf(" --mdb [FILES] generate .mdb sigs\n"); |
3014 | 3014 |
mprintf(" --html-normalise=FILE create normalised parts of HTML file\n"); |
3015 |
+ mprintf(" --ascii-normalise=FILE create normalised text file from ascii source\n"); |
|
3015 | 3016 |
mprintf(" --utf16-decode=FILE decode UTF16 encoded files\n"); |
3016 | 3017 |
mprintf(" --info=FILE -i FILE print database information\n"); |
3017 | 3018 |
mprintf(" --build=NAME [cvd] -b NAME build a CVD file\n"); |
... | ... |
@@ -3106,6 +3183,8 @@ int main(int argc, char **argv) |
3106 | 3106 |
ret = hashsig(opts, 1, 1); |
3107 | 3107 |
else if(optget(opts, "html-normalise")->enabled) |
3108 | 3108 |
ret = htmlnorm(opts); |
3109 |
+ else if(optget(opts, "ascii-normalise")->enabled) |
|
3110 |
+ ret = asciinorm(opts); |
|
3109 | 3111 |
else if(optget(opts, "utf16-decode")->enabled) |
3110 | 3112 |
ret = utf16decode(opts); |
3111 | 3113 |
else if(optget(opts, "build")->enabled) |
... | ... |
@@ -170,6 +170,9 @@ EXPORTS cli_pcre_freeoff @44373 NONAME |
170 | 170 |
EXPORTS cli_pcre_recaloff @44374 NONAME |
171 | 171 |
EXPORTS cli_pcre_perf_events_destroy @44375 NONAME |
172 | 172 |
EXPORTS cli_pcre_perf_print @44376 NONAME |
173 |
+EXPORTS text_normalize_init @44377 NONAME |
|
174 |
+EXPORTS text_normalize_reset @44378 NONAME |
|
175 |
+EXPORTS text_normalize_map @44379 NONAME |
|
173 | 176 |
|
174 | 177 |
; compatibility layer, tommath, zlib |
175 | 178 |
EXPORTS w32_srand @44269 NONAME |