Browse code

Adding ascii file normalization option to sigtool.

Mickey Sola authored on 2015/07/08 05:46:19
Showing 6 changed files
... ...
@@ -103,6 +103,9 @@ CLAMAV_PRIVATE {
103 103
     cli_str2hex;
104 104
     cli_hashfile;
105 105
     cli_hashstream;
106
+    text_normalize_init;
107
+    text_normalize_reset;
108
+    text_normalize_map;
106 109
     html_normalise_map;
107 110
     cli_utf16toascii;
108 111
 
... ...
@@ -29,6 +29,7 @@
29 29
 #include <ctype.h>
30 30
 #include "clamav.h"
31 31
 #include "textnorm.h"
32
+#include "bignum_fast.h"
32 33
 
33 34
 int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
34 35
 {
... ...
@@ -55,6 +56,7 @@ enum normalize_action {
55 55
 	NORMALIZE_ADD_32
56 56
 };
57 57
 
58
+
58 59
 /* use shorter names in the table */
59 60
 #define IGN NORMALIZE_SKIP
60 61
 #define WSP NORMALIZE_AS_WHITESPACE
... ...
@@ -121,3 +123,38 @@ size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char
121 121
 	return i;
122 122
 }
123 123
 
124
+/* Normalizes the text in @fmap and stores the result in @state's buffer.
125
+ * Returns number of characters written to buffer. */
126
+size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
127
+{
128
+	const unsigned char *map_loc;
129
+	unsigned int map_pgsz;
130
+	uint64_t map_len;
131
+	size_t buff_len;
132
+	size_t acc;
133
+	size_t acc_total;
134
+	size_t acc_len;
135
+
136
+	map_len = map->len;
137
+	map_pgsz = map->pgsz;
138
+	buff_len = state->out_len;
139
+
140
+	acc_total = 0;
141
+	acc = 0;
142
+
143
+	while (1) {
144
+		/* Break out if we've reached the end of the map or our buffer. */
145
+		if(!(acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total))) break;
146
+
147
+		/* If map_loc is NULL, then there's nothing left to do but recover. */
148
+		if(!(map_loc = fmap_need_off_once(map, offset, acc_len))) break;
149
+		offset += acc_len;
150
+
151
+		/* If we didn't normalize anything, no need to update values, just break out. */
152
+		if(!(acc = text_normalize_buffer(state, map_loc, acc_len))) break;
153
+		acc_total += acc;
154
+	}
155
+
156
+	return acc_total;
157
+}
158
+
... ...
@@ -20,6 +20,11 @@
20 20
  *  MA 02110-1301, USA.
21 21
  */
22 22
 
23
+#ifndef __TEXTNORM_H
24
+#define __TEXTNORM_H
25
+
26
+#include "fmap.h"
27
+
23 28
 struct text_norm_state {
24 29
 	unsigned char *out;
25 30
 	size_t out_len;
... ...
@@ -27,6 +32,14 @@ struct text_norm_state {
27 27
 	int space_written;
28 28
 };
29 29
 
30
+#define ASCII_FILE_BUFF_LENGTH 131072
31
+#define MAX_ASCII_FILE_SIZE 20000000
32
+
33
+#define MIN_3(x,y,z) ((x)<(y) ? ((x)<(z)?(x):(z)) : ((y)<(z)?(y):(z)))
34
+
30 35
 int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len);
31 36
 void text_normalize_reset(struct text_norm_state* state);
32 37
 size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len);
38
+size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset);
39
+
40
+#endif
... ...
@@ -110,6 +110,7 @@ const struct clam_option __clam_options[] = {
110 110
     { NULL, "mdb", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_SIGTOOL, "", "" },
111 111
     { NULL, "print-certs", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" },
112 112
     { NULL, "html-normalise", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" },
113
+    { NULL, "ascii-normalise", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" },
113 114
     { NULL, "utf16-decode", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" },
114 115
     { NULL, "build", 'b', CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_SIGTOOL, "", "" },
115 116
     { NULL, "max-bad-sigs", 0, CLOPT_TYPE_NUMBER, MATCH_NUMBER, 3000, NULL, 0, OPT_SIGTOOL, "Maximum number of mismatched signatures when building a CVD. Zero disables this limit.", "3000" },
... ...
@@ -43,6 +43,8 @@
43 43
 #include <netinet/in.h>
44 44
 #include <arpa/inet.h>
45 45
 #include <sys/wait.h>
46
+#else
47
+#include "w32_stat.h"
46 48
 #endif
47 49
 #include <dirent.h>
48 50
 #include <ctype.h>
... ...
@@ -66,6 +68,7 @@
66 66
 #include "libclamav/str.h"
67 67
 #include "libclamav/ole2_extract.h"
68 68
 #include "libclamav/htmlnorm.h"
69
+#include "libclamav/textnorm.h"
69 70
 #include "libclamav/default.h"
70 71
 #include "libclamav/fmap.h"
71 72
 #include "libclamav/readdb.h"
... ...
@@ -223,6 +226,79 @@ static int htmlnorm(const struct optstruct *opts)
223 223
     return 0;
224 224
 }
225 225
 
226
+static int asciinorm(const struct optstruct *opts)
227
+{
228
+    const char *fname;
229
+    unsigned char *norm_buff;
230
+    struct text_norm_state state;
231
+    size_t map_off;
232
+    fmap_t *map; 
233
+    int fd, ofd;
234
+
235
+    fname = optget(opts, "ascii-normalise")->strarg;
236
+    fd = open(fname, O_RDONLY);
237
+
238
+    if (fd == -1) {
239
+	mprintf("!asciinorm: Can't open file %s\n", fname);
240
+	return -1;
241
+    }
242
+
243
+    if(!(norm_buff = malloc(ASCII_FILE_BUFF_LENGTH))) {
244
+	mprintf("!asciinorm: Can't allocate memory\n");
245
+	close(fd);
246
+	return -1;
247
+    }
248
+
249
+    if (!(map = fmap(fd, 0, 0))) {
250
+	mprintf("!fmap: Could not map fd %d\n", fd);
251
+	close(fd);
252
+	free(norm_buff);
253
+	return -1;
254
+    }
255
+
256
+    if (map->len > MAX_ASCII_FILE_SIZE) {
257
+	mprintf("!asciinorm: File size of %zu too large\n", map->len);
258
+	close(fd);
259
+	free(norm_buff);
260
+	funmap(map);
261
+	return -1;
262
+    }
263
+
264
+    ofd = open("./normalised_text", O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
265
+    if (ofd == -1) {
266
+	mprintf("!asciinorm: Can't open file ./normalised_text\n");
267
+	close(fd);
268
+	free(norm_buff);
269
+	funmap(map);
270
+	return -1;
271
+    }
272
+
273
+    text_normalize_init(&state, norm_buff, ASCII_FILE_BUFF_LENGTH);
274
+
275
+    map_off = 0;
276
+    while(map_off != map->len) {
277
+	    size_t written;
278
+	    if (!(written = text_normalize_map(&state, map, map_off))) break;
279
+	    map_off += written;
280
+ 
281
+	    if (write(ofd, norm_buff, state.out_pos) == -1) {
282
+		    mprintf("!asciinorm: Can't write to file ./normalised_text\n");
283
+		    close(fd);
284
+		    close(ofd);
285
+		    free(norm_buff);
286
+		    funmap(map);
287
+		    return -1;
288
+	    }
289
+	    text_normalize_reset(&state);
290
+    }
291
+
292
+    close(fd);
293
+    close(ofd);
294
+    free(norm_buff);
295
+    funmap(map);
296
+    return 0;
297
+}
298
+
226 299
 static int utf16decode(const struct optstruct *opts)
227 300
 {
228 301
 	const char *fname;
... ...
@@ -3012,6 +3088,7 @@ static void help(void)
3012 3012
     mprintf("                                           or SHA256 sigs for FILES\n");
3013 3013
     mprintf("    --mdb [FILES]                          generate .mdb sigs\n");
3014 3014
     mprintf("    --html-normalise=FILE                  create normalised parts of HTML file\n");
3015
+    mprintf("    --ascii-normalise=FILE                 create normalised text file from ascii source\n");
3015 3016
     mprintf("    --utf16-decode=FILE                    decode UTF16 encoded files\n");
3016 3017
     mprintf("    --info=FILE            -i FILE         print database information\n");
3017 3018
     mprintf("    --build=NAME [cvd] -b NAME             build a CVD file\n");
... ...
@@ -3106,6 +3183,8 @@ int main(int argc, char **argv)
3106 3106
 	ret = hashsig(opts, 1, 1);
3107 3107
     else if(optget(opts, "html-normalise")->enabled)
3108 3108
 	ret = htmlnorm(opts);
3109
+    else if(optget(opts, "ascii-normalise")->enabled)
3110
+	ret = asciinorm(opts);
3109 3111
     else if(optget(opts, "utf16-decode")->enabled)
3110 3112
 	ret = utf16decode(opts);
3111 3113
     else if(optget(opts, "build")->enabled)
... ...
@@ -170,6 +170,9 @@ EXPORTS cli_pcre_freeoff @44373 NONAME
170 170
 EXPORTS cli_pcre_recaloff @44374 NONAME
171 171
 EXPORTS cli_pcre_perf_events_destroy @44375 NONAME
172 172
 EXPORTS cli_pcre_perf_print @44376 NONAME
173
+EXPORTS text_normalize_init @44377 NONAME
174
+EXPORTS text_normalize_reset @44378 NONAME
175
+EXPORTS text_normalize_map @44379 NONAME
173 176
 
174 177
 ; compatibility layer, tommath, zlib
175 178
 EXPORTS w32_srand @44269 NONAME