Browse code

Merge branch 'prefiltering4'

* prefiltering4:
fix build
move matching code to matcher_run.
matcher-ac: move leaf checks inside IS_FINAL.
Prepare for prefiltering: add new files.

Török Edvin authored on 2010/02/10 03:35:35
Showing 13 changed files
... ...
@@ -321,8 +321,12 @@ libclamav_la_SOURCES = \
321 321
 	version.h\
322 322
 	mpool.c\
323 323
 	mpool.h \
324
+	filtering.h\
325
+	filtering.c\
324 326
 	fmap.c \
325 327
 	fmap.h \
328
+	perflogging.c\
329
+	perflogging.h\
326 330
 	default.h\
327 331
 	sha256.c\
328 332
 	sha256.h\
... ...
@@ -148,12 +148,13 @@ am__libclamav_la_SOURCES_DIST = clamav.h matcher-ac.c matcher-ac.h \
148 148
 	7z/Archive/7z/7zExtract.h explode.c explode.h textnorm.c \
149 149
 	textnorm.h dlp.c dlp.h jsparse/js-norm.c jsparse/js-norm.h \
150 150
 	jsparse/lexglobal.h jsparse/textbuf.h uniq.c uniq.h version.c \
151
-	version.h mpool.c mpool.h fmap.c fmap.h default.h sha256.c \
152
-	sha256.h bignum.h bytecode.c bytecode.h bytecode_vm.c \
153
-	bytecode_priv.h clambc.h cpio.c cpio.h macho.c macho.h \
154
-	ishield.c ishield.h type_desc.h bcfeatures.h bytecode_api.c \
155
-	bytecode_api_decl.c bytecode_api.h bytecode_api_impl.h \
156
-	bytecode_hooks.h cache.c cache.h bignum.c bignum_class.h
151
+	version.h mpool.c mpool.h filtering.h filtering.c fmap.c \
152
+	fmap.h perflogging.c perflogging.h default.h sha256.c sha256.h \
153
+	bignum.h bytecode.c bytecode.h bytecode_vm.c bytecode_priv.h \
154
+	clambc.h cpio.c cpio.h macho.c macho.h ishield.c ishield.h \
155
+	type_desc.h bcfeatures.h bytecode_api.c bytecode_api_decl.c \
156
+	bytecode_api.h bytecode_api_impl.h bytecode_hooks.h cache.c \
157
+	cache.h bignum.c bignum_class.h
157 158
 @LINK_TOMMATH_FALSE@am__objects_1 = libclamav_la-bignum.lo
158 159
 am_libclamav_la_OBJECTS = libclamav_la-matcher-ac.lo \
159 160
 	libclamav_la-matcher-bm.lo libclamav_la-matcher.lo \
... ...
@@ -197,7 +198,8 @@ am_libclamav_la_OBJECTS = libclamav_la-matcher-ac.lo \
197 197
 	libclamav_la-explode.lo libclamav_la-textnorm.lo \
198 198
 	libclamav_la-dlp.lo libclamav_la-js-norm.lo \
199 199
 	libclamav_la-uniq.lo libclamav_la-version.lo \
200
-	libclamav_la-mpool.lo libclamav_la-fmap.lo \
200
+	libclamav_la-mpool.lo libclamav_la-filtering.lo \
201
+	libclamav_la-fmap.lo libclamav_la-perflogging.lo \
201 202
 	libclamav_la-sha256.lo libclamav_la-bytecode.lo \
202 203
 	libclamav_la-bytecode_vm.lo libclamav_la-cpio.lo \
203 204
 	libclamav_la-macho.lo libclamav_la-ishield.lo \
... ...
@@ -644,12 +646,13 @@ libclamav_la_SOURCES = clamav.h matcher-ac.c matcher-ac.h matcher-bm.c \
644 644
 	7z/Archive/7z/7zExtract.h explode.c explode.h textnorm.c \
645 645
 	textnorm.h dlp.c dlp.h jsparse/js-norm.c jsparse/js-norm.h \
646 646
 	jsparse/lexglobal.h jsparse/textbuf.h uniq.c uniq.h version.c \
647
-	version.h mpool.c mpool.h fmap.c fmap.h default.h sha256.c \
648
-	sha256.h bignum.h bytecode.c bytecode.h bytecode_vm.c \
649
-	bytecode_priv.h clambc.h cpio.c cpio.h macho.c macho.h \
650
-	ishield.c ishield.h type_desc.h bcfeatures.h bytecode_api.c \
651
-	bytecode_api_decl.c bytecode_api.h bytecode_api_impl.h \
652
-	bytecode_hooks.h cache.c cache.h $(am__append_7)
647
+	version.h mpool.c mpool.h filtering.h filtering.c fmap.c \
648
+	fmap.h perflogging.c perflogging.h default.h sha256.c sha256.h \
649
+	bignum.h bytecode.c bytecode.h bytecode_vm.c bytecode_priv.h \
650
+	clambc.h cpio.c cpio.h macho.c macho.h ishield.c ishield.h \
651
+	type_desc.h bcfeatures.h bytecode_api.c bytecode_api_decl.c \
652
+	bytecode_api.h bytecode_api_impl.h bytecode_hooks.h cache.c \
653
+	cache.h $(am__append_7)
653 654
 noinst_LTLIBRARIES = libclamav_internal_utils.la libclamav_internal_utils_nothreads.la libclamav_nocxx.la
654 655
 COMMON_CLEANFILES = version.h version.h.tmp *.gcda *.gcno
655 656
 @MAINTAINER_MODE_TRUE@BUILT_SOURCES = jsparse/generated/operators.h jsparse/generated/keywords.h jsparse-keywords.gperf
... ...
@@ -806,6 +809,7 @@ distclean-compile:
806 806
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-entconv.Plo@am__quote@
807 807
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-explode.Plo@am__quote@
808 808
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-filetypes.Plo@am__quote@
809
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-filtering.Plo@am__quote@
809 810
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-fmap.Plo@am__quote@
810 811
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-fsg.Plo@am__quote@
811 812
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-hashtab.Plo@am__quote@
... ...
@@ -834,6 +838,7 @@ distclean-compile:
834 834
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pdf.Plo@am__quote@
835 835
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pe.Plo@am__quote@
836 836
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pe_icons.Plo@am__quote@
837
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-perflogging.Plo@am__quote@
837 838
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-petite.Plo@am__quote@
838 839
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-phish_domaincheck_db.Plo@am__quote@
839 840
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-phish_whitelist.Plo@am__quote@
... ...
@@ -1579,6 +1584,14 @@ libclamav_la-mpool.lo: mpool.c
1579 1579
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
1580 1580
 @am__fastdepCC_FALSE@	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-mpool.lo `test -f 'mpool.c' || echo '$(srcdir)/'`mpool.c
1581 1581
 
1582
+libclamav_la-filtering.lo: filtering.c
1583
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-filtering.lo -MD -MP -MF $(DEPDIR)/libclamav_la-filtering.Tpo -c -o libclamav_la-filtering.lo `test -f 'filtering.c' || echo '$(srcdir)/'`filtering.c
1584
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-filtering.Tpo $(DEPDIR)/libclamav_la-filtering.Plo
1585
+@am__fastdepCC_FALSE@	$(AM_V_CC) @AM_BACKSLASH@
1586
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='filtering.c' object='libclamav_la-filtering.lo' libtool=yes @AMDEPBACKSLASH@
1587
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
1588
+@am__fastdepCC_FALSE@	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-filtering.lo `test -f 'filtering.c' || echo '$(srcdir)/'`filtering.c
1589
+
1582 1590
 libclamav_la-fmap.lo: fmap.c
1583 1591
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-fmap.lo -MD -MP -MF $(DEPDIR)/libclamav_la-fmap.Tpo -c -o libclamav_la-fmap.lo `test -f 'fmap.c' || echo '$(srcdir)/'`fmap.c
1584 1592
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-fmap.Tpo $(DEPDIR)/libclamav_la-fmap.Plo
... ...
@@ -1587,6 +1600,14 @@ libclamav_la-fmap.lo: fmap.c
1587 1587
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
1588 1588
 @am__fastdepCC_FALSE@	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-fmap.lo `test -f 'fmap.c' || echo '$(srcdir)/'`fmap.c
1589 1589
 
1590
+libclamav_la-perflogging.lo: perflogging.c
1591
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-perflogging.lo -MD -MP -MF $(DEPDIR)/libclamav_la-perflogging.Tpo -c -o libclamav_la-perflogging.lo `test -f 'perflogging.c' || echo '$(srcdir)/'`perflogging.c
1592
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-perflogging.Tpo $(DEPDIR)/libclamav_la-perflogging.Plo
1593
+@am__fastdepCC_FALSE@	$(AM_V_CC) @AM_BACKSLASH@
1594
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='perflogging.c' object='libclamav_la-perflogging.lo' libtool=yes @AMDEPBACKSLASH@
1595
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
1596
+@am__fastdepCC_FALSE@	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-perflogging.lo `test -f 'perflogging.c' || echo '$(srcdir)/'`perflogging.c
1597
+
1590 1598
 libclamav_la-sha256.lo: sha256.c
1591 1599
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-sha256.lo -MD -MP -MF $(DEPDIR)/libclamav_la-sha256.Tpo -c -o libclamav_la-sha256.lo `test -f 'sha256.c' || echo '$(srcdir)/'`sha256.c
1592 1600
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-sha256.Tpo $(DEPDIR)/libclamav_la-sha256.Plo
1593 1601
new file mode 100644
... ...
@@ -0,0 +1,749 @@
0
+/*
1
+ *  A fast filter for static patterns.
2
+ *
3
+ *  Copyright (C) 2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Török Edvin
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+#if HAVE_CONFIG_H
22
+#include "clamav-config.h"
23
+#endif
24
+#include "filtering.h"
25
+#include "matcher-ac.h"
26
+#include <string.h>
27
+#include <assert.h>
28
+#include "perflogging.h"
29
+/* ----- shift-or filtering -------------- */
30
+
31
+/*
32
+ * Description of algorithm:
33
+ *
34
+ * Multiple patterns are added to the filter.
35
+ * The filter retains an approximation of these patterns, which can lead to
36
+ * false positive matches, but not false negative matches.
37
+ *
38
+ * For each position in the filter we retain what qgrams can match at that
39
+ * position, for example (if we'd use characters as qgrams):
40
+ * pattern1: atu
41
+ * pattern2: bzf
42
+ * pattern3: xat
43
+ * 
44
+ * filter accepts:
45
+ * [abx][tza][uft]
46
+ *
47
+ * But it also accepts (false positives):
48
+ * azu, azf, azt, ...
49
+ *
50
+ * It doesn't however accept:
51
+ * aaa, atz, ...
52
+ *
53
+ * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=8) states,
54
+ * each active bit meaning that a state is active.
55
+ * 
56
+ * The states are activated sequentially, each transition decision is made
57
+ * considering if we can accept the character at position X. 
58
+ * Since we can start a match at any position, position 0 is
59
+ * reactivated each time.
60
+ * When the last position is activated, the filter reports a match.
61
+ * If we can't accept the character at position X, the state remains inactive,
62
+ * and further states aren't activated (unless we activate this state in the
63
+ * future).
64
+ *
65
+ * Essentially this is an automaton like this:
66
+ *
67
+ *  /\    (a|b|x)        (t|z|a)        (u|f|t)
68
+ * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
69
+ *  \_______________/             |               
70
+ *  \_____________________________/               
71
+ *
72
+ *
73
+ * But we are tracking multiple active states at each time (or run N automatons
74
+ * in parallel if you like, N = number of states).
75
+ *
76
+ * We can have S3 and S2 active, meaning that if the next character is
77
+ * acceptable, it transitions to S1,S3 and S4 being active, otherwise it
78
+ * transitions to S1 being active.
79
+ *
80
+ * Active states can either be represented as a binary 1 or 0, and using
81
+ * bit-shifting and masking.
82
+ * If we choose 1, we must use &, and after shifting always reactivate bit 0.
83
+ * If we choose 0, we must use |, and after shifting we don't need to do
84
+ * anything (since by shifting a 0 is implicitly introduced).
85
+ *
86
+ * This file implements the latter (shift-or) method.
87
+ *
88
+ * The discussion above considered patterns to be of the same length (or truncated to
89
+ * be so). In reality patterns are of variable length, and we often have short
90
+ * patterns.
91
+ *
92
+ * Thus another bitmap was introduced, meaning that if (end[Q] == set), then
93
+ * a pattern can end at this position.
94
+ * Also we would fill the pattern's position filters quite quickly with only 256
95
+ * choices for a position, so the algorithm uses overlapping qgrams of length 2:
96
+ * 'abcd' is 3 qgrams: 'ab','bc','cd'
97
+ *
98
+ * The algorithm is very sensitive to the end[Q] filter, since it can have false
99
+ * positives due to short patterns!
100
+ * For optimal performance we need:
101
+ *   - patterns as long as possible
102
+ *   - low probability for end[Q] to match (avoid 0000 and other common cases)
103
+ *   - choose the most "diverse" subset from a long pattern
104
+ *
105
+ * diverse = referring to what we are scanning, so that the filter rarely
106
+ * matches, so this actually means that we *want* to avoid adding more
107
+ * characters to the filter, if we have 2 patterns:
108
+ * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
109
+ * don't add new characters at the beginning.
110
+ *
111
+ * With NDB signatures there are more challenges to overcome:
112
+ *    e8??0000000aa
113
+ *
114
+ *    will make the filter accept:
115
+ *    e8<all-256-values-here>, <all-256-values>00, ... 000000aa
116
+ *
117
+ *    We should delay the pattern end as long as possible, especially if it is  0000
118
+ *    The problem is that now the filter accepts 0000 on position 3, regardless
119
+ *    of what we have on position 1 (even if we have something else than e8), so
120
+ *    we have to be very careful not to allow 0000 on first position too,
121
+ *    otherwise the filter will happily accept 000000000000.
122
+ *
123
+ * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
124
+ * based (fits L2), both must match for the filter to consider it a match.   
125
+ *
126
+ *
127
+ */
128
+
129
+#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
130
+#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
131
+
132
+void filter_init(struct filter *m)
133
+{
134
+	memset(m->B, ~0, sizeof(m->B));
135
+	memset(m->end, ~0, sizeof(m->end));
136
+}
137
+
138
+/* because the filter state is an 8-bit mask (uint8_t) */
139
+#define MAXSOPATLEN 8
140
+
141
+static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
142
+{
143
+	return !(m->B[val] & (1<<pos));
144
+}
145
+
146
+static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
147
+{
148
+	if (!filter_isset(m, pos, val)) {
149
+		cli_perf_log_count(FILTER_LOAD, pos);
150
+		m->B[val] &= ~(1<<pos);
151
+	}
152
+}
153
+
154
+
155
+static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
156
+{
157
+	return !(m->end[a] & (1<<pos));
158
+}
159
+
160
+static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
161
+{
162
+	if (!filter_end_isset(m, pos, a)) {
163
+		cli_perf_log_count(FILTER_END_LOAD, pos);
164
+		m->end[a] &= ~(1 << pos);
165
+	}
166
+}
167
+#define MAX_CHOICES 8
168
+/* just an arbitrary limit, if patterns are longer, we cut
169
+ * the filter can only use MAXSOPATLEN (8) characters,
170
+ * this longer buffer is needed so that we can choose the "best" subpattern from
171
+ * it */
172
+#define MAXPATLEN 255
173
+
174
+/* merge another pattern into the filter
175
+ * add('abc'); add('bcd'); will match [ab][bc][cd] */
176
+int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
177
+{
178
+	uint16_t q;
179
+	uint8_t j, maxlen;
180
+	uint32_t best = 0xffffffff;
181
+	uint8_t best_pos = 0;
182
+
183
+	cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
184
+	/* TODO: choose best among MAX_CHOICES */
185
+	/* cut length */
186
+	if(len > MAXPATLEN) {
187
+		len = MAXPATLEN;
188
+	}
189
+	if(len < 2)
190
+		return -1;
191
+
192
+	/* we want subsigs to be as long as possible */
193
+	if (len > 4) {
194
+		maxlen = len - 4;
195
+		if (maxlen == 1) maxlen = 2;
196
+	} else
197
+		maxlen = 2;
198
+	for(j=0;(best < 100 && j<MAX_CHOICES) || (j < maxlen) ;j++) {
199
+		uint32_t num = MAXSOPATLEN;
200
+		uint8_t k;
201
+		if (j+2 > len)
202
+			break;
203
+		for(k=j;k<len-1 && (k-j < MAXSOPATLEN);k++) {
204
+			q = cli_readint16( &pattern[k] );
205
+			/* we want to favor subsigs that add as little as
206
+			 * possible to the filter */
207
+			num += filter_isset(m, k-j, q) ? 0 : MAXSOPATLEN - (k-j);
208
+			if ((k == j || k == j+1) && (q == 0x0000 || q == 0xffff))
209
+				num += k==j ?  10000 : 1000;/* bad */
210
+		}
211
+		/* it is very important to keep the end set small */
212
+		num += 10*(filter_end_isset(m, k-j-1, q) ? 0 : 1);
213
+		/* it is very important to have signatures as long as possible
214
+		 * */
215
+		num += 5*(MAXSOPATLEN - (k-j));
216
+		/* if we are lower length than threshold penalize */
217
+		if (k-j+1 < 4)
218
+			num += 200;
219
+		/* favour longer patterns */
220
+		num -= (2*MAXSOPATLEN - (k + 1+j))*(k-j)/2;
221
+
222
+		if (num < best) {
223
+			best = num;
224
+			best_pos = j;
225
+		}
226
+	}
227
+
228
+	assert(best_pos < len-1);
229
+	if (pattern[best_pos] == 0 && pattern[best_pos+1] == 0) {
230
+		cli_warnmsg("filter: subsignature begins with zero (static): %s\n", name);
231
+	}
232
+	pattern += best_pos;
233
+	len -= best_pos;
234
+	/* cut length */
235
+	if(len > MAXSOPATLEN) {
236
+		len = MAXSOPATLEN;
237
+	}
238
+	/* Shift-Or like preprocessing */
239
+	for(j=0;j < len-1;j++) {
240
+		/* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
241
+		q = cli_readint16( &pattern[j] );
242
+		filter_set_atpos(m, j, q);
243
+	}
244
+	/* we use variable length patterns, use last character to mark pattern end,
245
+	 * can lead to false positives.*/
246
+	/* mark that at state j, the q-gram q can end the pattern */
247
+	if(j) {
248
+		j--;
249
+		filter_set_end(m, j, q);
250
+	}
251
+	return j+2;
252
+}
253
+
254
+struct char_spec {
255
+	/* if non-null i-th character = alt[start + step*i]; start+step*i < end;
256
+	 */
257
+	struct cli_ac_special *alt;
258
+	uint8_t start;
259
+	uint8_t end;
260
+	uint8_t step;
261
+};
262
+
263
+static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
264
+{
265
+	const struct cli_ac_special *alt = spec->alt;
266
+	if (alt) {
267
+		assert (alt->type == 1);
268
+		assert (i < alt->num);
269
+		return alt->str[i];
270
+	}
271
+	return i;
272
+}
273
+
274
+static const struct char_spec full_range = {NULL, 0,0xff,1};
275
+
276
+static inline int spec_is_fullrange(const struct char_spec *spec0, const struct char_spec *spec1)
277
+{
278
+	return !memcmp(spec0, &full_range, sizeof(full_range)) &&
279
+	       !memcmp(spec1, &full_range, sizeof(full_range));
280
+}
281
+
282
+
283
+#ifndef MIN
284
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
285
+#endif
286
+
287
+enum badness {
288
+	reject,
289
+	/* try to avoid if possible */
290
+	avoid_first,
291
+	avoid_anywhere, /* includes avoid_first! */
292
+	/* not that bad, but still not best */
293
+	dontlike,
294
+	accept,
295
+	like
296
+};
297
+static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
298
+{
299
+	int32_t base;
300
+	unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
301
+	switch (badness) {
302
+		case reject:
303
+			/* not reached */
304
+			assert(0);
305
+			base = -0x7fffff;
306
+			break;
307
+		case avoid_first:
308
+			if (!i)
309
+				base = -0x700000;
310
+			else
311
+				base = 0;
312
+			break;
313
+		case avoid_anywhere:
314
+			if (!i)
315
+				base = -0x720000;
316
+			else
317
+				base = -0x1000;
318
+			break;
319
+		case dontlike:
320
+			base = 0;
321
+			break;
322
+		case accept:
323
+			base = 0x200;
324
+			break;
325
+		case like:
326
+			/* a bit better only */
327
+			base = 0x201;
328
+			break;
329
+	}
330
+	if (base < 0) {
331
+		*score = base;
332
+		*score_end = base;
333
+		return;
334
+	}
335
+	/* at most 256 iterations here, otherwise base would be negative */
336
+	for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
337
+		for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
338
+			unsigned char c0 = spec_ith_char(spec0, k0);
339
+			unsigned char c1 = spec_ith_char(spec1, k1);
340
+			uint16_t a = c0 | (c1<<8);
341
+			num_introduced += filter_isset(m, i, a);
342
+			num_end_introduced += filter_end_isset(m, i, a);
343
+		}
344
+	}
345
+	*score = base - num_introduced;
346
+	*score_end = base - num_end_introduced;
347
+	if (badness == avoid_first && i) {
348
+		/* what is bad to begin with, is bad at end too */
349
+		*score_end -= 0x1000;
350
+	}
351
+}
352
+
353
+struct choice {
354
+	enum badness base;
355
+	unsigned begin;
356
+	unsigned len;
357
+};
358
+
359
+static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
360
+{
361
+	struct choice *choice;
362
+	int i_neg = -1;
363
+	assert(ie < MAXPATLEN);
364
+	if (ie < i+1)
365
+		return;
366
+	if (*cnt >= MAX_CHOICES)
367
+		return;
368
+	if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
369
+		unsigned j;
370
+		/* replace very bad picks if we're full */
371
+		for (j=0;j<*cnt;j++) {
372
+			if (choices[j].base < badness) {
373
+				if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
374
+					i_neg = j;
375
+				}
376
+			}
377
+		}
378
+	}
379
+	if (i_neg != -1) {
380
+		choice = &choices[i_neg];
381
+	} else {
382
+		choice = &choices[(*cnt)++];
383
+	}
384
+	choice->begin = i;
385
+	choice->len = ie - i + 1;
386
+	choice->base = badness;
387
+}
388
+
389
+static inline int32_t spec_iter(const struct char_spec *spec)
390
+{
391
+	assert(spec->step);
392
+	return (1 + spec->end - spec->start)/spec->step;
393
+}
394
+
395
+int  filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
396
+{
397
+	unsigned i, j = 0, stop = 0, l=0;
398
+	uint16_t k0, k1;
399
+
400
+	struct char_spec chars[MAXPATLEN];
401
+	enum badness char_badness[MAXPATLEN];
402
+	unsigned char patc[MAXPATLEN];
403
+	unsigned altcnt = 0;
404
+	int32_t best_score = -0x7fffffff;
405
+	unsigned best_score_i = 0;
406
+	unsigned best_score_len = 0;
407
+	struct char_spec *spec0, *spec1;
408
+
409
+	struct choice choices[MAX_CHOICES];
410
+	unsigned choices_cnt = 0;
411
+	unsigned prefix_len = pat->prefix_length;
412
+
413
+	j = MIN(prefix_len + pat->length, MAXPATLEN);
414
+	for(i=0;i<j;i++) {
415
+		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
416
+		if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR)
417
+			break;
418
+		patc[i] = (uint8_t)p;
419
+	}
420
+	if (i == j) {
421
+		/* all static, use add_static it has better heuristics for this
422
+		 * case */
423
+		return filter_add_static(m, patc, j, pat->virname);
424
+	}
425
+	cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
426
+	/* transform AC characters into our representation */
427
+	for (i=0;i<j && !stop; i++) {
428
+		struct char_spec *spec = &chars[i];
429
+		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
430
+		spec->alt = NULL;
431
+		switch (p & CLI_MATCH_WILDCARD) {
432
+			case CLI_MATCH_CHAR:
433
+				spec->start = spec->end = (uint8_t)p;
434
+				spec->step  = 1;
435
+				break;
436
+			case CLI_MATCH_IGNORE:
437
+				spec->start = 0x00;
438
+				spec->end   = 0xff;
439
+				spec->step  = 1;
440
+				break;
441
+			case CLI_MATCH_SPECIAL:
442
+				assert(pat->special_table);
443
+//				assert(altcnt < pat->alt);
444
+				assert(pat->special_table[altcnt]);
445
+				switch (pat->special_table[altcnt++]->type) {
446
+				    case 1: /* ALT_CHAR */
447
+					spec->start = 0;
448
+					spec->end = pat->special_table[altcnt-1]->num - 1;
449
+					spec->step = 1;
450
+					spec->alt = pat->special_table[altcnt-1];
451
+					break;
452
+				    default:
453
+					break;
454
+					/* TODO: should something be done here?
455
+					 * */
456
+				}
457
+				stop = 1;
458
+				break;
459
+			case CLI_MATCH_NIBBLE_HIGH:
460
+				spec->start = (p & 0xf0);
461
+				spec->end   = spec->start | 0x0f;
462
+				spec->step  = 1;
463
+				break;
464
+			case CLI_MATCH_NIBBLE_LOW:
465
+				spec->start = (p & 0xf);
466
+				spec->end   = 0xf0 | spec->start;
467
+				spec->step  = 0x10;
468
+				break;
469
+			default:
470
+				cli_errmsg("filtering: unknown wildcard character: %d\n", p);
471
+				return -1;
472
+		}
473
+	}
474
+	if (stop) --i;
475
+	j = i;
476
+	if (j < 2) {
477
+		if (stop)
478
+			cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname);
479
+		else
480
+			cli_warnmsg("Subpattern too short: %s\n", pat->virname);
481
+		return -1;
482
+	}
483
+
484
+	for(i=0;i<j-1;i++) {
485
+		int32_t num_iter;
486
+		/* new qgrams added to the filter */
487
+		spec0 = &chars[i];
488
+		spec1 = &chars[i+1];
489
+		num_iter = spec_iter(spec0) * spec_iter(spec1);
490
+
491
+		if (num_iter >= 0x100) {
492
+			if (num_iter == 0x10000)
493
+				char_badness[i] = reject;
494
+			else
495
+				char_badness[i] = avoid_anywhere;
496
+		} else {
497
+			int8_t binary = 0;
498
+			enum badness scor = accept;
499
+			for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
500
+				for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
501
+					unsigned char c0 = spec_ith_char(spec0, k0);
502
+					unsigned char c1 = spec_ith_char(spec1, k1);
503
+					if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
504
+						scor = avoid_first;
505
+						break;
506
+					}
507
+					if (c0 == c1) {
508
+						scor = dontlike;
509
+						break;
510
+					}
511
+					if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127))
512
+						binary = 1;
513
+				}
514
+			}
515
+			if (scor == accept && binary) {
516
+				/* slightly favor binary */
517
+				scor = like;
518
+			}
519
+			char_badness[i] = scor;
520
+		}
521
+	}
522
+
523
+	/* try to choose best subpattern */
524
+
525
+	/* calculating the score for all possible i start pos
526
+	 * and all possible length is too slow, so choose best among N choices
527
+	 * only */
528
+	for (i=0;i<j-1 && choices_cnt < MAX_CHOICES;i++) {
529
+		enum badness base0 = like, base1 = like;
530
+		unsigned kend = MIN(j-1, (i + MAXSOPATLEN)&~1), k;
531
+		int ki = -0xff;
532
+		/* add 2 scores: pattern with max length, one where we stop at
533
+		 * first negative, and one we stop at last positive, but never
534
+		 * include reject */
535
+		assert(kend-1 < j-1);
536
+		if (char_badness[i]  == reject)
537
+			continue;
538
+		if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first)
539
+				&& choices_cnt > 0)
540
+			/* if we have another choice don't choose this */
541
+			continue;
542
+		while ((kend > i+3) && char_badness[kend-1] == reject) kend--;
543
+		for (k=i;k<kend;k++) {
544
+			enum badness badness = char_badness[k];
545
+			if (badness < accept) {
546
+				if (badness == reject) {
547
+					/* this is a never pick */
548
+					kend = k;
549
+					break;
550
+				}
551
+				if (badness == avoid_first && k != i)
552
+					badness = dontlike;
553
+				if (k == i && badness == avoid_anywhere)
554
+					badness = avoid_first;
555
+				if (ki == -0xff)
556
+					ki = k;
557
+			}
558
+			base0 = MIN(base0, badness);
559
+			if (ki == -0xff)
560
+				base1 = MIN(base1, badness);
561
+		}
562
+		add_choice(choices, &choices_cnt, i, kend, base0);
563
+		if (ki > (int)i) {
564
+			/* ki|ki+1|??| */
565
+			/* try subpattern from after the wildcard */
566
+			i = ki;
567
+		}
568
+		/* if score is positive, it replaces a negative choice */
569
+	}
570
+	for(l=0;l<choices_cnt;l++) {
571
+		int32_t score;
572
+		unsigned kend;
573
+		unsigned k;
574
+
575
+		i = choices[l].begin;
576
+		kend = i + choices[l].len;
577
+		score = 0;
578
+
579
+		for(k = i; k < kend-1; k++) {
580
+			unsigned p = k - i;
581
+			int32_t iscore, score_end;
582
+			assert(k < j);
583
+			get_score(char_badness[k], p, m, &chars[k], &chars[k+1],
584
+				  &iscore, &score_end);
585
+			/* give more importance to the score of the characters
586
+			 * at the beginning */
587
+			/* TODO: tune magic number here */
588
+			if (p < 6) {
589
+				iscore *= (6-p);
590
+				score_end *= (6-p);
591
+			}
592
+			score += iscore;
593
+			if (score + score_end > best_score) {
594
+				/* we may have negative scores, so truncating
595
+				 * the pattern could actually get us a higher
596
+				 * score */
597
+				best_score = score + score_end;
598
+				best_score_len = p + 2;
599
+				best_score_i = i;
600
+				assert(i + best_score_len <= j);
601
+			}
602
+		}
603
+	}
604
+
605
+	if (best_score <= -0x7fffffff) {
606
+		cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
607
+		return -1;
608
+	}
609
+	if (choices_cnt == 0) {
610
+		cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname);
611
+		return -1;
612
+	}
613
+	assert(best_score_len >= 2);
614
+
615
+	cli_dbgmsg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
616
+	/* Shift-Or like preprocessing */
617
+	assert(1 < best_score_len);
618
+	for (i=0;i < best_score_len-1;i++) {
619
+		spec0 = &chars[best_score_i + i];
620
+		spec1 = &chars[best_score_i + i + 1];
621
+		/* use overlapping little-endian 2-grams, overlapping because match can start
622
+		 * at any position (including odd) */
623
+
624
+		for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
625
+			for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
626
+				unsigned char c0 = spec_ith_char(spec0, k0);
627
+				unsigned char c1 = spec_ith_char(spec1, k1);
628
+				if (!c0 && !c1 && !i) {
629
+					cli_warnmsg("filter: subsignature begins with zero: %s\n",pat->virname);
630
+				}
631
+				filter_set_atpos(m, i, c0 | (c1<<8));
632
+			}
633
+		}
634
+	}
635
+
636
+	j  = best_score_len - 2;
637
+	for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
638
+		for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
639
+			unsigned char c0 = spec_ith_char(spec0, k0);
640
+			unsigned char c1 = spec_ith_char(spec1, k1);
641
+			if (!c0 && !c1) {
642
+				cli_dbgmsg("filter: subsignature ends with zero: %s\n",pat->virname);
643
+			}
644
+			filter_set_end(m, j, c0 | (c1<<8));
645
+		}
646
+	}
647
+	return j+2;
648
+}
649
+
650
/*
 * Match-length lookup table indexed by an 8-bit filter state value.
 *
 * The entries follow a regular pattern:
 *   shortest = 2 + number of consecutive 1 bits counted from the LSB
 *   longest  = 9 - number of consecutive 1 bits counted from the MSB
 * The only exception is 0xff (no 0 bit at all), which is stored as {0,0}.
 *
 * MATCH_LEN_ROW expands to 8 consecutive entries sharing the same "longest"
 * value; only the shortest value of the 8th entry differs between rows.
 */
#define MATCH_LEN_ROW(last, lng) \
	{2,(lng)},{3,(lng)},{2,(lng)},{4,(lng)}, \
	{2,(lng)},{3,(lng)},{2,(lng)},{(last),(lng)}

static const struct match_len_info {
	uint8_t shortest;
	uint8_t longest;
} match_len[256] = {
	/* 0x00 - 0x7f */
	MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(6,9), MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(7,9),
	MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(6,9), MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(8,9),
	MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(6,9), MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(7,9),
	MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(6,9), MATCH_LEN_ROW(5,9), MATCH_LEN_ROW(9,9),
	/* 0x80 - 0xbf */
	MATCH_LEN_ROW(5,8), MATCH_LEN_ROW(6,8), MATCH_LEN_ROW(5,8), MATCH_LEN_ROW(7,8),
	MATCH_LEN_ROW(5,8), MATCH_LEN_ROW(6,8), MATCH_LEN_ROW(5,8), MATCH_LEN_ROW(8,8),
	/* 0xc0 - 0xdf */
	MATCH_LEN_ROW(5,7), MATCH_LEN_ROW(6,7), MATCH_LEN_ROW(5,7), MATCH_LEN_ROW(7,7),
	/* 0xe0 - 0xef */
	MATCH_LEN_ROW(5,6), MATCH_LEN_ROW(6,6),
	/* 0xf0 - 0xf7 */
	MATCH_LEN_ROW(5,5),
	/* 0xf8 - 0xff: "longest" changes inside the row, so spell it out */
	{2,4},{3,4},{2,4},{4,4},{2,3},{3,3},{2,2},{0,0}
};
#undef MATCH_LEN_ROW
687
+/* state 11110011 means that we may have a match of length min 4, max 5 */
688
+
689
+__hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
690
+{
691
+	size_t j;
692
+	uint8_t state = ~0;
693
+	const uint8_t *B = m->B;
694
+	const uint8_t *End = m->end;
695
+	uint8_t shortest, longest=0;
696
+
697
+	if (len < 2) return -1;
698
+	/* look for first match */
699
+	for (j=0; j < len-1;j++) {
700
+		uint8_t match_state_end;
701
+		const uint16_t q0 = cli_readint16( &data[j] );
702
+
703
+		state = (state << 1) | B[q0];
704
+		match_state_end = state | End[q0];
705
+		if (match_state_end != 0xff) {
706
+			inf->first_match = j;
707
+      return 0;
708
+		}
709
+	}
710
+  /* no match, inf is invalid */
711
+  return -1;
712
+}
713
+
714
+/* this is like a FSM, with multiple active states at the same time.
715
+ * each bit in "state" means an active state, when a char is encountered
716
+ * we determine what states can remain active.
717
+ * The FSM transition rules are expressed as bit-masks */
718
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
719
+{
720
+	size_t j;
721
+	uint8_t state = ~0;
722
+	const uint8_t *B = m->B;
723
+	const uint8_t *End = m->end;
724
+
725
+	/* we use 2-grams, must be higher than 1 */
726
+	if(len < 2) return -1;
727
+	/* Shift-Or like search algorithm */
728
+	for(j=0;j < len-1; j++) {
729
+		const uint16_t q0 = cli_readint16( &data[j] );
730
+		uint8_t match_end;
731
+		state = (state << 1) | B[q0];
732
+		/* state marks with a 0 bit all active states
733
+		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
734
+		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
735
+		match_end = state | End[q0];
736
+		if(match_end != 0xff) {
737
+
738
+			/* if state is reachable, and this character can finish a pattern, assume match */
739
+			/* to reduce false positives check if qgram can finish the pattern */
740
+			/* return position of probable match */
741
+			/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
742
+			 * longest pattern that could match */
743
+			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
744
+		}
745
+	}
746
+	/* no match */
747
+	return -1;
748
+}
0 749
new file mode 100644
... ...
@@ -0,0 +1,42 @@
0
+/*
1
+ *  A fast filter for static patterns.
2
+ *
3
+ *  Copyright (C) 2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Török Edvin
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+#ifndef FILTER_H
22
+#define FILTER_H
23
+#include "cltypes.h"
24
+struct filter {
25
+	uint8_t B[65536];
26
+	uint8_t end[65536];
27
+	unsigned long m;
28
+};
29
+
30
+struct filter_match_info {
31
+	unsigned long first_match;
32
+};
33
+
34
+struct cli_ac_patt;
35
+void filter_init(struct filter *m);
36
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len);
37
+int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf);
38
+int  filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name);
39
+int  filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat);
40
+
41
+#endif
... ...
@@ -123,8 +123,10 @@ CLAMAV_PRIVATE {
123 123
     cli_bm_scanbuff;
124 124
     cli_bm_free;
125 125
     cli_initroots;
126
+    cli_scanbuff;
126 127
     html_screnc_decode;
127 128
     mpool_create;
129
+    mpool_calloc;
128 130
     mpool_destroy;
129 131
     mpool_free;
130 132
     mpool_getstats;
... ...
@@ -333,7 +333,7 @@ static int ac_maketrans(struct cli_matcher *root)
333 333
 	    continue;
334 334
 	for(i = 0; i < 256; i++) {
335 335
 	    child = node->trans[i];
336
-	    if(!child) {
336
+	    if (!child || (!IS_FINAL(child) && IS_LEAF(child))) {
337 337
 		struct cli_ac_node *failtarget = node->fail;
338 338
 		while(IS_LEAF(failtarget) || !failtarget->trans[i])
339 339
 		    failtarget = failtarget->fail;
... ...
@@ -1132,14 +1132,12 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v
1132 1132
     current = root->ac_root;
1133 1133
 
1134 1134
     for(i = 0; i < length; i++)  {
1135
-
1136
-	if(IS_LEAF(current))
1137
-	    current = current->fail;
1138
-
1139 1135
 	current = current->trans[buffer[i]];
1140 1136
 
1141 1137
 	if(IS_FINAL(current)) {
1142 1138
 	    patt = current->list;
1139
+	    if (IS_LEAF(current))
1140
+		current = current->fail;
1143 1141
 	    while(patt) {
1144 1142
 		bp = i + 1 - patt->depth;
1145 1143
 		if(patt->offdata[0] != CLI_OFF_VERSION && patt->offdata[0] != CLI_OFF_MACRO && !patt->next_same && (patt->offset_min != CLI_OFF_ANY) && (!patt->sigid || patt->partno == 1)) {
... ...
@@ -49,6 +49,22 @@
49 49
 #include "pe_icons.h"
50 50
 #include "regex/regex.h"
51 51
 
52
+static inline int matcher_run(const struct cli_matcher *root,
53
+			      const unsigned char *buffer, uint32_t length,
54
+			      const char **virname, struct cli_ac_data *mdata,
55
+			      uint32_t offset,
56
+			      cli_file_t ftype,
57
+			      struct cli_matched_type **ftoffset,
58
+			      unsigned int acmode,
59
+			      fmap_t *map,
60
+			      struct cli_bm_off *offdata)
61
+{
62
+    int ret;
63
+    if (root->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, root, offset, map, offdata)) != CL_VIRUS)
64
+	ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, root, mdata, offset, ftype, ftoffset, acmode, NULL);
65
+    return ret;
66
+}
67
+
52 68
 int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset, cli_ctx *ctx, cli_file_t ftype, struct cli_ac_data **acdata)
53 69
 {
54 70
 	int ret = CL_CLEAN;
... ...
@@ -79,8 +95,7 @@ int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset,
79 79
 	if(!acdata && (ret = cli_ac_initdata(&mdata, troot->ac_partsigs, troot->ac_lsigs, troot->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN)))
80 80
 	    return ret;
81 81
 
82
-	if(troot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, troot, offset, NULL, NULL)) != CL_VIRUS)
83
-	    ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, troot, acdata ? (acdata[0]) : (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL);
82
+	ret = matcher_run(troot, buffer, length, virname, acdata ? (acdata[0]): (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL, NULL);
84 83
 
85 84
 	if(!acdata)
86 85
 	    cli_ac_freedata(&mdata);
... ...
@@ -92,8 +107,7 @@ int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset,
92 92
     if(!acdata && (ret = cli_ac_initdata(&mdata, groot->ac_partsigs, groot->ac_lsigs, groot->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN)))
93 93
 	return ret;
94 94
 
95
-    if(groot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, groot, offset, NULL, NULL)) != CL_VIRUS)
96
-	ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, groot, acdata ? (acdata[1]) : (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL);
95
+    ret = matcher_run(groot, buffer, length, virname, acdata ? (acdata[1]): (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL, NULL);
97 96
 
98 97
     if(!acdata)
99 98
 	cli_ac_freedata(&mdata);
... ...
@@ -444,8 +458,8 @@ int cli_fmap_scandesc(cli_ctx *ctx, cli_file_t ftype, uint8_t ftonly, struct cli
444 444
 	    *ctx->scanned += bytes / CL_COUNT_PRECISION;
445 445
 
446 446
 	if(troot) {
447
-	    if(troot->ac_only || (ret = cli_bm_scanbuff(buff, bytes, ctx->virname, NULL, troot, offset, map, bm_offmode ? &toff : NULL)) != CL_VIRUS)
448
-		ret = cli_ac_scanbuff(buff, bytes, ctx->virname, NULL, NULL, troot, &tdata, offset, ftype, ftoffset, acmode, NULL);
447
+	    ret = matcher_run(troot, buff, bytes, ctx->virname, &tdata, offset, ftype, ftoffset, acmode, map, bm_offmode ? &toff : NULL);
448
+
449 449
 	    if(ret == CL_VIRUS) {
450 450
 		if(!ftonly)
451 451
 		    cli_ac_freedata(&gdata);
... ...
@@ -422,6 +422,12 @@ void cli_errmsg(const char *str, ...);
422 422
 #define always_inline inline
423 423
 #endif
424 424
 
425
/* __hot__ marks a function as hot; it expands to the GCC attribute only for
 * GCC 4.3 and later and to nothing everywhere else. */
#if defined(__GNUC__)
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
#define __hot__ __attribute__((hot))
#else
#define __hot__
#endif
#else
#define __hot__
#endif
430
+
425 431
 #define cli_dbgmsg (!UNLIKELY(cli_debug_flag)) ? (void)0 : cli_dbgmsg_internal
426 432
 
427 433
 #ifdef __GNUC__
428 434
new file mode 100644
... ...
@@ -0,0 +1,148 @@
0
+/*
1
+ *  Gather statistics from performance sensitive code.
2
+ *
3
+ *  Copyright (C) 2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Török Edvin
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+#ifdef HAVE_CONFIG_H
22
+#include "clamav-config.h"
23
+#endif
24
+
25
+#include "perflogging.h"
26
+#include <stdio.h>
27
+#ifdef CLI_PERF_LOGGING
28
+
29
+__thread last_flushed = 0;
30
+__thread cli_perf_registered = 0;
31
+__thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE];
32
+__thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256];
33
+
34
+uint64_t cli_perf_sum[__LAST_SUMABLE];
35
+uint64_t cli_perf_count[__LAST_COUNTABLE][256];
36
+
37
+static pthread_key_t thread_exit_key;
38
+int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)); 
39
+
40
/* Destructor registered for thread_exit_key: saves the exiting thread's
 * counters into the global ones.  The key value is not used. */
static void cli_perf_thread_exit(void *arg)
{
	(void)arg;
	cli_perf_flush();
}
45
+
46
+void __attribute__((constructor)) __cli_perf_init(void)
47
+{
48
+	pthread_key_create(&thread_exit_key, cli_perf_thread_exit);
49
+}
50
+
51
/* Destructor: runs the same flush as a thread exit, for the thread that
 * executes the destructors. */
__attribute__((destructor))
void __cli_perf_exit(void)
{
	cli_perf_thread_exit(NULL);
}
54
+}
55
+
56
+static int dummy;
57
+void cli_perf_register(void)
58
+{
59
+	/* set a fake key, so that destructor gets called */
60
+	pthread_setspecific(thread_exit_key, &dummy);
61
+	cli_perf_registered = 1;
62
+}
63
+
64
+static const char *perf_log_names_sum[__LAST_SUMABLE] = {
65
+	"raw scanned",
66
+	"filter scanned",
67
+	"AC scanned",
68
+	"BM scanned"
69
+};
70
+
71
+static const char *perf_log_names_cnt[__LAST_COUNTABLE] = {
72
+	"trie bytes scanned",
73
+	"filter position load",
74
+	"filter end load",
75
+	"trie pattern original length"
76
+};
77
+
78
+#define NONE __LAST_SUMABLE
79
+static enum perf_log_sumable perf_log_percent[__LAST_SUMABLE] = {
80
+	NONE,
81
+	RAW_BYTES_SCANNED,
82
+	RAW_BYTES_SCANNED,
83
+	RAW_BYTES_SCANNED,
84
+};
85
+
86
+static enum perf_log_countable perf_log_percent_cnt[__LAST_COUNTABLE] = {
87
+	RAW_BYTES_SCANNED,
88
+	NONE,
89
+	NONE,
90
+	NONE,
91
+};
92
+
93
+static void cli_perf_print(void)
94
+{
95
+	enum perf_log_sumable i;
96
+	enum perf_log_countable j;
97
+	unsigned k;
98
+
99
+	uint64_t raw_scanned = cli_perf_sum[RAW_BYTES_SCANNED];
100
+	const double MEGA = 1024*1024.0;
101
+
102
+	/* in multiscan mode multiple threads can output, so output a unique id
103
+	 * here*/
104
+	printf("PERF: %p\n", &cli_perf_registered);
105
+	for(i=0;i<__LAST_SUMABLE;i++) {
106
+		printf("PERF: %s: %g MB", perf_log_names_sum[i], cli_perf_sum[i] / MEGA);
107
+		if (perf_log_percent[i] != NONE)
108
+			printf("(%6.3f%%)", 100.0*cli_perf_sum[i] / cli_perf_sum[perf_log_percent[i]]);
109
+		printf("\n");
110
+	}
111
+	printf("\n");
112
+	for(j=0;j<__LAST_COUNTABLE;j++) {
113
+		printf("PERF: %s: ", perf_log_names_cnt[j]);
114
+		for (k=0;k<256;k++)
115
+			if (cli_perf_count[j][k]) {
116
+				printf(" %u -> %ju", k, cli_perf_count[j][k]);
117
+				if (perf_log_percent_cnt[j] != NONE)
118
+					printf("(%6.3f%%)", 100.0*cli_perf_count[j][k] / cli_perf_sum[perf_log_percent_cnt[j]]);
119
+			}
120
+		printf("\n");
121
+	}
122
+	printf("\n");
123
+}
124
+
125
+static pthread_mutex_t cli_perf_log_mutex = PTHREAD_MUTEX_INITIALIZER;
126
+void cli_perf_flush(void)
127
+{
128
+	unsigned i, j;
129
+
130
+	pthread_mutex_lock(&cli_perf_log_mutex);
131
+
132
+	for (i = 0; i < __LAST_SUMABLE; i++) {
133
+		cli_perf_sum[i] += cli_perf_sum_tls[i];
134
+		cli_perf_sum_tls[i] = 0;
135
+	}
136
+
137
+	for (i = 0; i < __LAST_COUNTABLE; i++) {
138
+		for (j = 0; j < 256; j++) {
139
+			cli_perf_count[i][j] += cli_perf_count_tls[i][j];
140
+			cli_perf_count_tls[i][j] = 0;
141
+		}
142
+	}
143
+
144
+	cli_perf_print();
145
+	pthread_mutex_unlock(&cli_perf_log_mutex);
146
+}
147
+#endif
0 148
new file mode 100644
... ...
@@ -0,0 +1,101 @@
0
+/*
1
+ *  Gather statistics from performance sensitive code.
2
+ *
3
+ *  Copyright (C) 2008 Sourcefire, Inc.
4
+ *
5
+ *  Authors: Török Edvin
6
+ *
7
+ *  This program is free software; you can redistribute it and/or modify
8
+ *  it under the terms of the GNU General Public License version 2 as
9
+ *  published by the Free Software Foundation.
10
+ *
11
+ *  This program is distributed in the hope that it will be useful,
12
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ *  GNU General Public License for more details.
15
+ *
16
+ *  You should have received a copy of the GNU General Public License
17
+ *  along with this program; if not, write to the Free Software
18
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
+ *  MA 02110-1301, USA.
20
+ */
21
+#ifndef PERFLOGGING_H
22
+#define PERFLOGGING_H
23
+
24
+/* this is a compile-time selectable, default off module to log certain
25
+ * statistics, such as which tries are used, efficiency of filtering and so on.
26
+ * it must have as little overhead as possible */
27
+
28
+//#define CLI_PERF_LOGGING
29
+#ifdef CLI_PERF_LOGGING
30
+
31
+#ifndef __GNUC__
32
+#error "Performance logging requires GNU C compatible compiler"
33
+#else
34
+/*TODO: maybe we need a GCC version check too here */
35
+#include <pthread.h>
36
+#include "cltypes.h"
37
+
38
/* Counters that are summed up; __LAST_SUMABLE is the number of them */
enum perf_log_sumable {
	RAW_BYTES_SCANNED,
	FILTER_BYTES_SCANNED,
	AC_SCANNED,
	BM_SCANNED,
	__LAST_SUMABLE
};

/* Counters that keep 256 event slots each; __LAST_COUNTABLE is the number
 * of them */
enum perf_log_countable {
	TRIE_SCANNED,
	FILTER_LOAD,
	FILTER_END_LOAD,
	TRIE_ORIG_LEN,
	__LAST_COUNTABLE
};

/* per-thread state (each declared exactly once) */
extern __thread int last_flushed;
extern __thread int cli_perf_registered;
extern __thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE];
extern __thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256];

/* global totals */
extern uint64_t cli_perf_sum[__LAST_SUMABLE];
extern uint64_t cli_perf_count[__LAST_COUNTABLE][256];
62
+
63
+void cli_perf_register(void);
64
+void cli_perf_flush(void);
65
+
66
+static inline void cli_perf_enter(void)
67
+{
68
+	if (!cli_perf_registered) cli_perf_register();
69
+	if (cli_perf_sum_tls[RAW_BYTES_SCANNED] - last_flushed > 100*1024*1024) {
70
+		cli_perf_flush();
71
+		last_flushed = cli_perf_sum_tls[RAW_BYTES_SCANNED];
72
+	}
73
+}
74
+
75
+static inline void cli_perf_log_add(enum perf_log_sumable kind, uint64_t add)
76
+{
77
+	cli_perf_enter();
78
+	assert( kind < __LAST_SUMABLE);
79
+	cli_perf_sum_tls[kind] += add;
80
+}
81
+
82
+static inline void cli_perf_log_count2(enum perf_log_countable kind, uint8_t event, uint64_t cnt)
83
+{
84
+	cli_perf_enter();
85
+	assert( kind < __LAST_COUNTABLE);
86
+	cli_perf_count_tls[kind][event] += cnt;
87
+}
88
+
89
+static inline void cli_perf_log_count(enum perf_log_countable kind, uint8_t event)
90
+{
91
+	cli_perf_log_count2(kind, event, 1);
92
+}
93
+
94
+#endif
95
+
96
+#else
97
/* Performance logging is compiled out: every logging entry point becomes a
 * no-op statement, so callers build unchanged whether or not
 * CLI_PERF_LOGGING is defined.  The arguments are not evaluated. */
#define cli_perf_log_count(a,b) do {} while(0)
#define cli_perf_log_count2(a,b,c) do {} while(0)
#define cli_perf_log_add(a,b) do {} while(0)
98
+#endif
99
+
100
+#endif
... ...
@@ -65,104 +65,6 @@ static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_le
65 65
 static int add_static_pattern(struct regex_matcher *matcher, char* pattern);
66 66
 /* ---------- */
67 67
 
68
-/* ----- shift-or filtering -------------- */
69
-
70
-#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
71
-#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))
72
-
73
-static void SO_init(struct filter *m)
74
-{
75
-	memset(m->B, ~0, sizeof(m->B));
76
-	memset(m->end, ~0, sizeof(m->end));
77
-	memset(m->end_fast, ~0, sizeof(m->end_fast));
78
-}
79
-
80
-/* because we use uint32_t */
81
-#define MAXSOPATLEN 32
82
-
83
-/* merge another pattern into the filter
84
- * add('abc'); add('bcd'); will match [ab][bc][cd] */
85
-static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
86
-{
87
-	uint16_t q;
88
-	uint8_t j;
89
-
90
-	/* cut length, and make it modulo 2 */
91
-	if(len > MAXSOPATLEN) {
92
-		len = MAXSOPATLEN;
93
-	} else {
94
-		/* we use 2-grams, must be multiple of 2 */
95
-		len = len & ~1;
96
-	}
97
-	if(!len)
98
-		return 0;
99
-
100
-	/* Shift-Or like preprocessing */
101
-	for(j=0;j < len-1;j++) {
102
-		/* use overlapping 2-grams. We need them overlapping because matching can start at any position */
103
-		q = cli_readint16( &pattern[j] );
104
-		m->B[q] &= ~(1 << j);
105
-	}
106
-	/* we use variable length patterns, use last character to mark pattern end,
107
-	 * can lead to false positives.*/
108
-	/* mark that at state j, the q-gram q can end the pattern */
109
-	if(j) {
110
-		j--;
111
-		m->end[q] &= ~(1 << j);
112
-		m->end_fast[pattern[j+1]] &= ~(1<<j);
113
-	}
114
-	return 0;
115
-}
116
-
117
-/* this is like a FSM, with multiple active states at the same time.
118
- * each bit in "state" means an active state, when a char is encountered
119
- * we determine what states can remain active.
120
- * The FSM transition rules are expressed as bit-masks */
121
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
122
-{
123
-	size_t j;
124
-	uint32_t state = ~0;
125
-	const uint32_t *B = m->B;
126
-	const uint32_t *End = m->end;
127
-	const uint32_t *EndFast = m->end_fast;
128
-
129
-	/* cut length, and make it modulo 2 */
130
-	if(len > MAXSOPATLEN) {
131
-		len = MAXSOPATLEN;
132
-	} else {
133
-		/* we use 2-grams, must be multiple of 2 */
134
-		len = len & ~1;
135
-	}
136
-	if(!len) return -1;
137
-	/* Shift-Or like search algorithm */
138
-	for(j=0;j < len-1; j++) {
139
-		const uint16_t q0 = cli_readint16( &data[j] );
140
-		uint32_t match_end;
141
-		state = (state << 1) | B[q0];
142
-		/* state marks with a 0 bit all active states
143
-		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
144
-		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
145
-		match_end = state | EndFast[data[j+1]];
146
-		if((match_end != 0xffffffff) && (state | End[q0]) !=  0xffffffff) {
147
-			/* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
148
-			 * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
149
-			 * set inside L2 */
150
-
151
-			/* if state is reachable, and this character can finish a pattern, assume match */
152
-			/* to reduce false positives check if qgram can finish the pattern */
153
-			/* return position of probable match */
154
-			/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
155
-			 * longest pattern that could match */
156
-			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
157
-		}
158
-	}
159
-	/* no match */
160
-	return -1;
161
-}
162
-
163
-/* ----------------------------------------------------------- */
164
-
165
-
166 68
 #define MATCH_SUCCESS 0
167 69
 #define MATCH_FAILED  -1
168 70
 
... ...
@@ -296,7 +198,7 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
296 296
 		if(!bufrev)
297 297
 			return CL_EMEM;
298 298
 		reverse_string(bufrev);
299
-		rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
299
+		rc = filter_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
300 300
 		if(rc == -1) {
301 301
 			free(buffer);
302 302
 			free(bufrev);
... ...
@@ -381,7 +283,7 @@ int init_regex_list(struct regex_matcher* matcher)
381 381
 	if((rc = cli_bm_init(&matcher->hostkey_prefix))) {
382 382
 		return rc;
383 383
 	}
384
-	SO_init(&matcher->filter);
384
+	filter_init(&matcher->filter);
385 385
 	return CL_SUCCESS;
386 386
 }
387 387
 
... ...
@@ -697,7 +599,7 @@ static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info,
697 697
 		mpool_free(matcher->mempool, new);
698 698
 		return ret;
699 699
 	}
700
-	SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len);
700
+	filter_add_static(&matcher->filter, (const unsigned char*)suffix, len, "regex");
701 701
 	return CL_SUCCESS;
702 702
 }
703 703
 
... ...
@@ -27,17 +27,11 @@
27 27
 #include "phishcheck.h"
28 28
 #include "readdb.h"
29 29
 #include "matcher.h"
30
+#include "filtering.h"
30 31
 #include <zlib.h> /* for gzFile */
31 32
 
32 33
 #include "mpool.h"
33 34
 
34
-struct filter {
35
-	uint32_t B[65536];
36
-	uint32_t end_fast[256];
37
-	uint32_t end[65536];
38
-	unsigned long m;
39
-};
40
-
41 35
 struct regex_list_ht {
42 36
 	struct regex_list *head;
43 37
 	struct regex_list *tail;
... ...
@@ -71,7 +65,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int *sign
71 71
 void regex_list_cleanup(struct regex_matcher* matcher);
72 72
 void regex_list_done(struct regex_matcher* matcher);
73 73
 int is_regex_ok(struct regex_matcher* matcher);
74
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len);
75 74
 
76 75
 #endif
77 76
 
... ...
@@ -30,6 +30,7 @@
30 30
 #include "../libclamav/matcher.h"
31 31
 #include "../libclamav/matcher-ac.h"
32 32
 #include "../libclamav/matcher-bm.h"
33
+#include "../libclamav/others.h"
33 34
 #include "../libclamav/default.h"
34 35
 #include "checks.h"
35 36
 
... ...
@@ -46,19 +47,44 @@ static const struct ac_testdata_s {
46 46
     { "abdcabcddabccadbbdbacb", "6463{2-3}64646162(63|64|65)6361*6462????6261{-1}6362", "Test_5" },
47 47
     { "abcdefghijkabcdefghijk", "62????65666768*696a6b6162{2-3}656667[1-3]6b", "Test_6" },
48 48
     { "abcadbabcadbabcacb", "6?6164?26?62{3}?26162?361", "Test_7" },
49
+    /* testcase for filter bug: it was checking only first 32 chars, and last
50
+     * maxpatlen */
51
+    { "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1dddddddddddddddddddd5\1\1\1\1\1\1\1\1\1\1\1\1\1","6464646464646464646464646464646464646464(35|36)","Test_8"},
49 52
 
50 53
     { NULL, NULL, NULL}
51 54
 };
52 55
 
53
-START_TEST (test_ac_scanbuff) {
56
+
57
+static cli_ctx ctx;
58
+static const char *virname = NULL;
59
+static void setup(void)
60
+{
54 61
 	struct cli_matcher *root;
62
+	virname = NULL;
63
+	ctx.virname = &virname;
64
+	ctx.engine = cl_engine_new();
65
+	fail_unless(!!ctx.engine, "cl_engine_new() failed");
66
+	root = (struct cli_matcher *) mpool_calloc(ctx.engine->mempool, 1, sizeof(struct cli_matcher));
67
+	fail_unless(root != NULL, "root == NULL");
68
+#ifdef USE_MPOOL
69
+	root->mempool = ctx.engine->mempool;
70
+#endif
71
+
72
+	ctx.engine->root[0] = root;
73
+}
74
+
75
+static void teardown(void)
76
+{
77
+	cl_engine_free((struct cl_engine*)ctx.engine);
78
+}
79
+
80
+START_TEST (test_ac_scanbuff) {
55 81
 	struct cli_ac_data mdata;
56
-	const char *virname = NULL;
82
+	struct cli_matcher *root;
57 83
 	unsigned int i;
58 84
 	int ret;
59 85
 
60
-
61
-    root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher));
86
+    root = ctx.engine->root[0];
62 87
     fail_unless(root != NULL, "root == NULL");
63 88
     root->ac_only = 1;
64 89
 
... ...
@@ -68,6 +94,7 @@ START_TEST (test_ac_scanbuff) {
68 68
     ret = cli_ac_init(root, CLI_DEFAULT_AC_MINDEPTH, CLI_DEFAULT_AC_MAXDEPTH);
69 69
     fail_unless(ret == CL_SUCCESS, "cli_ac_init() failed");
70 70
 
71
+
71 72
     for(i = 0; ac_testdata[i].data; i++) {
72 73
 	ret = cli_parse_add(root, ac_testdata[i].virname, ac_testdata[i].hexsig, 0, 0, "*", 0, NULL, 0);
73 74
 	fail_unless(ret == CL_SUCCESS, "cli_parse_add() failed");
... ...
@@ -83,14 +110,13 @@ START_TEST (test_ac_scanbuff) {
83 83
 	ret = cli_ac_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), &virname, NULL, NULL, root, &mdata, 0, 0, NULL, AC_SCAN_VIR, NULL);
84 84
 	fail_unless_fmt(ret == CL_VIRUS, "cli_ac_scanbuff() failed for %s", ac_testdata[i].virname);
85 85
 	fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname);
86
+
87
+	ret = cli_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), 0, &ctx, 0, NULL);
88
+	fail_unless_fmt(ret == CL_VIRUS, "cli_scanbuff() failed for %s", ac_testdata[i].virname);
89
+	fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname);
86 90
     }
87 91
 
88 92
     cli_ac_freedata(&mdata);
89
-    cli_ac_free(root);
90
-#ifdef USE_MPOOL
91
-    mpool_destroy(root->mempool);
92
-#endif
93
-    free(root);
94 93
 }
95 94
 END_TEST
96 95
 
... ...
@@ -100,7 +126,7 @@ START_TEST (test_bm_scanbuff) {
100 100
 	int ret;
101 101
 
102 102
 
103
-    root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher));
103
+    root = ctx.engine->root[0];
104 104
     fail_unless(root != NULL, "root == NULL");
105 105
 
106 106
 #ifdef USE_MPOOL
... ...
@@ -119,11 +145,6 @@ START_TEST (test_bm_scanbuff) {
119 119
     ret = cli_bm_scanbuff("blah\xde\xad\xbe\xef", 12, &virname, NULL, root, 0, NULL, NULL);
120 120
     fail_unless(ret == CL_VIRUS, "cli_bm_scanbuff() failed");
121 121
     fail_unless(!strncmp(virname, "Sig2", 4), "Incorrect signature matched in cli_bm_scanbuff()\n");
122
-    cli_bm_free(root);
123
-#ifdef USE_MPOOL
124
-    mpool_destroy(root->mempool);
125
-#endif
126
-    free(root);
127 122
 }
128 123
 END_TEST
129 124
 
... ...
@@ -133,6 +154,7 @@ Suite *test_matchers_suite(void)
133 133
     TCase *tc_matchers;
134 134
     tc_matchers = tcase_create("matchers");
135 135
     suite_add_tcase(s, tc_matchers);
136
+    tcase_add_checked_fixture (tc_matchers, setup, teardown);
136 137
     tcase_add_test(tc_matchers, test_ac_scanbuff);
137 138
     tcase_add_test(tc_matchers, test_bm_scanbuff);
138 139
     return s;