* prefiltering4:
fix build
move matching code to matcher_run.
matcher-ac: move leaf checks inside IS_FINAL.
Prepare for prefiltering: add new files.
... | ... |
@@ -148,12 +148,13 @@ am__libclamav_la_SOURCES_DIST = clamav.h matcher-ac.c matcher-ac.h \ |
148 | 148 |
7z/Archive/7z/7zExtract.h explode.c explode.h textnorm.c \ |
149 | 149 |
textnorm.h dlp.c dlp.h jsparse/js-norm.c jsparse/js-norm.h \ |
150 | 150 |
jsparse/lexglobal.h jsparse/textbuf.h uniq.c uniq.h version.c \ |
151 |
- version.h mpool.c mpool.h fmap.c fmap.h default.h sha256.c \ |
|
152 |
- sha256.h bignum.h bytecode.c bytecode.h bytecode_vm.c \ |
|
153 |
- bytecode_priv.h clambc.h cpio.c cpio.h macho.c macho.h \ |
|
154 |
- ishield.c ishield.h type_desc.h bcfeatures.h bytecode_api.c \ |
|
155 |
- bytecode_api_decl.c bytecode_api.h bytecode_api_impl.h \ |
|
156 |
- bytecode_hooks.h cache.c cache.h bignum.c bignum_class.h |
|
151 |
+ version.h mpool.c mpool.h filtering.h filtering.c fmap.c \ |
|
152 |
+ fmap.h perflogging.c perflogging.h default.h sha256.c sha256.h \ |
|
153 |
+ bignum.h bytecode.c bytecode.h bytecode_vm.c bytecode_priv.h \ |
|
154 |
+ clambc.h cpio.c cpio.h macho.c macho.h ishield.c ishield.h \ |
|
155 |
+ type_desc.h bcfeatures.h bytecode_api.c bytecode_api_decl.c \ |
|
156 |
+ bytecode_api.h bytecode_api_impl.h bytecode_hooks.h cache.c \ |
|
157 |
+ cache.h bignum.c bignum_class.h |
|
157 | 158 |
@LINK_TOMMATH_FALSE@am__objects_1 = libclamav_la-bignum.lo |
158 | 159 |
am_libclamav_la_OBJECTS = libclamav_la-matcher-ac.lo \ |
159 | 160 |
libclamav_la-matcher-bm.lo libclamav_la-matcher.lo \ |
... | ... |
@@ -197,7 +198,8 @@ am_libclamav_la_OBJECTS = libclamav_la-matcher-ac.lo \ |
197 | 197 |
libclamav_la-explode.lo libclamav_la-textnorm.lo \ |
198 | 198 |
libclamav_la-dlp.lo libclamav_la-js-norm.lo \ |
199 | 199 |
libclamav_la-uniq.lo libclamav_la-version.lo \ |
200 |
- libclamav_la-mpool.lo libclamav_la-fmap.lo \ |
|
200 |
+ libclamav_la-mpool.lo libclamav_la-filtering.lo \ |
|
201 |
+ libclamav_la-fmap.lo libclamav_la-perflogging.lo \ |
|
201 | 202 |
libclamav_la-sha256.lo libclamav_la-bytecode.lo \ |
202 | 203 |
libclamav_la-bytecode_vm.lo libclamav_la-cpio.lo \ |
203 | 204 |
libclamav_la-macho.lo libclamav_la-ishield.lo \ |
... | ... |
@@ -644,12 +646,13 @@ libclamav_la_SOURCES = clamav.h matcher-ac.c matcher-ac.h matcher-bm.c \ |
644 | 644 |
7z/Archive/7z/7zExtract.h explode.c explode.h textnorm.c \ |
645 | 645 |
textnorm.h dlp.c dlp.h jsparse/js-norm.c jsparse/js-norm.h \ |
646 | 646 |
jsparse/lexglobal.h jsparse/textbuf.h uniq.c uniq.h version.c \ |
647 |
- version.h mpool.c mpool.h fmap.c fmap.h default.h sha256.c \ |
|
648 |
- sha256.h bignum.h bytecode.c bytecode.h bytecode_vm.c \ |
|
649 |
- bytecode_priv.h clambc.h cpio.c cpio.h macho.c macho.h \ |
|
650 |
- ishield.c ishield.h type_desc.h bcfeatures.h bytecode_api.c \ |
|
651 |
- bytecode_api_decl.c bytecode_api.h bytecode_api_impl.h \ |
|
652 |
- bytecode_hooks.h cache.c cache.h $(am__append_7) |
|
647 |
+ version.h mpool.c mpool.h filtering.h filtering.c fmap.c \ |
|
648 |
+ fmap.h perflogging.c perflogging.h default.h sha256.c sha256.h \ |
|
649 |
+ bignum.h bytecode.c bytecode.h bytecode_vm.c bytecode_priv.h \ |
|
650 |
+ clambc.h cpio.c cpio.h macho.c macho.h ishield.c ishield.h \ |
|
651 |
+ type_desc.h bcfeatures.h bytecode_api.c bytecode_api_decl.c \ |
|
652 |
+ bytecode_api.h bytecode_api_impl.h bytecode_hooks.h cache.c \ |
|
653 |
+ cache.h $(am__append_7) |
|
653 | 654 |
noinst_LTLIBRARIES = libclamav_internal_utils.la libclamav_internal_utils_nothreads.la libclamav_nocxx.la |
654 | 655 |
COMMON_CLEANFILES = version.h version.h.tmp *.gcda *.gcno |
655 | 656 |
@MAINTAINER_MODE_TRUE@BUILT_SOURCES = jsparse/generated/operators.h jsparse/generated/keywords.h jsparse-keywords.gperf |
... | ... |
@@ -806,6 +809,7 @@ distclean-compile: |
806 | 806 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-entconv.Plo@am__quote@ |
807 | 807 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-explode.Plo@am__quote@ |
808 | 808 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-filetypes.Plo@am__quote@ |
809 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-filtering.Plo@am__quote@ |
|
809 | 810 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-fmap.Plo@am__quote@ |
810 | 811 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-fsg.Plo@am__quote@ |
811 | 812 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-hashtab.Plo@am__quote@ |
... | ... |
@@ -834,6 +838,7 @@ distclean-compile: |
834 | 834 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pdf.Plo@am__quote@ |
835 | 835 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pe.Plo@am__quote@ |
836 | 836 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-pe_icons.Plo@am__quote@ |
837 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-perflogging.Plo@am__quote@ |
|
837 | 838 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-petite.Plo@am__quote@ |
838 | 839 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-phish_domaincheck_db.Plo@am__quote@ |
839 | 840 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libclamav_la-phish_whitelist.Plo@am__quote@ |
... | ... |
@@ -1579,6 +1584,14 @@ libclamav_la-mpool.lo: mpool.c |
1579 | 1579 |
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ |
1580 | 1580 |
@am__fastdepCC_FALSE@ $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-mpool.lo `test -f 'mpool.c' || echo '$(srcdir)/'`mpool.c |
1581 | 1581 |
|
1582 |
+libclamav_la-filtering.lo: filtering.c |
|
1583 |
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-filtering.lo -MD -MP -MF $(DEPDIR)/libclamav_la-filtering.Tpo -c -o libclamav_la-filtering.lo `test -f 'filtering.c' || echo '$(srcdir)/'`filtering.c |
|
1584 |
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-filtering.Tpo $(DEPDIR)/libclamav_la-filtering.Plo |
|
1585 |
+@am__fastdepCC_FALSE@ $(AM_V_CC) @AM_BACKSLASH@ |
|
1586 |
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='filtering.c' object='libclamav_la-filtering.lo' libtool=yes @AMDEPBACKSLASH@ |
|
1587 |
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ |
|
1588 |
+@am__fastdepCC_FALSE@ $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-filtering.lo `test -f 'filtering.c' || echo '$(srcdir)/'`filtering.c |
|
1589 |
+ |
|
1582 | 1590 |
libclamav_la-fmap.lo: fmap.c |
1583 | 1591 |
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-fmap.lo -MD -MP -MF $(DEPDIR)/libclamav_la-fmap.Tpo -c -o libclamav_la-fmap.lo `test -f 'fmap.c' || echo '$(srcdir)/'`fmap.c |
1584 | 1592 |
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-fmap.Tpo $(DEPDIR)/libclamav_la-fmap.Plo |
... | ... |
@@ -1587,6 +1600,14 @@ libclamav_la-fmap.lo: fmap.c |
1587 | 1587 |
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ |
1588 | 1588 |
@am__fastdepCC_FALSE@ $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-fmap.lo `test -f 'fmap.c' || echo '$(srcdir)/'`fmap.c |
1589 | 1589 |
|
1590 |
+libclamav_la-perflogging.lo: perflogging.c |
|
1591 |
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-perflogging.lo -MD -MP -MF $(DEPDIR)/libclamav_la-perflogging.Tpo -c -o libclamav_la-perflogging.lo `test -f 'perflogging.c' || echo '$(srcdir)/'`perflogging.c |
|
1592 |
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-perflogging.Tpo $(DEPDIR)/libclamav_la-perflogging.Plo |
|
1593 |
+@am__fastdepCC_FALSE@ $(AM_V_CC) @AM_BACKSLASH@ |
|
1594 |
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='perflogging.c' object='libclamav_la-perflogging.lo' libtool=yes @AMDEPBACKSLASH@ |
|
1595 |
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ |
|
1596 |
+@am__fastdepCC_FALSE@ $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -c -o libclamav_la-perflogging.lo `test -f 'perflogging.c' || echo '$(srcdir)/'`perflogging.c |
|
1597 |
+ |
|
1590 | 1598 |
libclamav_la-sha256.lo: sha256.c |
1591 | 1599 |
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclamav_la_CFLAGS) $(CFLAGS) -MT libclamav_la-sha256.lo -MD -MP -MF $(DEPDIR)/libclamav_la-sha256.Tpo -c -o libclamav_la-sha256.lo `test -f 'sha256.c' || echo '$(srcdir)/'`sha256.c |
1592 | 1600 |
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libclamav_la-sha256.Tpo $(DEPDIR)/libclamav_la-sha256.Plo |
1593 | 1601 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,749 @@ |
0 |
+/* |
|
1 |
+ * A fast filter for static patterns. |
|
2 |
+ * |
|
3 |
+ * Copyright (C) 2008 Sourcefire, Inc. |
|
4 |
+ * |
|
5 |
+ * Authors: Török Edvin |
|
6 |
+ * |
|
7 |
+ * This program is free software; you can redistribute it and/or modify |
|
8 |
+ * it under the terms of the GNU General Public License version 2 as |
|
9 |
+ * published by the Free Software Foundation. |
|
10 |
+ * |
|
11 |
+ * This program is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
+ * GNU General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU General Public License |
|
17 |
+ * along with this program; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
19 |
+ * MA 02110-1301, USA. |
|
20 |
+ */ |
|
21 |
+#if HAVE_CONFIG_H |
|
22 |
+#include "clamav-config.h" |
|
23 |
+#endif |
|
24 |
+#include "filtering.h" |
|
25 |
+#include "matcher-ac.h" |
|
26 |
+#include <string.h> |
|
27 |
+#include <assert.h> |
|
28 |
+#include "perflogging.h" |
|
29 |
+/* ----- shift-or filtering -------------- */ |
|
30 |
+ |
|
31 |
+/* |
|
32 |
+ * Description of algorithm: |
|
33 |
+ * |
|
34 |
+ * Multiple patterns are added to the filter. |
|
35 |
+ * The filter retains an approximation of these patterns, which can lead to |
|
36 |
+ * false positive matches, but not false negative matches. |
|
37 |
+ * |
|
38 |
+ * For each position in the filter we retain what qgrams can match at that |
|
39 |
+ * position, for example (if we'd use characters as qgrams): |
|
40 |
+ * pattern1: atu |
|
41 |
+ * pattern2: bzf |
|
42 |
+ * pattern3: xat |
|
43 |
+ * |
|
44 |
+ * filter accepts: |
|
45 |
+ * [abx][tza][uft] |
|
46 |
+ * |
|
47 |
+ * But it also accepts (false positives): |
|
48 |
+ * azu, azf, azt, ... |
|
49 |
+ * |
|
50 |
+ * It doesn't however accept: |
|
51 |
+ * aaa, atz, ... |
|
52 |
+ * |
|
53 |
+ * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=8) states, |
|
54 |
+ * each active bit meaning that a state is active. |
|
55 |
+ * |
|
56 |
+ * The states are activated sequentially, each transition decision is made |
|
57 |
+ * considering if we can accept the character at position X. |
|
58 |
+ * Since we can start a match at any position, position 0 is |
|
59 |
+ * reactivated each time. |
|
60 |
+ * When the last position is activated, the filter reports a match. |
|
61 |
+ * If we can't accept the character at position X, the state remains inactive, |
|
62 |
+ * and further states aren't activated (unless we activate this state in the |
|
63 |
+ * future). |
|
64 |
+ * |
|
65 |
+ * Essentially this is an automaton like this: |
|
66 |
+ * |
|
67 |
+ * /\ (a|b|x) (t|z|a) (u|f|t) |
|
68 |
+ * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match |
|
69 |
+ * \_______________/ | |
|
70 |
+ * \_____________________________/ |
|
71 |
+ * |
|
72 |
+ * |
|
73 |
+ * But we are tracking multiple active states at each time (or run N automatons |
|
74 |
+ * in parallel if you like, N = number of states). |
|
75 |
+ * |
|
76 |
+ * We can have S3 and S2 active, meaning that if the next character is |
|
77 |
+ * acceptable, it transitions to S1,S3 and S4 being active, otherwise it |
|
78 |
+ * transitions to S1 being active. |
|
79 |
+ * |
|
80 |
+ * Active states can either be represented as a binary 1 or 0, and using |
|
81 |
+ * bit-shifting and masking. |
|
82 |
+ * If we choose 1, we must use &, and after shifting always reactivate bit 0. |
|
83 |
+ * If we choose 0, we must use |, and after shifting we don't need to do |
|
84 |
+ * anything (since by shifting a 0 is implicitly introduced). |
|
85 |
+ * |
|
86 |
+ * This file implements the latter (shift-or) method. |
|
87 |
+ * |
|
88 |
+ * The discussion above considered patterns to be of the same length (or truncated to |
|
89 |
+ * be so). In reality patterns are of variable length, and we often have short |
|
90 |
+ * patterns. |
|
91 |
+ * |
|
92 |
+ * Thus another bitmap was introduced, meaning that if (end[Q] == set), then |
|
93 |
+ * a pattern can end at this position. |
|
94 |
+ * Also we would fill the pattern's position filters quite quickly with only 256 |
|
95 |
+ * choices for a position, so the algorithm uses overlapping qgrams of length 2: |
|
96 |
+ * 'abcd' is 3 qgrams: 'ab','bc','cd' |
|
97 |
+ * |
|
98 |
+ * The algorithm is very sensitive to the end[Q] filter, since it can have false |
|
99 |
+ * positives due to short patterns! |
|
100 |
+ * For optimal performance we need: |
|
101 |
+ * - patterns as long as possible |
|
102 |
+ * - probability for end[Q] to match low (avoid 0000, and other common cases) |
|
103 |
+ * - choose the most "diverse" subset from a long pattern |
|
104 |
+ * |
|
105 |
+ * diverse = referring to what we are scanning, so that the filter rarely |
|
106 |
+ * matches, so this actually means that we *want* to avoid adding more |
|
107 |
+ * characters to the filter, if we have 2 patterns: |
|
108 |
+ * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we |
|
109 |
+ * don't add new character at the beginning. |
|
110 |
+ * |
|
111 |
+ * With NDB signatures there are more challenges to overcome: |
|
112 |
+ * e8??0000000aa |
|
113 |
+ * |
|
114 |
+ * will make the filter accept: |
|
115 |
+ * e8<all-256-values-here>, <all-256-values>00, ... 000000aa |
|
116 |
+ * |
|
117 |
+ * We should delay the pattern end as long as possible, especially if it is 0000 |
|
118 |
+ * The problem is that now the filter accepts 0000 on position 3, regardless |
|
119 |
+ * of what we have on position 1 (even if we have something else than e8), so |
|
120 |
+ * we have to be very careful not to allow 0000 on first position too, |
|
121 |
+ * otherwise the filter will happily accept 000000000000. |
|
122 |
+ * |
|
123 |
+ * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram |
|
124 |
+ * based (fits L2), both must match for the filter to consider it a match. |
|
125 |
+ * |
|
126 |
+ * |
|
127 |
+ */ |
|
128 |
+ |
|
129 |
/*
 * Bitmap helpers. `bmap` is an array in which 32 bits per element are used;
 * `val` is a bit index: element (val >> 5), bit (val & 0x1f) inside it.
 * The mask is built from an unsigned 1 so that bit 31 can be addressed
 * without left-shifting into the sign bit of an int (undefined behaviour).
 * Note that both macros evaluate `val` twice.
 */
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1u << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1u << ((val) & 0x1f)))
|
131 |
+ |
|
132 |
+void filter_init(struct filter *m) |
|
133 |
+{ |
|
134 |
+ memset(m->B, ~0, sizeof(m->B)); |
|
135 |
+ memset(m->end, ~0, sizeof(m->end)); |
|
136 |
+} |
|
137 |
+ |
|
138 |
+/* because we use uint32_t */ |
|
139 |
+#define MAXSOPATLEN 8 |
|
140 |
+ |
|
141 |
+static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val) |
|
142 |
+{ |
|
143 |
+ return !(m->B[val] & (1<<pos)); |
|
144 |
+} |
|
145 |
+ |
|
146 |
+static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val) |
|
147 |
+{ |
|
148 |
+ if (!filter_isset(m, pos, val)) { |
|
149 |
+ cli_perf_log_count(FILTER_LOAD, pos); |
|
150 |
+ m->B[val] &= ~(1<<pos); |
|
151 |
+ } |
|
152 |
+} |
|
153 |
+ |
|
154 |
+ |
|
155 |
+static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a) |
|
156 |
+{ |
|
157 |
+ return !(m->end[a] & (1<<pos)); |
|
158 |
+} |
|
159 |
+ |
|
160 |
+static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a) |
|
161 |
+{ |
|
162 |
+ if (!filter_end_isset(m, pos, a)) { |
|
163 |
+ cli_perf_log_count(FILTER_END_LOAD, pos); |
|
164 |
+ m->end[a] &= ~(1 << pos); |
|
165 |
+ } |
|
166 |
+} |
|
167 |
+#define MAX_CHOICES 8 |
|
168 |
+/* just an arbitrary limit, if patterns are longer, we cut |
|
169 |
+ * the filter can only use MAXSOPATLEN (8) characters, |
|
170 |
+ * this longer buffer is needed so that we can choose the "best" subpattern from |
|
171 |
+ * it */ |
|
172 |
+#define MAXPATLEN 255 |
|
173 |
+ |
|
174 |
+/* merge another pattern into the filter |
|
175 |
+ * add('abc'); add('bcd'); will match [ab][bc][cd] */ |
|
176 |
+int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name) |
|
177 |
+{ |
|
178 |
+ uint16_t q; |
|
179 |
+ uint8_t j, maxlen; |
|
180 |
+ uint32_t best = 0xffffffff; |
|
181 |
+ uint8_t best_pos = 0; |
|
182 |
+ |
|
183 |
+ cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len); |
|
184 |
+ /* TODO: choose best among MAXCHOICES */ |
|
185 |
+ /* cut length */ |
|
186 |
+ if(len > MAXPATLEN) { |
|
187 |
+ len = MAXPATLEN; |
|
188 |
+ } |
|
189 |
+ if(len < 2) |
|
190 |
+ return -1; |
|
191 |
+ |
|
192 |
+ /* we want subsigs to be as long as possible */ |
|
193 |
+ if (len > 4) { |
|
194 |
+ maxlen = len - 4; |
|
195 |
+ if (maxlen == 1) maxlen = 2; |
|
196 |
+ } else |
|
197 |
+ maxlen = 2; |
|
198 |
+ for(j=0;(best < 100 && j<MAX_CHOICES) || (j < maxlen) ;j++) { |
|
199 |
+ uint32_t num = MAXSOPATLEN; |
|
200 |
+ uint8_t k; |
|
201 |
+ if (j+2 > len) |
|
202 |
+ break; |
|
203 |
+ for(k=j;k<len-1 && (k-j < MAXSOPATLEN);k++) { |
|
204 |
+ q = cli_readint16( &pattern[k] ); |
|
205 |
+ /* we want to favor subsigs that add as little as |
|
206 |
+ * possible to the filter */ |
|
207 |
+ num += filter_isset(m, k-j, q) ? 0 : MAXSOPATLEN - (k-j); |
|
208 |
+ if ((k == j || k == j+1) && (q == 0x0000 || q == 0xffff)) |
|
209 |
+ num += k==j ? 10000 : 1000;/* bad */ |
|
210 |
+ } |
|
211 |
+ /* it is very important to keep the end set small */ |
|
212 |
+ num += 10*(filter_end_isset(m, k-j-1, q) ? 0 : 1); |
|
213 |
+ /* it is very important to have signatures as long as possible |
|
214 |
+ * */ |
|
215 |
+ num += 5*(MAXSOPATLEN - (k-j)); |
|
216 |
+ /* if we are lower length than threshold penalize */ |
|
217 |
+ if (k-j+1 < 4) |
|
218 |
+ num += 200; |
|
219 |
+ /* favour longer patterns */ |
|
220 |
+ num -= (2*MAXSOPATLEN - (k + 1+j))*(k-j)/2; |
|
221 |
+ |
|
222 |
+ if (num < best) { |
|
223 |
+ best = num; |
|
224 |
+ best_pos = j; |
|
225 |
+ } |
|
226 |
+ } |
|
227 |
+ |
|
228 |
+ assert(best_pos < len-1); |
|
229 |
+ if (pattern[best_pos] == 0 && pattern[best_pos+1] == 0) { |
|
230 |
+ cli_warnmsg("filter: subsignature begins with zero (static): %s\n", name); |
|
231 |
+ } |
|
232 |
+ pattern += best_pos; |
|
233 |
+ len -= best_pos; |
|
234 |
+ /* cut length */ |
|
235 |
+ if(len > MAXSOPATLEN) { |
|
236 |
+ len = MAXSOPATLEN; |
|
237 |
+ } |
|
238 |
+ /* Shift-Or like preprocessing */ |
|
239 |
+ for(j=0;j < len-1;j++) { |
|
240 |
+ /* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */ |
|
241 |
+ q = cli_readint16( &pattern[j] ); |
|
242 |
+ filter_set_atpos(m, j, q); |
|
243 |
+ } |
|
244 |
+ /* we use variable length patterns, use last character to mark pattern end, |
|
245 |
+ * can lead to false positives.*/ |
|
246 |
+ /* mark that at state j, the q-gram q can end the pattern */ |
|
247 |
+ if(j) { |
|
248 |
+ j--; |
|
249 |
+ filter_set_end(m, j, q); |
|
250 |
+ } |
|
251 |
+ return j+2; |
|
252 |
+} |
|
253 |
+ |
|
254 |
/*
 * The set of values allowed at one pattern position, described as the
 * sequence start, start+step, start+2*step, ... for as long as the value is
 * <= end (this is how the loops in this file walk it).
 * When alt is NULL the sequence values are the byte values themselves; when
 * alt is non-NULL they are indexes into alt->str (see spec_ith_char).
 */
struct char_spec {
    struct cli_ac_special *alt; /* alternative table, or NULL */
    uint8_t start;              /* first value of the sequence */
    uint8_t end;                /* last value, inclusive */
    uint8_t step;               /* increment between values */
};
|
262 |
+ |
|
263 |
+static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i) |
|
264 |
+{ |
|
265 |
+ const struct cli_ac_special *alt = spec->alt; |
|
266 |
+ if (alt) { |
|
267 |
+ assert (alt->type == 1); |
|
268 |
+ assert (i < alt->num); |
|
269 |
+ return alt->str[i]; |
|
270 |
+ } |
|
271 |
+ return i; |
|
272 |
+} |
|
273 |
+ |
|
274 |
+static const struct char_spec full_range = {NULL, 0,0xff,1}; |
|
275 |
+ |
|
276 |
+static inline int spec_is_fullrange(const struct char_spec *spec0, const struct char_spec *spec1) |
|
277 |
+{ |
|
278 |
+ return !memcmp(spec0, &full_range, sizeof(full_range)) && |
|
279 |
+ !memcmp(spec1, &full_range, sizeof(full_range)); |
|
280 |
+} |
|
281 |
+ |
|
282 |
+ |
|
283 |
+#ifndef MIN |
|
284 |
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) |
|
285 |
+#endif |
|
286 |
+ |
|
287 |
/*
 * Quality class of a q-gram position, ordered from worst to best so that
 * classes can be compared with < and > and combined with MIN().
 */
enum badness {
    /* never use */
    reject,
    /* try not to start a subpattern here */
    avoid_first,
    /* try not to use at all; includes avoid_first! */
    avoid_anywhere,
    /* not that bad, but still not best */
    dontlike,
    /* fine */
    accept,
    /* slightly preferred over accept */
    like
};
|
297 |
+static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end) |
|
298 |
+{ |
|
299 |
+ int32_t base; |
|
300 |
+ unsigned k0, k1, num_introduced = 0, num_end_introduced = 0; |
|
301 |
+ switch (badness) { |
|
302 |
+ case reject: |
|
303 |
+ /* not reached */ |
|
304 |
+ assert(0); |
|
305 |
+ base = -0x7fffff; |
|
306 |
+ break; |
|
307 |
+ case avoid_first: |
|
308 |
+ if (!i) |
|
309 |
+ base = -0x700000; |
|
310 |
+ else |
|
311 |
+ base = 0; |
|
312 |
+ break; |
|
313 |
+ case avoid_anywhere: |
|
314 |
+ if (!i) |
|
315 |
+ base = -0x720000; |
|
316 |
+ else |
|
317 |
+ base = -0x1000; |
|
318 |
+ break; |
|
319 |
+ case dontlike: |
|
320 |
+ base = 0; |
|
321 |
+ break; |
|
322 |
+ case accept: |
|
323 |
+ base = 0x200; |
|
324 |
+ break; |
|
325 |
+ case like: |
|
326 |
+ /* a bit better only */ |
|
327 |
+ base = 0x201; |
|
328 |
+ break; |
|
329 |
+ } |
|
330 |
+ if (base < 0) { |
|
331 |
+ *score = base; |
|
332 |
+ *score_end = base; |
|
333 |
+ return; |
|
334 |
+ } |
|
335 |
+ /* at most 256 iterations here, otherwise base would be negative */ |
|
336 |
+ for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { |
|
337 |
+ for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { |
|
338 |
+ unsigned char c0 = spec_ith_char(spec0, k0); |
|
339 |
+ unsigned char c1 = spec_ith_char(spec1, k1); |
|
340 |
+ uint16_t a = c0 | (c1<<8); |
|
341 |
+ num_introduced += filter_isset(m, i, a); |
|
342 |
+ num_end_introduced += filter_end_isset(m, i, a); |
|
343 |
+ } |
|
344 |
+ } |
|
345 |
+ *score = base - num_introduced; |
|
346 |
+ *score_end = base - num_end_introduced; |
|
347 |
+ if (badness == avoid_first && i) { |
|
348 |
+ /* what is bad to begin with, is bad at end too */ |
|
349 |
+ *score_end -= 0x1000; |
|
350 |
+ } |
|
351 |
+} |
|
352 |
+ |
|
353 |
+struct choice { |
|
354 |
+ enum badness base; |
|
355 |
+ unsigned begin; |
|
356 |
+ unsigned len; |
|
357 |
+}; |
|
358 |
+ |
|
359 |
+static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness) |
|
360 |
+{ |
|
361 |
+ struct choice *choice; |
|
362 |
+ int i_neg = -1; |
|
363 |
+ assert(ie < MAXPATLEN); |
|
364 |
+ if (ie < i+1) |
|
365 |
+ return; |
|
366 |
+ if (*cnt >= MAX_CHOICES) |
|
367 |
+ return; |
|
368 |
+ if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) { |
|
369 |
+ unsigned j; |
|
370 |
+ /* replace very bad picks if we're full */ |
|
371 |
+ for (j=0;j<*cnt;j++) { |
|
372 |
+ if (choices[j].base < badness) { |
|
373 |
+ if (i_neg == -1 || choices[j].base < choices[i_neg].base) { |
|
374 |
+ i_neg = j; |
|
375 |
+ } |
|
376 |
+ } |
|
377 |
+ } |
|
378 |
+ } |
|
379 |
+ if (i_neg != -1) { |
|
380 |
+ choice = &choices[i_neg]; |
|
381 |
+ } else { |
|
382 |
+ choice = &choices[(*cnt)++]; |
|
383 |
+ } |
|
384 |
+ choice->begin = i; |
|
385 |
+ choice->len = ie - i + 1; |
|
386 |
+ choice->base = badness; |
|
387 |
+} |
|
388 |
+ |
|
389 |
+static inline int32_t spec_iter(const struct char_spec *spec) |
|
390 |
+{ |
|
391 |
+ assert(spec->step); |
|
392 |
+ return (1 + spec->end - spec->start)/spec->step; |
|
393 |
+} |
|
394 |
+ |
|
395 |
+int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat) |
|
396 |
+{ |
|
397 |
+ unsigned i, j = 0, stop = 0, l=0; |
|
398 |
+ uint16_t k0, k1; |
|
399 |
+ |
|
400 |
+ struct char_spec chars[MAXPATLEN]; |
|
401 |
+ enum badness char_badness[MAXPATLEN]; |
|
402 |
+ unsigned char patc[MAXPATLEN]; |
|
403 |
+ unsigned altcnt = 0; |
|
404 |
+ int32_t best_score = -0x7fffffff; |
|
405 |
+ unsigned best_score_i = 0; |
|
406 |
+ unsigned best_score_len = 0; |
|
407 |
+ struct char_spec *spec0, *spec1; |
|
408 |
+ |
|
409 |
+ struct choice choices[MAX_CHOICES]; |
|
410 |
+ unsigned choices_cnt = 0; |
|
411 |
+ unsigned prefix_len = pat->prefix_length; |
|
412 |
+ |
|
413 |
+ j = MIN(prefix_len + pat->length, MAXPATLEN); |
|
414 |
+ for(i=0;i<j;i++) { |
|
415 |
+ const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len]; |
|
416 |
+ if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR) |
|
417 |
+ break; |
|
418 |
+ patc[i] = (uint8_t)p; |
|
419 |
+ } |
|
420 |
+ if (i == j) { |
|
421 |
+ /* all static, use add_static it has better heuristics for this |
|
422 |
+ * case */ |
|
423 |
+ return filter_add_static(m, patc, j, pat->virname); |
|
424 |
+ } |
|
425 |
+ cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j); |
|
426 |
+ /* transform AC characters into our representation */ |
|
427 |
+ for (i=0;i<j && !stop; i++) { |
|
428 |
+ struct char_spec *spec = &chars[i]; |
|
429 |
+ const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len]; |
|
430 |
+ spec->alt = NULL; |
|
431 |
+ switch (p & CLI_MATCH_WILDCARD) { |
|
432 |
+ case CLI_MATCH_CHAR: |
|
433 |
+ spec->start = spec->end = (uint8_t)p; |
|
434 |
+ spec->step = 1; |
|
435 |
+ break; |
|
436 |
+ case CLI_MATCH_IGNORE: |
|
437 |
+ spec->start = 0x00; |
|
438 |
+ spec->end = 0xff; |
|
439 |
+ spec->step = 1; |
|
440 |
+ break; |
|
441 |
+ case CLI_MATCH_SPECIAL: |
|
442 |
+ assert(pat->special_table); |
|
443 |
+// assert(altcnt < pat->alt); |
|
444 |
+ assert(pat->special_table[altcnt]); |
|
445 |
+ switch (pat->special_table[altcnt++]->type) { |
|
446 |
+ case 1: /* ALT_CHAR */ |
|
447 |
+ spec->start = 0; |
|
448 |
+ spec->end = pat->special_table[altcnt-1]->num - 1; |
|
449 |
+ spec->step = 1; |
|
450 |
+ spec->alt = pat->special_table[altcnt-1]; |
|
451 |
+ break; |
|
452 |
+ default: |
|
453 |
+ break; |
|
454 |
+ /* TODO: should something be done here? |
|
455 |
+ * */ |
|
456 |
+ } |
|
457 |
+ stop = 1; |
|
458 |
+ break; |
|
459 |
+ case CLI_MATCH_NIBBLE_HIGH: |
|
460 |
+ spec->start = (p & 0xf0); |
|
461 |
+ spec->end = spec->start | 0x0f; |
|
462 |
+ spec->step = 1; |
|
463 |
+ break; |
|
464 |
+ case CLI_MATCH_NIBBLE_LOW: |
|
465 |
+ spec->start = (p & 0xf); |
|
466 |
+ spec->end = 0xf0 | spec->start; |
|
467 |
+ spec->step = 0x10; |
|
468 |
+ break; |
|
469 |
+ default: |
|
470 |
+ cli_errmsg("filtering: unknown wildcard character: %d\n", p); |
|
471 |
+ return -1; |
|
472 |
+ } |
|
473 |
+ } |
|
474 |
+ if (stop) --i; |
|
475 |
+ j = i; |
|
476 |
+ if (j < 2) { |
|
477 |
+ if (stop) |
|
478 |
+ cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname); |
|
479 |
+ else |
|
480 |
+ cli_warnmsg("Subpattern too short: %s\n", pat->virname); |
|
481 |
+ return -1; |
|
482 |
+ } |
|
483 |
+ |
|
484 |
+ for(i=0;i<j-1;i++) { |
|
485 |
+ int32_t num_iter; |
|
486 |
+ /* new qgrams added to the filter */ |
|
487 |
+ spec0 = &chars[i]; |
|
488 |
+ spec1 = &chars[i+1]; |
|
489 |
+ num_iter = spec_iter(spec0) * spec_iter(spec1); |
|
490 |
+ |
|
491 |
+ if (num_iter >= 0x100) { |
|
492 |
+ if (num_iter == 0x10000) |
|
493 |
+ char_badness[i] = reject; |
|
494 |
+ else |
|
495 |
+ char_badness[i] = avoid_anywhere; |
|
496 |
+ } else { |
|
497 |
+ int8_t binary = 0; |
|
498 |
+ enum badness scor = accept; |
|
499 |
+ for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { |
|
500 |
+ for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { |
|
501 |
+ unsigned char c0 = spec_ith_char(spec0, k0); |
|
502 |
+ unsigned char c1 = spec_ith_char(spec1, k1); |
|
503 |
+ if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) { |
|
504 |
+ scor = avoid_first; |
|
505 |
+ break; |
|
506 |
+ } |
|
507 |
+ if (c0 == c1) { |
|
508 |
+ scor = dontlike; |
|
509 |
+ break; |
|
510 |
+ } |
|
511 |
+ if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127)) |
|
512 |
+ binary = 1; |
|
513 |
+ } |
|
514 |
+ } |
|
515 |
+ if (scor == accept && binary) { |
|
516 |
+ /* slightly favor binary */ |
|
517 |
+ scor = like; |
|
518 |
+ } |
|
519 |
+ char_badness[i] = scor; |
|
520 |
+ } |
|
521 |
+ } |
|
522 |
+ |
|
523 |
+ /* try to choose best subpattern */ |
|
524 |
+ |
|
525 |
+ /* calculating the score for all possible i start pos |
|
526 |
+ * and all possible length is too slow, so choose best among N choices |
|
527 |
+ * only */ |
|
528 |
+ for (i=0;i<j-1 && choices_cnt < MAX_CHOICES;i++) { |
|
529 |
+ enum badness base0 = like, base1 = like; |
|
530 |
+ unsigned kend = MIN(j-1, (i + MAXSOPATLEN)&~1), k; |
|
531 |
+ int ki = -0xff; |
|
532 |
+ /* add 2 scores: pattern with max length, one where we stop at |
|
533 |
+ * first negative, and one we stop at last positive, but never |
|
534 |
+ * include reject */ |
|
535 |
+ assert(kend-1 < j-1); |
|
536 |
+ if (char_badness[i] == reject) |
|
537 |
+ continue; |
|
538 |
+ if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first) |
|
539 |
+ && choices_cnt > 0) |
|
540 |
+ /* if we have another choice don't choose this */ |
|
541 |
+ continue; |
|
542 |
+ while ((kend > i+3) && char_badness[kend-1] == reject) kend--; |
|
543 |
+ for (k=i;k<kend;k++) { |
|
544 |
+ enum badness badness = char_badness[k]; |
|
545 |
+ if (badness < accept) { |
|
546 |
+ if (badness == reject) { |
|
547 |
+ /* this is a never pick */ |
|
548 |
+ kend = k; |
|
549 |
+ break; |
|
550 |
+ } |
|
551 |
+ if (badness == avoid_first && k != i) |
|
552 |
+ badness = dontlike; |
|
553 |
+ if (k == i && badness == avoid_anywhere) |
|
554 |
+ badness = avoid_first; |
|
555 |
+ if (ki == -0xff) |
|
556 |
+ ki = k; |
|
557 |
+ } |
|
558 |
+ base0 = MIN(base0, badness); |
|
559 |
+ if (ki == -0xff) |
|
560 |
+ base1 = MIN(base1, badness); |
|
561 |
+ } |
|
562 |
+ add_choice(choices, &choices_cnt, i, kend, base0); |
|
563 |
+ if (ki > (int)i) { |
|
564 |
+ /* ki|ki+1|??| */ |
|
565 |
+ /* try subpattern from after the wildcard */ |
|
566 |
+ i = ki; |
|
567 |
+ } |
|
568 |
+ /* if score is positive, it replaces a negative choice */ |
|
569 |
+ } |
|
570 |
+ for(l=0;l<choices_cnt;l++) { |
|
571 |
+ int32_t score; |
|
572 |
+ unsigned kend; |
|
573 |
+ unsigned k; |
|
574 |
+ |
|
575 |
+ i = choices[l].begin; |
|
576 |
+ kend = i + choices[l].len; |
|
577 |
+ score = 0; |
|
578 |
+ |
|
579 |
+ for(k = i; k < kend-1; k++) { |
|
580 |
+ unsigned p = k - i; |
|
581 |
+ int32_t iscore, score_end; |
|
582 |
+ assert(k < j); |
|
583 |
+ get_score(char_badness[k], p, m, &chars[k], &chars[k+1], |
|
584 |
+ &iscore, &score_end); |
|
585 |
+ /* give more importance to the score of the characters |
|
586 |
+ * at the beginning */ |
|
587 |
+ /* TODO: tune magic number here */ |
|
588 |
+ if (p < 6) { |
|
589 |
+ iscore *= (6-p); |
|
590 |
+ score_end *= (6-p); |
|
591 |
+ } |
|
592 |
+ score += iscore; |
|
593 |
+ if (score + score_end > best_score) { |
|
594 |
+ /* we may have negative scores, so truncating |
|
595 |
+ * the pattern could actually get us a higher |
|
596 |
+ * score */ |
|
597 |
+ best_score = score + score_end; |
|
598 |
+ best_score_len = p + 2; |
|
599 |
+ best_score_i = i; |
|
600 |
+ assert(i + best_score_len <= j); |
|
601 |
+ } |
|
602 |
+ } |
|
603 |
+ } |
|
604 |
+ |
|
605 |
+ if (best_score <= -0x7fffffff) { |
|
606 |
+ cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score); |
|
607 |
+ return -1; |
|
608 |
+ } |
|
609 |
+ if (choices_cnt == 0) { |
|
610 |
+ cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname); |
|
611 |
+ return -1; |
|
612 |
+ } |
|
613 |
+ assert(best_score_len >= 2); |
|
614 |
+ |
|
615 |
+ cli_dbgmsg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len); |
|
616 |
+ /* Shift-Or like preprocessing */ |
|
617 |
+ assert(1 < best_score_len); |
|
618 |
+ for (i=0;i < best_score_len-1;i++) { |
|
619 |
+ spec0 = &chars[best_score_i + i]; |
|
620 |
+ spec1 = &chars[best_score_i + i + 1]; |
|
621 |
+ /* use overlapping little-endian 2-grams, overlapping because match can start |
|
622 |
+ * at any position (including odd) */ |
|
623 |
+ |
|
624 |
+ for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { |
|
625 |
+ for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { |
|
626 |
+ unsigned char c0 = spec_ith_char(spec0, k0); |
|
627 |
+ unsigned char c1 = spec_ith_char(spec1, k1); |
|
628 |
+ if (!c0 && !c1 && !i) { |
|
629 |
+ cli_warnmsg("filter: subsignature begins with zero: %s\n",pat->virname); |
|
630 |
+ } |
|
631 |
+ filter_set_atpos(m, i, c0 | (c1<<8)); |
|
632 |
+ } |
|
633 |
+ } |
|
634 |
+ } |
|
635 |
+ |
|
636 |
+ j = best_score_len - 2; |
|
637 |
+ for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { |
|
638 |
+ for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { |
|
639 |
+ unsigned char c0 = spec_ith_char(spec0, k0); |
|
640 |
+ unsigned char c1 = spec_ith_char(spec1, k1); |
|
641 |
+ if (!c0 && !c1) { |
|
642 |
+ cli_dbgmsg("filter: subsignature ends with zero: %s\n",pat->virname); |
|
643 |
+ } |
|
644 |
+ filter_set_end(m, j, c0 | (c1<<8)); |
|
645 |
+ } |
|
646 |
+ } |
|
647 |
+ return j+2; |
|
648 |
+} |
|
649 |
+ |
|
650 |
+static const struct match_len_info { /* possible match lengths for one 8-bit filter state */ |

651 |
+ uint8_t shortest; /* 2 + index of the lowest 0 bit of the state byte */ |

652 |
+ uint8_t longest; /* 2 + index of the highest 0 bit of the state byte */ |

653 |
+} match_len[256] = { /* indexed by the state byte, 8 entries per row; 0xff (no 0 bit) maps to {0,0} */ |

654 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

655 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, |

656 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

657 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9}, |

658 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

659 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, |

660 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

661 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{8,9}, |

662 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

663 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, |

664 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

665 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9}, |

666 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

667 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, |

668 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, |

669 |
+ {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{9,9}, |

670 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, |

671 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8}, |

672 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, |

673 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{7,8}, |

674 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, |

675 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8}, |

676 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, |

677 |
+ {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{8,8}, |

678 |
+ {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7}, |

679 |
+ {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{6,7}, |

680 |
+ {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7}, |

681 |
+ {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{7,7}, |

682 |
+ {2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{5,6}, |

683 |
+ {2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{6,6}, |

684 |
+ {2,5},{3,5},{2,5},{4,5},{2,5},{3,5},{2,5},{5,5}, |

685 |
+ {2,4},{3,4},{2,4},{4,4},{2,3},{3,3},{2,2},{0,0} |

686 |
+}; |

687 |
+/* Example: state 11110011 (0xf3) has its 0 bits at positions 2 and 3, so it maps to {4,5}: we may have a match of length min 4, max 5 */ |
|
688 |
+ |
|
689 |
+__hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf) |

690 |
+{ |

691 |
+ size_t j; |

692 |
+ uint8_t state = 0xff; /* all bits set: no state is active yet (a 0 bit marks an active state) */ |

693 |
+ const uint8_t *B = m->B; |

694 |
+ const uint8_t *End = m->end; |

695 |
+ /* Finds the first index j whose 2-gram keeps a state active (0 bit) that End[] also marks |

696 |
+ * as a possible pattern end; stores j in inf->first_match and returns 0, else returns -1. */ |

697 |
+ if (len < 2) return -1; /* a 2-gram needs at least two bytes */ |

698 |
+ /* look for first match */ |

699 |
+ for (j=0; j < len-1;j++) { |

700 |
+ uint8_t match_state_end; |

701 |
+ const uint16_t q0 = cli_readint16( &data[j] ); /* 2-gram starting at data[j] */ |

702 |
+ |

703 |
+ state = (state << 1) | B[q0]; /* advance every state, then apply the mask for this 2-gram */ |

704 |
+ match_state_end = state | End[q0]; /* 0 bit left: active state where q0 may end a pattern */ |

705 |
+ if (match_state_end != 0xff) { |

706 |
+ inf->first_match = j; |

707 |
+ return 0; |

708 |
+ } |

709 |
+ } |

710 |
+ /* no match: inf was not written */ |

711 |
+ return -1; |

712 |
+} |
|
713 |
+ |
|
714 |
+/* Shift-Or style prefilter: this is like a FSM, with multiple active states at the same time. |

715 |
+ * Each bit in "state" is one FSM state (a 0 bit means active); for every 2-gram read from |

716 |
+ * data the B[] bit-mask determines which states can remain active. |

717 |
+ * Returns -1 if len < 2 or nothing matched, else max(j - MAXSOPATLEN, 0) for the first hit at index j. */ |

718 |
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len) |

719 |
+{ |

720 |
+ size_t j; |

721 |
+ uint8_t state = ~0; |

722 |
+ const uint8_t *B = m->B; |

723 |
+ const uint8_t *End = m->end; |

724 |
+ |

725 |
+ /* we use 2-grams, so the buffer length must be higher than 1 */ |

726 |
+ if(len < 2) return -1; |

727 |
+ /* Shift-Or like search algorithm */ |

728 |
+ for(j=0;j < len-1; j++) { |

729 |
+ const uint16_t q0 = cli_readint16( &data[j] ); |

730 |
+ uint8_t match_end; |

731 |
+ state = (state << 1) | B[q0]; |

732 |
+ /* state marks with a 0 bit all active states |

733 |
+ * End[q0] marks with a 0 bit all states where the q-gram 'q0' can end a pattern |

734 |
+ * if we got two 0's at matching positions, it means we encountered a pattern's end */ |

735 |
+ match_end = state | End[q0]; |

736 |
+ if(match_end != 0xff) { |

737 |
+ |

738 |
+ /* a state is active and this 2-gram can finish a pattern in that state: assume match */ |

739 |
+ /* (requiring the End[] bit as well is what reduces false positives) */ |

740 |
+ /* return position of probable match */ |

741 |
+ /* NOTE: the exact pattern length is not derived from the state bits here; the hit index j |

742 |
+ * is moved back by MAXSOPATLEN (clamped at 0), so the result is at most MAXSOPATLEN before the hit */ |

743 |
+ return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0; |

744 |
+ } |

745 |
+ } |

746 |
+ /* no match */ |

747 |
+ return -1; |

748 |
+} |
0 | 749 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,42 @@ |
0 |
+/* |
|
1 |
+ * A fast filter for static patterns. |
|
2 |
+ * |
|
3 |
+ * Copyright (C) 2008 Sourcefire, Inc. |
|
4 |
+ * |
|
5 |
+ * Authors: Török Edvin |
|
6 |
+ * |
|
7 |
+ * This program is free software; you can redistribute it and/or modify |
|
8 |
+ * it under the terms of the GNU General Public License version 2 as |
|
9 |
+ * published by the Free Software Foundation. |
|
10 |
+ * |
|
11 |
+ * This program is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
+ * GNU General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU General Public License |
|
17 |
+ * along with this program; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
19 |
+ * MA 02110-1301, USA. |
|
20 |
+ */ |
|
21 |
+#ifndef FILTER_H |

22 |
+#define FILTER_H |

23 |
+#include "cltypes.h" |

24 |
+struct filter { /* bit tables consulted by filter_search() and filter_search_ext() */ |

25 |
+ uint8_t B[65536]; /* indexed by a 16-bit 2-gram; OR-ed into the shifted state byte */ |

26 |
+ uint8_t end[65536]; /* indexed by a 16-bit 2-gram; a 0 bit marks a state where that 2-gram can end a pattern */ |

27 |
+ unsigned long m; /* NOTE(review): not read by the two search functions; purpose not visible here */ |

28 |
+}; |

29 |
+ |

30 |
+struct filter_match_info { /* result of filter_search_ext() */ |

31 |
+ unsigned long first_match; /* index of the first matching 2-gram; only written when 0 is returned */ |

32 |
+}; |

33 |
+ |

34 |
+struct cli_ac_patt; |

35 |
+void filter_init(struct filter *m); |

36 |
+long filter_search(const struct filter *m, const unsigned char *data, unsigned long len); /* -1: no match or len < 2 */ |

37 |
+int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf); /* 0: match, *inf filled; -1: no match */ |

38 |
+int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name); |

39 |
+int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat); |

40 |
+ |

41 |
+#endif |
... | ... |
@@ -333,7 +333,7 @@ static int ac_maketrans(struct cli_matcher *root) |
333 | 333 |
continue; |
334 | 334 |
for(i = 0; i < 256; i++) { |
335 | 335 |
child = node->trans[i]; |
336 |
- if(!child) { |
|
336 |
+ if (!child || (!IS_FINAL(child) && IS_LEAF(child))) { |
|
337 | 337 |
struct cli_ac_node *failtarget = node->fail; |
338 | 338 |
while(IS_LEAF(failtarget) || !failtarget->trans[i]) |
339 | 339 |
failtarget = failtarget->fail; |
... | ... |
@@ -1132,14 +1132,12 @@ int cli_ac_scanbuff(const unsigned char *buffer, uint32_t length, const char **v |
1132 | 1132 |
current = root->ac_root; |
1133 | 1133 |
|
1134 | 1134 |
for(i = 0; i < length; i++) { |
1135 |
- |
|
1136 |
- if(IS_LEAF(current)) |
|
1137 |
- current = current->fail; |
|
1138 |
- |
|
1139 | 1135 |
current = current->trans[buffer[i]]; |
1140 | 1136 |
|
1141 | 1137 |
if(IS_FINAL(current)) { |
1142 | 1138 |
patt = current->list; |
1139 |
+ if (IS_LEAF(current)) |
|
1140 |
+ current = current->fail; |
|
1143 | 1141 |
while(patt) { |
1144 | 1142 |
bp = i + 1 - patt->depth; |
1145 | 1143 |
if(patt->offdata[0] != CLI_OFF_VERSION && patt->offdata[0] != CLI_OFF_MACRO && !patt->next_same && (patt->offset_min != CLI_OFF_ANY) && (!patt->sigid || patt->partno == 1)) { |
... | ... |
@@ -49,6 +49,22 @@ |
49 | 49 |
#include "pe_icons.h" |
50 | 50 |
#include "regex/regex.h" |
51 | 51 |
|
52 |
+static inline int matcher_run(const struct cli_matcher *root, |

53 |
+ const unsigned char *buffer, uint32_t length, |

54 |
+ const char **virname, struct cli_ac_data *mdata, |

55 |
+ uint32_t offset, |

56 |
+ cli_file_t ftype, |

57 |
+ struct cli_matched_type **ftoffset, |

58 |
+ unsigned int acmode, |

59 |
+ fmap_t *map, |

60 |
+ struct cli_bm_off *offdata) |

61 |
+{ /* runs the BM matcher and, unless it already reported CL_VIRUS, the AC matcher on buffer */ |

62 |
+ int ret; /* result of whichever matcher ran last */ |

63 |
+ if (root->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, root, offset, map, offdata)) != CL_VIRUS) /* BM is skipped entirely when root->ac_only is set */ |

64 |
+ ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, root, mdata, offset, ftype, ftoffset, acmode, NULL); /* the AC result replaces any non-virus BM result */ |

65 |
+ return ret; |

66 |
+} |
|
67 |
+ |
|
52 | 68 |
int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset, cli_ctx *ctx, cli_file_t ftype, struct cli_ac_data **acdata) |
53 | 69 |
{ |
54 | 70 |
int ret = CL_CLEAN; |
... | ... |
@@ -79,8 +95,7 @@ int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset, |
79 | 79 |
if(!acdata && (ret = cli_ac_initdata(&mdata, troot->ac_partsigs, troot->ac_lsigs, troot->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))) |
80 | 80 |
return ret; |
81 | 81 |
|
82 |
- if(troot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, troot, offset, NULL, NULL)) != CL_VIRUS) |
|
83 |
- ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, troot, acdata ? (acdata[0]) : (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL); |
|
82 |
+ ret = matcher_run(troot, buffer, length, virname, acdata ? (acdata[0]): (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL, NULL); |
|
84 | 83 |
|
85 | 84 |
if(!acdata) |
86 | 85 |
cli_ac_freedata(&mdata); |
... | ... |
@@ -92,8 +107,7 @@ int cli_scanbuff(const unsigned char *buffer, uint32_t length, uint32_t offset, |
92 | 92 |
if(!acdata && (ret = cli_ac_initdata(&mdata, groot->ac_partsigs, groot->ac_lsigs, groot->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))) |
93 | 93 |
return ret; |
94 | 94 |
|
95 |
- if(groot->ac_only || (ret = cli_bm_scanbuff(buffer, length, virname, NULL, groot, offset, NULL, NULL)) != CL_VIRUS) |
|
96 |
- ret = cli_ac_scanbuff(buffer, length, virname, NULL, NULL, groot, acdata ? (acdata[1]) : (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL); |
|
95 |
+ ret = matcher_run(groot, buffer, length, virname, acdata ? (acdata[1]): (&mdata), offset, ftype, NULL, AC_SCAN_VIR, NULL, NULL); |
|
97 | 96 |
|
98 | 97 |
if(!acdata) |
99 | 98 |
cli_ac_freedata(&mdata); |
... | ... |
@@ -444,8 +458,8 @@ int cli_fmap_scandesc(cli_ctx *ctx, cli_file_t ftype, uint8_t ftonly, struct cli |
444 | 444 |
*ctx->scanned += bytes / CL_COUNT_PRECISION; |
445 | 445 |
|
446 | 446 |
if(troot) { |
447 |
- if(troot->ac_only || (ret = cli_bm_scanbuff(buff, bytes, ctx->virname, NULL, troot, offset, map, bm_offmode ? &toff : NULL)) != CL_VIRUS) |
|
448 |
- ret = cli_ac_scanbuff(buff, bytes, ctx->virname, NULL, NULL, troot, &tdata, offset, ftype, ftoffset, acmode, NULL); |
|
447 |
+ ret = matcher_run(troot, buff, bytes, ctx->virname, &tdata, offset, ftype, ftoffset, acmode, map, bm_offmode ? &toff : NULL); |
|
448 |
+ |
|
449 | 449 |
if(ret == CL_VIRUS) { |
450 | 450 |
if(!ftonly) |
451 | 451 |
cli_ac_freedata(&gdata); |
... | ... |
@@ -422,6 +422,12 @@ void cli_errmsg(const char *str, ...); |
422 | 422 |
#define always_inline inline |
423 | 423 |
#endif |
424 | 424 |
|
425 |
+#if defined (__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) /* GCC 4.3 or newer */ |

426 |
+#define __hot__ __attribute__((hot)) /* function annotation, used on filter_search_ext() */ |

427 |
+#else |

428 |
+#define __hot__ /* expands to nothing on other compilers */ |

429 |
+#endif |
|
430 |
+ |
|
425 | 431 |
#define cli_dbgmsg (!UNLIKELY(cli_debug_flag)) ? (void)0 : cli_dbgmsg_internal |
426 | 432 |
|
427 | 433 |
#ifdef __GNUC__ |
428 | 434 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,148 @@ |
0 |
+/* |
|
1 |
+ * Gather statistics from performance sensitive code. |
|
2 |
+ * |
|
3 |
+ * Copyright (C) 2008 Sourcefire, Inc. |
|
4 |
+ * |
|
5 |
+ * Authors: Török Edvin |
|
6 |
+ * |
|
7 |
+ * This program is free software; you can redistribute it and/or modify |
|
8 |
+ * it under the terms of the GNU General Public License version 2 as |
|
9 |
+ * published by the Free Software Foundation. |
|
10 |
+ * |
|
11 |
+ * This program is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
+ * GNU General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU General Public License |
|
17 |
+ * along with this program; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
19 |
+ * MA 02110-1301, USA. |
|
20 |
+ */ |
|
21 |
+#ifdef HAVE_CONFIG_H |
|
22 |
+#include "clamav-config.h" |
|
23 |
+#endif |
|
24 |
+ |
|
25 |
+#include "perflogging.h" |
|
26 |
+#include <stdio.h> |
|
27 |
+#ifdef CLI_PERF_LOGGING |
|
28 |
+ |
|
29 |
+__thread int last_flushed = 0; /* explicit int: matches the extern declaration in perflogging.h */ |

30 |
+__thread int cli_perf_registered = 0; /* set to 1 by cli_perf_register() for the calling thread */ |

31 |
+__thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE]; /* per-thread sums, folded into cli_perf_sum by cli_perf_flush() */ |

32 |
+__thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256]; /* per-thread counters, one slot per 8-bit event value */ |

33 |
+ |

34 |
+uint64_t cli_perf_sum[__LAST_SUMABLE]; /* process-wide totals, updated under cli_perf_log_mutex */ |

35 |
+uint64_t cli_perf_count[__LAST_COUNTABLE][256]; /* process-wide counters, updated under cli_perf_log_mutex */ |

36 |
+ |

37 |
+static pthread_key_t thread_exit_key; /* key whose destructor is cli_perf_thread_exit() */ |

38 |
+int pthread_key_create(pthread_key_t *key, void (*destr_function) (void *)); /* NOTE(review): redeclares a <pthread.h> function; looks redundant */ |
|
39 |
+ |
|
40 |
+static void cli_perf_thread_exit(void* arg) /* registered as the destructor of thread_exit_key; arg is unused */ |

41 |
+{ |

42 |
+ /* save this thread's counters into the global totals */ |

43 |
+ cli_perf_flush(); |

44 |
+} |
|
45 |
+ |
|
46 |
+void __attribute__((constructor)) __cli_perf_init(void) /* constructor attribute: runs automatically at load time */ |

47 |
+{ |

48 |
+ pthread_key_create(&thread_exit_key, cli_perf_thread_exit); /* NOTE(review): return value is not checked */ |

49 |
+} |
|
50 |
+ |
|
51 |
+void __attribute__((destructor)) __cli_perf_exit(void) /* destructor attribute: runs automatically at unload/exit */ |

52 |
+{ |

53 |
+ cli_perf_thread_exit(NULL); /* flush the counters of the thread running the destructor */ |

54 |
+} |
|
55 |
+ |
|
56 |
+static int dummy; /* only its address is used, as a non-NULL thread-specific value */ |

57 |
+void cli_perf_register(void) /* called by cli_perf_enter() while cli_perf_registered is still 0 */ |

58 |
+{ |

59 |
+ /* set a fake (non-NULL) value for the key, so that the key's destructor gets called at thread exit */ |

60 |
+ pthread_setspecific(thread_exit_key, &dummy); |

61 |
+ cli_perf_registered = 1; |

62 |
+} |
|
63 |
+ |
|
64 |
+static const char *perf_log_names_sum[__LAST_SUMABLE] = { /* labels printed by cli_perf_print(), in enum perf_log_sumable order */ |

65 |
+ "raw scanned", /* RAW_BYTES_SCANNED */ |

66 |
+ "filter scanned", /* FILTER_BYTES_SCANNED */ |

67 |
+ "AC scanned", /* AC_SCANNED */ |

68 |
+ "BM scanned" /* BM_SCANNED */ |

69 |
+}; |
|
70 |
+ |
|
71 |
+static const char *perf_log_names_cnt[__LAST_COUNTABLE] = { /* labels printed by cli_perf_print(), in enum perf_log_countable order */ |

72 |
+ "trie bytes scanned", /* TRIE_SCANNED */ |

73 |
+ "filter position load", /* FILTER_LOAD */ |

74 |
+ "filter end load", /* FILTER_END_LOAD */ |

75 |
+ "trie pattern original length" /* TRIE_ORIG_LEN */ |

76 |
+}; |
|
77 |
+ |
|
78 |
+#define NONE __LAST_SUMABLE /* sentinel: no percentage is printed for this entry */ |

79 |
+static const enum perf_log_sumable perf_log_percent[__LAST_SUMABLE] = { /* per sum: which sum it is shown as a percentage of */ |

80 |
+ NONE, |

81 |
+ RAW_BYTES_SCANNED, |

82 |
+ RAW_BYTES_SCANNED, |

83 |
+ RAW_BYTES_SCANNED, |

84 |
+}; |

85 |
+ |

86 |
+static const enum perf_log_sumable perf_log_percent_cnt[__LAST_COUNTABLE] = { /* per counter: the sum used as its percentage base; entries index cli_perf_sum, hence perf_log_sumable */ |

87 |
+ RAW_BYTES_SCANNED, |

88 |
+ NONE, |

89 |
+ NONE, |

90 |
+ NONE, |

91 |
+}; |
|
92 |
+ |
|
93 |
+static void cli_perf_print(void) |

94 |
+{ |

95 |
+ enum perf_log_sumable i; |

96 |
+ enum perf_log_countable j; |

97 |
+ unsigned k; |

98 |
+ |

99 |
+ /* Prints the global sums (in MB) and every non-zero counter to stdout; the caller holds cli_perf_log_mutex. */ |

100 |
+ const double MEGA = 1024*1024.0; |

101 |
+ |

102 |
+ /* in multiscan mode multiple threads can output, so output a unique id |

103 |
+ * here: the address of this thread's cli_perf_registered (cast because %p expects void *) */ |

104 |
+ printf("PERF: %p\n", (void *)&cli_perf_registered); |

105 |
+ for(i=0;i<__LAST_SUMABLE;i++) { |

106 |
+ printf("PERF: %s: %g MB", perf_log_names_sum[i], cli_perf_sum[i] / MEGA); |

107 |
+ if (perf_log_percent[i] != NONE) |

108 |
+ printf("(%6.3f%%)", 100.0*cli_perf_sum[i] / cli_perf_sum[perf_log_percent[i]]); |

109 |
+ printf("\n"); |

110 |
+ } |

111 |
+ printf("\n"); |

112 |
+ for(j=0;j<__LAST_COUNTABLE;j++) { |

113 |
+ printf("PERF: %s: ", perf_log_names_cnt[j]); |

114 |
+ for (k=0;k<256;k++) |

115 |
+ if (cli_perf_count[j][k]) { |

116 |
+ printf(" %u -> %llu", k, (unsigned long long)cli_perf_count[j][k]); /* cast so the argument type matches the conversion */ |

117 |
+ if (perf_log_percent_cnt[j] != NONE) |

118 |
+ printf("(%6.3f%%)", 100.0*cli_perf_count[j][k] / cli_perf_sum[perf_log_percent_cnt[j]]); |

119 |
+ } |

120 |
+ printf("\n"); |

121 |
+ } |

122 |
+ printf("\n"); |

123 |
+} |
|
124 |
+ |
|
125 |
+static pthread_mutex_t cli_perf_log_mutex = PTHREAD_MUTEX_INITIALIZER; /* held while the global totals are updated and printed */ |

126 |
+void cli_perf_flush(void) /* folds the calling thread's counters into the globals, zeroes them, then prints the totals */ |

127 |
+{ |

128 |
+ unsigned i, j; |

129 |
+ |

130 |
+ pthread_mutex_lock(&cli_perf_log_mutex); |

131 |
+ |

132 |
+ for (i = 0; i < __LAST_SUMABLE; i++) { |

133 |
+ cli_perf_sum[i] += cli_perf_sum_tls[i]; |

134 |
+ cli_perf_sum_tls[i] = 0; /* reset so the same bytes are not added again on the next flush */ |

135 |
+ } |

136 |
+ |

137 |
+ for (i = 0; i < __LAST_COUNTABLE; i++) { |

138 |
+ for (j = 0; j < 256; j++) { |

139 |
+ cli_perf_count[i][j] += cli_perf_count_tls[i][j]; |

140 |
+ cli_perf_count_tls[i][j] = 0; |

141 |
+ } |

142 |
+ } |

143 |
+ |

144 |
+ cli_perf_print(); /* printed while the mutex is still held */ |

145 |
+ pthread_mutex_unlock(&cli_perf_log_mutex); |

146 |
+} |
|
147 |
+#endif |
0 | 148 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,101 @@ |
0 |
+/* |
|
1 |
+ * Gather statistics from performance sensitive code. |
|
2 |
+ * |
|
3 |
+ * Copyright (C) 2008 Sourcefire, Inc. |
|
4 |
+ * |
|
5 |
+ * Authors: Török Edvin |
|
6 |
+ * |
|
7 |
+ * This program is free software; you can redistribute it and/or modify |
|
8 |
+ * it under the terms of the GNU General Public License version 2 as |
|
9 |
+ * published by the Free Software Foundation. |
|
10 |
+ * |
|
11 |
+ * This program is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
+ * GNU General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU General Public License |
|
17 |
+ * along with this program; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
19 |
+ * MA 02110-1301, USA. |
|
20 |
+ */ |
|
21 |
+#ifndef PERFLOGGING_H |
|
22 |
+#define PERFLOGGING_H |
|
23 |
+ |
|
24 |
+/* this is a compile-time selectable, default off module to log certain |
|
25 |
+ * statistics, such as which tries are used, efficiency of filtering and so on. |
|
26 |
+ * it must have as little overhead as possible */ |
|
27 |
+ |
|
28 |
+//#define CLI_PERF_LOGGING |
|
29 |
+#ifdef CLI_PERF_LOGGING |
|
30 |
+ |
|
31 |
+#ifndef __GNUC__ |
|
32 |
+#error "Performance logging requires GNU C compatible compiler" |
|
33 |
+#else |
|
34 |
+/*TODO: maybe we need a GCC version check too here */ |
|
35 |
+#include <pthread.h> |
|
36 |
+#include "cltypes.h" |
|
37 |
+ |
|
38 |
+enum perf_log_sumable { /* indexes cli_perf_sum[] and cli_perf_sum_tls[] */ |

39 |
+ RAW_BYTES_SCANNED, /* printed as "raw scanned" */ |

40 |
+ FILTER_BYTES_SCANNED, /* printed as "filter scanned" */ |

41 |
+ AC_SCANNED, /* printed as "AC scanned" */ |

42 |
+ BM_SCANNED, /* printed as "BM scanned" */ |

43 |
+ __LAST_SUMABLE /* number of entries; used as array size */ |

44 |
+}; |
|
45 |
+ |
|
46 |
+enum perf_log_countable { /* first index of cli_perf_count[][256] and cli_perf_count_tls[][256] */ |

47 |
+ TRIE_SCANNED, /* printed as "trie bytes scanned" */ |

48 |
+ FILTER_LOAD, /* printed as "filter position load" */ |

49 |
+ FILTER_END_LOAD, /* printed as "filter end load" */ |

50 |
+ TRIE_ORIG_LEN, /* printed as "trie pattern original length" */ |

51 |
+ __LAST_COUNTABLE /* number of entries; used as array size */ |

52 |
+}; |
|
53 |
+ |
|
54 |
+extern __thread int last_flushed; /* value of the RAW_BYTES_SCANNED thread sum recorded after the last flush */ |

55 |
+extern __thread int cli_perf_registered; /* non-zero once cli_perf_register() ran in this thread */ |

56 |
+extern __thread uint64_t cli_perf_sum_tls[__LAST_SUMABLE]; |

57 |
+extern __thread uint64_t cli_perf_count_tls[__LAST_COUNTABLE][256]; |

58 |
+/* (the second, identical declaration of last_flushed was dropped as redundant) */ |

59 |
+ |

60 |
+extern uint64_t cli_perf_sum[__LAST_SUMABLE]; /* global totals, written by cli_perf_flush() */ |

61 |
+extern uint64_t cli_perf_count[__LAST_COUNTABLE][256]; |

62 |
+ |

63 |
+void cli_perf_register(void); |

64 |
+void cli_perf_flush(void); |
|
65 |
+ |
|
66 |
+static inline void cli_perf_enter(void) /* called at the start of every logging helper below */ |

67 |
+{ |

68 |
+ if (!cli_perf_registered) cli_perf_register(); /* first use in this thread */ |

69 |
+ if (cli_perf_sum_tls[RAW_BYTES_SCANNED] - last_flushed > 100*1024*1024) { /* more than 100 MB logged since the recorded value */ |

70 |
+ cli_perf_flush(); |

71 |
+ last_flushed = cli_perf_sum_tls[RAW_BYTES_SCANNED]; /* NOTE(review): uint64_t stored into an int; cli_perf_flush() zeroes the thread sums first */ |

72 |
+ } |

73 |
+} |
|
74 |
+ |
|
75 |
+static inline void cli_perf_log_add(enum perf_log_sumable kind, uint64_t add) /* adds `add` to this thread's sum for `kind` */ |

76 |
+{ |

77 |
+ cli_perf_enter(); |

78 |
+ assert( kind < __LAST_SUMABLE); /* NOTE(review): <assert.h> is not included by this header */ |

79 |
+ cli_perf_sum_tls[kind] += add; |

80 |
+} |
|
81 |
+ |
|
82 |
+static inline void cli_perf_log_count2(enum perf_log_countable kind, uint8_t event, uint64_t cnt) /* adds `cnt` to this thread's counter [kind][event] */ |

83 |
+{ |

84 |
+ cli_perf_enter(); |

85 |
+ assert( kind < __LAST_COUNTABLE); /* NOTE(review): <assert.h> is not included by this header */ |

86 |
+ cli_perf_count_tls[kind][event] += cnt; /* event is 8 bits wide, so it always fits the 256-entry row */ |

87 |
+} |
|
88 |
+ |
|
89 |
+static inline void cli_perf_log_count(enum perf_log_countable kind, uint8_t event) /* counts one occurrence of `event` */ |

90 |
+{ |

91 |
+ cli_perf_log_count2(kind, event, 1); |

92 |
+} |
|
93 |
+ |
|
94 |
+#endif |
|
95 |
+ |
|
96 |
+#else |
|
97 |
+#define cli_perf_log_count(a,b) do {} while(0) /* no-op when CLI_PERF_LOGGING is off; NOTE(review): cli_perf_log_add and cli_perf_log_count2 get no such fallback here */ |
|
98 |
+#endif |
|
99 |
+ |
|
100 |
+#endif |
... | ... |
@@ -65,104 +65,6 @@ static int add_pattern_suffix(void *cbdata, const char *suffix, size_t suffix_le |
65 | 65 |
static int add_static_pattern(struct regex_matcher *matcher, char* pattern); |
66 | 66 |
/* ---------- */ |
67 | 67 |
|
68 |
-/* ----- shift-or filtering -------------- */ |
|
69 |
- |
|
70 |
-#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f))) |
|
71 |
-#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f))) |
|
72 |
- |
|
73 |
-static void SO_init(struct filter *m) |
|
74 |
-{ |
|
75 |
- memset(m->B, ~0, sizeof(m->B)); |
|
76 |
- memset(m->end, ~0, sizeof(m->end)); |
|
77 |
- memset(m->end_fast, ~0, sizeof(m->end_fast)); |
|
78 |
-} |
|
79 |
- |
|
80 |
-/* because we use uint32_t */ |
|
81 |
-#define MAXSOPATLEN 32 |
|
82 |
- |
|
83 |
-/* merge another pattern into the filter |
|
84 |
- * add('abc'); add('bcd'); will match [ab][bc][cd] */ |
|
85 |
-static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len) |
|
86 |
-{ |
|
87 |
- uint16_t q; |
|
88 |
- uint8_t j; |
|
89 |
- |
|
90 |
- /* cut length, and make it modulo 2 */ |
|
91 |
- if(len > MAXSOPATLEN) { |
|
92 |
- len = MAXSOPATLEN; |
|
93 |
- } else { |
|
94 |
- /* we use 2-grams, must be multiple of 2 */ |
|
95 |
- len = len & ~1; |
|
96 |
- } |
|
97 |
- if(!len) |
|
98 |
- return 0; |
|
99 |
- |
|
100 |
- /* Shift-Or like preprocessing */ |
|
101 |
- for(j=0;j < len-1;j++) { |
|
102 |
- /* use overlapping 2-grams. We need them overlapping because matching can start at any position */ |
|
103 |
- q = cli_readint16( &pattern[j] ); |
|
104 |
- m->B[q] &= ~(1 << j); |
|
105 |
- } |
|
106 |
- /* we use variable length patterns, use last character to mark pattern end, |
|
107 |
- * can lead to false positives.*/ |
|
108 |
- /* mark that at state j, the q-gram q can end the pattern */ |
|
109 |
- if(j) { |
|
110 |
- j--; |
|
111 |
- m->end[q] &= ~(1 << j); |
|
112 |
- m->end_fast[pattern[j+1]] &= ~(1<<j); |
|
113 |
- } |
|
114 |
- return 0; |
|
115 |
-} |
|
116 |
- |
|
117 |
-/* this is like a FSM, with multiple active states at the same time. |
|
118 |
- * each bit in "state" means an active state, when a char is encountered |
|
119 |
- * we determine what states can remain active. |
|
120 |
- * The FSM transition rules are expressed as bit-masks */ |
|
121 |
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len) |
|
122 |
-{ |
|
123 |
- size_t j; |
|
124 |
- uint32_t state = ~0; |
|
125 |
- const uint32_t *B = m->B; |
|
126 |
- const uint32_t *End = m->end; |
|
127 |
- const uint32_t *EndFast = m->end_fast; |
|
128 |
- |
|
129 |
- /* cut length, and make it modulo 2 */ |
|
130 |
- if(len > MAXSOPATLEN) { |
|
131 |
- len = MAXSOPATLEN; |
|
132 |
- } else { |
|
133 |
- /* we use 2-grams, must be multiple of 2 */ |
|
134 |
- len = len & ~1; |
|
135 |
- } |
|
136 |
- if(!len) return -1; |
|
137 |
- /* Shift-Or like search algorithm */ |
|
138 |
- for(j=0;j < len-1; j++) { |
|
139 |
- const uint16_t q0 = cli_readint16( &data[j] ); |
|
140 |
- uint32_t match_end; |
|
141 |
- state = (state << 1) | B[q0]; |
|
142 |
- /* state marks with a 0 bit all active states |
|
143 |
- * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern |
|
144 |
- * if we got two 0's at matching positions, it means we encountered a pattern's end */ |
|
145 |
- match_end = state | EndFast[data[j+1]]; |
|
146 |
- if((match_end != 0xffffffff) && (state | End[q0]) != 0xffffffff) { |
|
147 |
- /* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if |
|
148 |
- * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working |
|
149 |
- * set inside L2 */ |
|
150 |
- |
|
151 |
- /* if state is reachable, and this character can finish a pattern, assume match */ |
|
152 |
- /* to reduce false positives check if qgram can finish the pattern */ |
|
153 |
- /* return position of probable match */ |
|
154 |
- /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the |
|
155 |
- * longest pattern that could match */ |
|
156 |
- return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0; |
|
157 |
- } |
|
158 |
- } |
|
159 |
- /* no match */ |
|
160 |
- return -1; |
|
161 |
-} |
|
162 |
- |
|
163 |
-/* ----------------------------------------------------------- */ |
|
164 |
- |
|
165 |
- |
|
166 | 68 |
#define MATCH_SUCCESS 0 |
167 | 69 |
#define MATCH_FAILED -1 |
168 | 70 |
|
... | ... |
@@ -296,7 +198,7 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di |
296 | 296 |
if(!bufrev) |
297 | 297 |
return CL_EMEM; |
298 | 298 |
reverse_string(bufrev); |
299 |
- rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1; |
|
299 |
+ rc = filter_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1; |
|
300 | 300 |
if(rc == -1) { |
301 | 301 |
free(buffer); |
302 | 302 |
free(bufrev); |
... | ... |
@@ -381,7 +283,7 @@ int init_regex_list(struct regex_matcher* matcher) |
381 | 381 |
if((rc = cli_bm_init(&matcher->hostkey_prefix))) { |
382 | 382 |
return rc; |
383 | 383 |
} |
384 |
- SO_init(&matcher->filter); |
|
384 |
+ filter_init(&matcher->filter); |
|
385 | 385 |
return CL_SUCCESS; |
386 | 386 |
} |
387 | 387 |
|
... | ... |
@@ -697,7 +599,7 @@ static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, |
697 | 697 |
mpool_free(matcher->mempool, new); |
698 | 698 |
return ret; |
699 | 699 |
} |
700 |
- SO_preprocess_add(&matcher->filter, (const unsigned char*)suffix, len); |
|
700 |
+ filter_add_static(&matcher->filter, (const unsigned char*)suffix, len, "regex"); |
|
701 | 701 |
return CL_SUCCESS; |
702 | 702 |
} |
703 | 703 |
|
... | ... |
@@ -27,17 +27,11 @@ |
27 | 27 |
#include "phishcheck.h" |
28 | 28 |
#include "readdb.h" |
29 | 29 |
#include "matcher.h" |
30 |
+#include "filtering.h" |
|
30 | 31 |
#include <zlib.h> /* for gzFile */ |
31 | 32 |
|
32 | 33 |
#include "mpool.h" |
33 | 34 |
|
34 |
-struct filter { |
|
35 |
- uint32_t B[65536]; |
|
36 |
- uint32_t end_fast[256]; |
|
37 |
- uint32_t end[65536]; |
|
38 |
- unsigned long m; |
|
39 |
-}; |
|
40 |
- |
|
41 | 35 |
struct regex_list_ht { |
42 | 36 |
struct regex_list *head; |
43 | 37 |
struct regex_list *tail; |
... | ... |
@@ -71,7 +65,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int *sign |
71 | 71 |
void regex_list_cleanup(struct regex_matcher* matcher); |
72 | 72 |
void regex_list_done(struct regex_matcher* matcher); |
73 | 73 |
int is_regex_ok(struct regex_matcher* matcher); |
74 |
-long SO_search(const struct filter *m, const unsigned char *data, unsigned long len); |
|
75 | 74 |
|
76 | 75 |
#endif |
77 | 76 |
|
... | ... |
@@ -30,6 +30,7 @@ |
30 | 30 |
#include "../libclamav/matcher.h" |
31 | 31 |
#include "../libclamav/matcher-ac.h" |
32 | 32 |
#include "../libclamav/matcher-bm.h" |
33 |
+#include "../libclamav/others.h" |
|
33 | 34 |
#include "../libclamav/default.h" |
34 | 35 |
#include "checks.h" |
35 | 36 |
|
... | ... |
@@ -46,19 +47,44 @@ static const struct ac_testdata_s { |
46 | 46 |
{ "abdcabcddabccadbbdbacb", "6463{2-3}64646162(63|64|65)6361*6462????6261{-1}6362", "Test_5" }, |
47 | 47 |
{ "abcdefghijkabcdefghijk", "62????65666768*696a6b6162{2-3}656667[1-3]6b", "Test_6" }, |
48 | 48 |
{ "abcadbabcadbabcacb", "6?6164?26?62{3}?26162?361", "Test_7" }, |
49 |
+ /* testcase for filter bug: it was checking only first 32 chars, and last |
|
50 |
+ * maxpatlen */ |
|
51 |
+ { "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1dddddddddddddddddddd5\1\1\1\1\1\1\1\1\1\1\1\1\1","6464646464646464646464646464646464646464(35|36)","Test_8"}, |
|
49 | 52 |
|
50 | 53 |
{ NULL, NULL, NULL} |
51 | 54 |
}; |
52 | 55 |
|
53 |
-START_TEST (test_ac_scanbuff) { |
|
56 |
+ |
|
57 |
+static cli_ctx ctx; |
|
58 |
+static const char *virname = NULL; |
|
59 |
+static void setup(void) /* fixture: creates ctx.engine and installs an empty matcher as root[0] */ |

60 |
+{ |

54 | 61 |
 struct cli_matcher *root; |
62 |
+ virname = NULL; |

63 |
+ ctx.virname = &virname; |

64 |
+ ctx.engine = cl_engine_new(); |

65 |
+ fail_unless(!!ctx.engine, "cl_engine_new() failed"); |

66 |
+ root = (struct cli_matcher *) mpool_calloc(ctx.engine->mempool, 1, sizeof(struct cli_matcher)); /* allocated from the engine's pool */ |

67 |
+ fail_unless(root != NULL, "root == NULL"); |

68 |
+#ifdef USE_MPOOL |

69 |
+ root->mempool = ctx.engine->mempool; |

70 |
+#endif |

71 |
+ |

72 |
+ ctx.engine->root[0] = root; /* test_ac_scanbuff reads the matcher back from here */ |

73 |
+} |
|
74 |
+ |
|
75 |
+static void teardown(void) |
|
76 |
+{ |
|
77 |
+ cl_engine_free((struct cl_engine*)ctx.engine); |
|
78 |
+} |
|
79 |
+ |
|
80 |
+START_TEST (test_ac_scanbuff) { |
|
55 | 81 |
struct cli_ac_data mdata; |
56 |
- const char *virname = NULL; |
|
82 |
+ struct cli_matcher *root; |
|
57 | 83 |
unsigned int i; |
58 | 84 |
int ret; |
59 | 85 |
|
60 |
- |
|
61 |
- root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher)); |
|
86 |
+ root = ctx.engine->root[0]; |
|
62 | 87 |
fail_unless(root != NULL, "root == NULL"); |
63 | 88 |
root->ac_only = 1; |
64 | 89 |
|
... | ... |
@@ -68,6 +94,7 @@ START_TEST (test_ac_scanbuff) { |
68 | 68 |
ret = cli_ac_init(root, CLI_DEFAULT_AC_MINDEPTH, CLI_DEFAULT_AC_MAXDEPTH); |
69 | 69 |
fail_unless(ret == CL_SUCCESS, "cli_ac_init() failed"); |
70 | 70 |
|
71 |
+ |
|
71 | 72 |
for(i = 0; ac_testdata[i].data; i++) { |
72 | 73 |
ret = cli_parse_add(root, ac_testdata[i].virname, ac_testdata[i].hexsig, 0, 0, "*", 0, NULL, 0); |
73 | 74 |
fail_unless(ret == CL_SUCCESS, "cli_parse_add() failed"); |
... | ... |
@@ -83,14 +110,13 @@ START_TEST (test_ac_scanbuff) { |
83 | 83 |
ret = cli_ac_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), &virname, NULL, NULL, root, &mdata, 0, 0, NULL, AC_SCAN_VIR, NULL); |
84 | 84 |
fail_unless_fmt(ret == CL_VIRUS, "cli_ac_scanbuff() failed for %s", ac_testdata[i].virname); |
85 | 85 |
fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname); |
86 |
+ |
|
87 |
+ ret = cli_scanbuff(ac_testdata[i].data, strlen(ac_testdata[i].data), 0, &ctx, 0, NULL); |
|
88 |
+ fail_unless_fmt(ret == CL_VIRUS, "cli_scanbuff() failed for %s", ac_testdata[i].virname); |
|
89 |
+ fail_unless_fmt(!strncmp(virname, ac_testdata[i].virname, strlen(ac_testdata[i].virname)), "Dataset %u matched with %s", i, virname); |
|
86 | 90 |
} |
87 | 91 |
|
88 | 92 |
cli_ac_freedata(&mdata); |
89 |
- cli_ac_free(root); |
|
90 |
-#ifdef USE_MPOOL |
|
91 |
- mpool_destroy(root->mempool); |
|
92 |
-#endif |
|
93 |
- free(root); |
|
94 | 93 |
} |
95 | 94 |
END_TEST |
96 | 95 |
|
... | ... |
@@ -100,7 +126,7 @@ START_TEST (test_bm_scanbuff) { |
100 | 100 |
int ret; |
101 | 101 |
|
102 | 102 |
|
103 |
- root = (struct cli_matcher *) cli_calloc(1, sizeof(struct cli_matcher)); |
|
103 |
+ root = ctx.engine->root[0]; |
|
104 | 104 |
fail_unless(root != NULL, "root == NULL"); |
105 | 105 |
|
106 | 106 |
#ifdef USE_MPOOL |
... | ... |
@@ -119,11 +145,6 @@ START_TEST (test_bm_scanbuff) { |
119 | 119 |
ret = cli_bm_scanbuff("blah\xde\xad\xbe\xef", 12, &virname, NULL, root, 0, NULL, NULL); |
120 | 120 |
fail_unless(ret == CL_VIRUS, "cli_bm_scanbuff() failed"); |
121 | 121 |
fail_unless(!strncmp(virname, "Sig2", 4), "Incorrect signature matched in cli_bm_scanbuff()\n"); |
122 |
- cli_bm_free(root); |
|
123 |
-#ifdef USE_MPOOL |
|
124 |
- mpool_destroy(root->mempool); |
|
125 |
-#endif |
|
126 |
- free(root); |
|
127 | 122 |
} |
128 | 123 |
END_TEST |
129 | 124 |
|
... | ... |
@@ -133,6 +154,7 @@ Suite *test_matchers_suite(void) |
133 | 133 |
TCase *tc_matchers; |
134 | 134 |
tc_matchers = tcase_create("matchers"); |
135 | 135 |
suite_add_tcase(s, tc_matchers); |
136 |
+ tcase_add_checked_fixture (tc_matchers, setup, teardown); |
|
136 | 137 |
tcase_add_test(tc_matchers, test_ac_scanbuff); |
137 | 138 |
tcase_add_test(tc_matchers, test_bm_scanbuff); |
138 | 139 |
return s; |