/* * A fast filter for static patterns. * * Copyright (C) 2008 Sourcefire, Inc. * * Authors: Török Edvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #if HAVE_CONFIG_H #include "clamav-config.h" #endif #include "filtering.h" #include "matcher-ac.h" #include #include #include "perflogging.h" /* ----- shift-or filtering -------------- */ /* * Description of algorithm: * * Multiple patterns are added to the filter. * The filter retains an approximation of these patterns, which can lead to * false positive matches, but not false negative matches. * * For each position in the filter we retain what qgrams can match at that * position, for example (if we'd use characters as qgrams): * pattern1: atu * pattern2: bzf * pattern3: xat * * filter accepts: * [abx][tza][uft] * * But it also accepts (false positives): * azu, azf, azt, ... * * It doesn't however accept: * aaa, atz, ... * * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=32) states, * each active bit meaning that a state is active. * * The states are activated sequentially, eachtransition decision is made * considering if we can accept the character at position X. * Since we can start a match at any position, position 0 is * reactivated each time. * When the last position is activated, the filter reports a match. * If we can't accept the character at position X, the state remains inactive, * and further states aren't activated (unless we activate this state in the * future). * * Essentially this is an automaton like this: * * /\ (a|b|x) (t|z|a) (u|f|t) * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match * \_______________/ | * \_____________________________/ * * * But we are tracking multiple active states at each time (or run N automatons * in parallel if you like, N = number of states). * * We can have S3 and S2 active, meaning that if the next character is * acceptable, it transitions to S1,S3 and S4 being active, otherwise it * transitions to S1 being active. * * Active states can either be represented as a binary 1 or 0, and using * bit-shifting and masking. * If we choose 1, we must use &, and after shifting always reactivate bit 0. * If we choose 0, we must use |, and after shifting we don't need to do * anything (since by shifting a 0 is implicitly introduced). * * This file implements the latter (shift-or) method. * * The discussion above considered pattern to be of same length (or truncated to * be so). In reality patterns are of variable length, and we often have short * pattern. * * Thus another bitmap was introduced, meaning that if (end[Q] == set), then * a pattern can end at this position. * Also we would fill the pattern's position filters quite quickly with only 256 * choices for a position, so the algorithm uses overlapping qgrams of length 2: * 'abcd' is 3 qgrams: 'ab','bc','cd' * * The algorithm is very sensitive to the end[Q] filter, since it can have false * positives due to short patterns! * For optimal performance we need: * - patterns as long as possible * - probability for end[Q] to match low (avoid 0000, and other common case * - choose the most "diverse" subset from a long pattern * * diverse = refering to what we are scanning, so that the filter rarely * matches, so this actually means that we *want* to avoid adding more * characters to the filter, if we have 2 patterns: * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we * don't add new character at the beginning. * * With NDB signatures there are more challenges to overcome: * e8??0000000aa * * will make the filter accept: * e8, 00, ... 000000aa * * We should delay the pattern end as long as possible, especially if it is 0000 * The problem is that now the filter accepts 0000 on position 3, regardless * of what we have on position 1 (even if we have something else than e8), so * we have to be very careful not to allow 0000 on first position too, * otherwise the filter will happily accept 000000000000. * * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram * based (fits L2), both must match for the filter to consider it a match. * * */ /*#define DETAILED_DEBUG*/ #ifdef DETAILED_DEBUG #define detailed_dbg cli_dbgmsg #else #define detailed_dbg(...) #endif #define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f))) #define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f))) void filter_init(struct filter *m) { memset(m->B, ~0, sizeof(m->B)); memset(m->end, ~0, sizeof(m->end)); } /* because we use uint32_t */ #define MAXSOPATLEN 8 static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val) { return !(m->B[val] & (1<B[val] &= ~(1<end[a] & (1<end[a] &= ~(1 << pos); } } #define MAX_CHOICES 8 /* just an arbitrary limit, if patterns are longer, we cut * the filter can only use MAXSOPATLEN (32) characters, * this longer buffer is needed so that we can choose the "best" subpattern from * it */ #define MAXPATLEN 255 /* merge another pattern into the filter * add('abc'); add('bcd'); will match [ab][bc][cd] */ int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name) { uint16_t q = 0; uint8_t j, maxlen; uint32_t best = 0xffffffff; uint8_t best_pos = 0; cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len); /* TODO: choose best among MAXCHOICES */ /* cut length */ if(len > MAXPATLEN) { len = MAXPATLEN; } if(len < 2) return -1; /* we want subsigs to be as long as possible */ if (len > 4) { maxlen = len - 4; if (maxlen == 1) maxlen = 2; } else maxlen = 2; for(j=0;(best < 100 && j len) break; for(k=j;k MAXSOPATLEN) { len = MAXSOPATLEN; } /* Shift-Or like preprocessing */ for(j=0;j < len-1;j++) { /* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */ q = cli_readint16( &pattern[j] ); filter_set_atpos(m, j, q); } /* we use variable length patterns, use last character to mark pattern end, * can lead to false positives.*/ /* mark that at state j, the q-gram q can end the pattern */ if(j) { j--; filter_set_end(m, j, q); } return j+2; } struct char_spec { /* if non-null i-th character = alt[start + step*i]; start+step*i < end; */ struct cli_ac_special *alt; uint8_t start; uint8_t end; uint8_t step; uint8_t negative; }; static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i) { const struct cli_ac_special *alt = spec->alt; if (alt) { assert (alt->type == 1); assert (i < alt->num); return alt->str[i]; } return i; } static const struct char_spec full_range = {NULL, 0,0xff,1,0}; static inline int spec_is_fullrange(const struct char_spec *spec0, const struct char_spec *spec1) { return !memcmp(spec0, &full_range, sizeof(full_range)) && !memcmp(spec1, &full_range, sizeof(full_range)); } #ifndef MIN #define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif #define SPEC_FOREACH(spec0, k0, spec1, k1) do {\ unsigned char c0 = spec_ith_char(spec0, k0);\ unsigned char c1 = spec_ith_char(spec1, k1);\ unsigned c0end, c1end, cc0,cc1;\ c0end = spec0->negative ? 255 : c0;\ c1end = spec1->negative ? 255 : c1;\ cc0 = spec0->negative ? 0 : c0;\ cc1 = spec1->negative ? 0 : c1;\ for (;cc0 <= c0end;cc0++) {\ for (;cc1 <= c1end; cc1++) {\ uint16_t a = cc0 | (cc1<<8);\ if (spec0->negative && cc0 == c0)\ continue;\ if (spec1->negative && cc1 == c1)\ continue; #define SPEC_END_FOR }}} while(0) enum badness { reject, /* try to avoid if possible */ avoid_first, avoid_anywhere, /* includes avoid_first! */ /* not that bad, but still not best */ dontlike, acceptable, like }; static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end) { int32_t base; unsigned k0, k1, num_introduced = 0, num_end_introduced = 0; switch (badness) { case reject: /* not reached */ assert(0); base = -0x7fffff; break; case avoid_first: if (!i) base = -0x700000; else base = 0; break; case avoid_anywhere: if (!i) base = -0x720000; else base = -0x1000; break; case dontlike: base = 0; break; case acceptable: base = 0x200; break; case like: /* a bit better only */ base = 0x201; break; } if (base < 0) { *score = base; *score_end = base; return; } /* at most 256 iterations here, otherwise base would be negative */ for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { SPEC_FOREACH(spec0, k0, spec1, k1) { num_introduced += filter_isset(m, i, a); num_end_introduced += filter_end_isset(m, i, a); } SPEC_END_FOR; } } *score = base - num_introduced; *score_end = base - num_end_introduced; if (badness == avoid_first && i) { /* what is bad to begin with, is bad at end too */ *score_end -= 0x1000; } } struct choice { enum badness base; unsigned begin; unsigned len; }; static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness) { struct choice *choice; int i_neg = -1; assert(ie < MAXPATLEN); if (ie < i+1) return; if (*cnt >= MAX_CHOICES) return; if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) { unsigned j; /* replace very bad picks if we're full */ for (j=0;j<*cnt;j++) { if (choices[j].base < badness) { if (i_neg == -1 || choices[j].base < choices[i_neg].base) { i_neg = j; } } } } if (i_neg != -1) { choice = &choices[i_neg]; } else { choice = &choices[(*cnt)++]; } choice->begin = i; choice->len = ie - i + 1; choice->base = badness; } static inline int32_t spec_iter(const struct char_spec *spec) { unsigned count; assert(spec->step); count = (1 + spec->end - spec->start)/spec->step; if (spec->negative) /* all chars except itself are added */ count *= 254; return count; } int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat) { unsigned i, j = 0, stop = 0, l=0; uint16_t k0, k1; struct char_spec chars[MAXPATLEN]; enum badness char_badness[MAXPATLEN]; unsigned char patc[MAXPATLEN]; unsigned altcnt = 0; int32_t best_score = -0x7fffffff; unsigned best_score_i = 0; unsigned best_score_len = 0; struct char_spec *spec0 = NULL, *spec1 = NULL; struct choice choices[MAX_CHOICES]; unsigned choices_cnt = 0; unsigned prefix_len = pat->prefix_length; unsigned speci; j = MIN(prefix_len + pat->length, MAXPATLEN); for(i=0;iprefix[i] : pat->pattern[i - prefix_len]; if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR) break; patc[i] = (uint8_t)p; } if (i == j) { /* all static, use add_static it has better heuristics for this * case */ return filter_add_static(m, patc, j, pat->virname); } cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j); i = 0; if (!prefix_len) { while ((pat->pattern[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_SPECIAL) { /* we support only ALT_CHAR, skip the rest */ if (pat->special_table[altcnt]->type == 1) break; altcnt++; i++; } } /* transform AC characters into our representation */ for (speci=0;iprefix[i] : pat->pattern[i - prefix_len]; spec->alt = NULL; spec->negative = 0; switch (p & CLI_MATCH_WILDCARD) { case CLI_MATCH_CHAR: spec->start = spec->end = (uint8_t)p; spec->step = 1; break; case CLI_MATCH_IGNORE: spec->start = 0x00; spec->end = 0xff; spec->step = 1; break; case CLI_MATCH_SPECIAL: assert(pat->special_table); /* assert(altcnt < pat->alt); */ assert(pat->special_table[altcnt]); spec->negative = pat->special_table[altcnt]->negative; switch (pat->special_table[altcnt++]->type) { case 1: /* ALT_CHAR */ spec->start = 0; spec->end = pat->special_table[altcnt-1]->num - 1; spec->step = 1; spec->alt = pat->special_table[altcnt-1]; break; default: stop = 1; break; /* TODO: should something be done here? * */ } break; case CLI_MATCH_NIBBLE_HIGH: spec->start = (p & 0xf0); spec->end = spec->start | 0x0f; spec->step = 1; break; case CLI_MATCH_NIBBLE_LOW: spec->start = (p & 0xf); spec->end = 0xf0 | spec->start; spec->step = 0x10; break; default: cli_errmsg("filtering: unknown wildcard character: %d\n", p); return -1; } } if (stop) --speci; j = speci; if (j < 2) { if (stop) cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname); else cli_warnmsg("Subpattern too short: %s\n", pat->virname); return -1; } for(i=0;i= 0x100) { if (num_iter == 0x10000) char_badness[i] = reject; else char_badness[i] = avoid_anywhere; } else { int8_t binary = 0; enum badness scor = acceptable; for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { unsigned char c0 = spec_ith_char(spec0, k0); unsigned char c1 = spec_ith_char(spec1, k1); if (spec0->negative || spec1->negative) { scor = avoid_anywhere; break; } if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) { scor = avoid_first; break; } if (c0 == c1) { scor = dontlike; break; } if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127)) binary = 1; } } if (scor == acceptable && binary) { /* slightly favor binary */ scor = like; } char_badness[i] = scor; } } /* try to choose best subpattern */ /* calculating the score for all possible i start pos * and all possible length is too slow, so choose best among N choices * only */ for (i=0;i 0) /* if we have another choice don't choose this */ continue; while ((kend > i+3) && char_badness[kend-1] == reject) kend--; for (k=i;k (int)i) { /* ki|ki+1|??| */ /* try subpattern from after the wildcard */ i = ki; } /* if score is positive, it replaces a negative choice */ } for(l=0;l best_score) { /* we may have negative scores, so truncating * the pattern could actually get us a higher * score */ best_score = score + score_end; best_score_len = p + 2; best_score_i = i; assert(i + best_score_len <= j); } } } if (best_score <= -0x7fffffff) { cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score); return -1; } if (choices_cnt == 0) { cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname); return -1; } assert(best_score_len >= 2); detailed_dbg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len); /* Shift-Or like preprocessing */ assert(1 < best_score_len); for (i=0;i < best_score_len-1;i++) { spec0 = &chars[best_score_i + i]; spec1 = &chars[best_score_i + i + 1]; /* use overlapping little-endian 2-grams, overlapping because match can start * at any position (including odd) */ for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { SPEC_FOREACH(spec0, k0, spec1, k1) { if (!cc0 && !cc1 && !i) { detailed_dbg("filter (warning): subsignature begins with zero: %s\n",pat->virname); } filter_set_atpos(m, i, a); } SPEC_END_FOR; } } } j = best_score_len - 2; if (spec0 && spec1) { for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) { for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) { SPEC_FOREACH(spec0, k0, spec1, k1) { if (!cc0 && !cc1) { detailed_dbg("filter (warning): subsignature ends with zero: %s\n",pat->virname); } filter_set_end(m, j, a); } SPEC_END_FOR; } } } return j+2; } static const struct match_len_info { uint8_t shortest; uint8_t longest; } match_len[256] = { {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{8,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{7,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{6,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{5,9}, {2,9},{3,9},{2,9},{4,9},{2,9},{3,9},{2,9},{9,9}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{7,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{6,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{5,8}, {2,8},{3,8},{2,8},{4,8},{2,8},{3,8},{2,8},{8,8}, {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7}, {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{6,7}, {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{5,7}, {2,7},{3,7},{2,7},{4,7},{2,7},{3,7},{2,7},{7,7}, {2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{5,6}, {2,6},{3,6},{2,6},{4,6},{2,6},{3,6},{2,6},{6,6}, {2,5},{3,5},{2,5},{4,5},{2,5},{3,5},{2,5},{5,5}, {2,4},{3,4},{2,4},{4,4},{2,3},{3,3},{2,2},{0,0} }; /* state 11110011 means that we may have a match of length min 4, max 5 */ __hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf) { size_t j; uint8_t state = ~0; const uint8_t *B = m->B; const uint8_t *End = m->end; if (len < 2) return -1; /* look for first match */ for (j=0; j < len-1;j++) { uint8_t match_state_end; const uint16_t q0 = cli_readint16( &data[j] ); state = (state << 1) | B[q0]; match_state_end = state | End[q0]; if (match_state_end != 0xff) { inf->first_match = j; return 0; } } /* no match, inf is invalid */ return -1; } /* this is like a FSM, with multiple active states at the same time. * each bit in "state" means an active state, when a char is encountered * we determine what states can remain active. * The FSM transition rules are expressed as bit-masks */ long filter_search(const struct filter *m, const unsigned char *data, unsigned long len) { size_t j; uint8_t state = ~0; const uint8_t *B = m->B; const uint8_t *End = m->end; /* we use 2-grams, must be higher than 1 */ if(len < 2) return -1; /* Shift-Or like search algorithm */ for(j=0;j < len-1; j++) { const uint16_t q0 = cli_readint16( &data[j] ); uint8_t match_end; state = (state << 1) | B[q0]; /* state marks with a 0 bit all active states * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern * if we got two 0's at matching positions, it means we encountered a pattern's end */ match_end = state | End[q0]; if(match_end != 0xff) { /* if state is reachable, and this character can finish a pattern, assume match */ /* to reduce false positives check if qgram can finish the pattern */ /* return position of probable match */ /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the * longest pattern that could match */ return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0; } } /* no match */ return -1; }