git-svn: trunk@3978
Török Edvin authored on 2008/07/23 22:51:57... | ... |
@@ -1,3 +1,12 @@ |
1 |
+Wed Jul 23 16:32:32 EEST 2008 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav: performance improvements for URL matching (bb #725, bb #650): |
|
4 |
+ * use a suffix AC-trie and a shift-or FSM to filter |
|
5 |
+ * rewrite the URL regex in C |
|
6 |
+ * use a perfect hash to lookup TLD and ccTLD, instead of a regex |
|
7 |
+ * TODO: suffixes having a common prefix: loop over all of them |
|
8 |
+ cli_ac_free: multiple virname pointing to same location |
|
9 |
+ |
|
1 | 10 |
Mon Jul 21 12:16:44 CEST 2008 (tk) |
2 | 11 |
---------------------------------- |
3 | 12 |
* sigtool/vba.c: fix crash on error in vba code (bb#1106) |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
PERL=perl |
2 | 2 |
CC=cc |
3 | 3 |
|
4 |
-all: entitylist.h encoding_aliases.h gentbl encname_chars.h |
|
4 |
+all: entitylist.h encoding_aliases.h gentbl encname_chars.h generate_hash |
|
5 | 5 |
|
6 | 6 |
entities_parsed: entities entities/* entity_decl_parse.pl |
7 | 7 |
$(PERL) entity_decl_parse.pl $</* | sort -u >$@ |
... | ... |
@@ -9,6 +9,9 @@ entities_parsed: entities entities/* entity_decl_parse.pl |
9 | 9 |
generate_entitylist: generate_entitylist.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c |
10 | 10 |
$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ |
11 | 11 |
|
12 |
+generate_hash: generate_hash.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c |
|
13 |
+ $(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ |
|
14 |
+ |
|
12 | 15 |
generate_encoding_aliases: generate_encoding_aliases.c ../../libclamav/hashtab.c ../../libclamav/others.c ../../libclamav/htmlnorm.h ../../libclamav/entconv.h ../../libclamav/cltypes.h ../../libclamav/hashtab.h ../../libclamav/hashtab.h |
13 | 16 |
$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@ |
14 | 17 |
|
... | ... |
@@ -26,30 +26,11 @@ OUTFILE=iana_tld.h |
26 | 26 |
echo "Downloading updated tld list from iana.org" |
27 | 27 |
wget $IANA_TLD -O $TMP || exit 2 |
28 | 28 |
echo "Download complete, parsing data" |
29 |
-# 174 is the code for | |
|
30 |
-TLDLIST=$(egrep -v ^# $TMP | tr \\n \\174 | sed 's/[^a-zA-Z]$//') |
|
31 |
-echo "Parse complete, removing tmpfile" |
|
32 |
-rm $TMP |
|
33 |
-echo "Generating tld list in $OUTFILE" |
|
34 |
-cat >$OUTFILE <<EOF |
|
35 |
-#ifndef IANA_TLD_H |
|
36 |
-#define IANA_TLD_H |
|
37 |
-EOF |
|
38 |
-echo -n "#define iana_tld \"(" >>$OUTFILE |
|
39 |
-echo -n $TLDLIST >>$OUTFILE |
|
40 |
-echo ")\"" >>$OUTFILE |
|
29 |
+grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_tld.h |
|
41 | 30 |
|
42 | 31 |
echo "Downloading updated country-code list from iana.org" |
43 | 32 |
wget $IANA_CCTLD -O $TMP || exit 2 |
44 | 33 |
echo "Download complete, parsing data" |
45 |
-CCTLDLIST=$(cat $TMP | egrep -oi "<a href=[^>]+>\\.([a-zA-Z]+).+</a>" | egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr \\n \\174 | sed 's/[^a-zA-Z]$//') |
|
46 |
-echo "Parse complete, removing tmpfile" |
|
47 |
-rm $TMP |
|
48 |
-echo "Generating cctld list in $OUTFILE" |
|
49 |
-echo -n "#define iana_cctld \"(" >>$OUTFILE |
|
50 |
-echo -n $CCTLDLIST >>$OUTFILE |
|
51 |
-echo ")\"" >>$OUTFILE |
|
52 |
- |
|
53 |
- |
|
54 |
-echo "#endif" >>$OUTFILE |
|
55 |
-echo "Finished succesfully" |
|
34 |
+cat $TMP | grep country-code|egrep -oi "<a |
|
35 |
+href=[^>]+>\\.([a-zA-Z]+).+</a>"|egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr [A-Z] [a-z]| gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set |grep -v '^#line'|sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_cctld.h |
|
36 |
+echo "Done" |
... | ... |
@@ -26,17 +26,4 @@ echo "Downloading updated tld list from iana.org" |
26 | 26 |
wget $IANA_TLD -O $TMP || exit 2 |
27 | 27 |
echo "Download complete, parsing data" |
28 | 28 |
# 174 is the code for | |
29 |
-TLDLIST=$(egrep -v ^# $TMP|tr \\n \\174 ) |
|
30 |
-echo "Parse complete, removing tmpfile" |
|
31 |
-rm $TMP |
|
32 |
-echo "Generating $OUTFILE" |
|
33 |
-cat >$OUTFILE <<EOF |
|
34 |
-#ifndef IANA_TLD_H |
|
35 |
-#define IANA_TLD_H |
|
36 |
-EOF |
|
37 |
-echo -n "#define iana_tld \"(" >>$OUTFILE |
|
38 |
-echo -n $TLDLIST >>$OUTFILE |
|
39 |
-echo ")\"" >>$OUTFILE |
|
40 |
-echo "#endif" >>$OUTFILE |
|
41 |
-echo "Finished succesfully" |
|
42 |
- |
|
29 |
+grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -H tld_hash -N in_tld_set -l|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' |
... | ... |
@@ -361,7 +361,7 @@ All 4 tests passed |
361 | 361 |
\item The exact output from \verb+make check+ |
362 | 362 |
\item Output of \verb+uname -mrsp+ |
363 | 363 |
\item your \verb+config.log+ |
364 |
- \item The following files from the \verb+unit-tests/+ directory: |
|
364 |
+ \item The following files from the \verb+unit_tests/+ directory: |
|
365 | 365 |
\begin{itemize} |
366 | 366 |
\item \verb+test.log+ |
367 | 367 |
\item \verb+clamscan.log+ |
... | ... |
@@ -367,10 +367,18 @@ void hashtab_clear(struct hashtable *s) |
367 | 367 |
if(s->htable[i].key && s->htable[i].key != DELETED_KEY) |
368 | 368 |
free((void *)s->htable[i].key); |
369 | 369 |
} |
370 |
- memset(s->htable, 0, s->capacity); |
|
370 |
+ if(s->htable) |
|
371 |
+ memset(s->htable, 0, s->capacity); |
|
371 | 372 |
s->used = 0; |
372 | 373 |
} |
373 | 374 |
|
375 |
+void hashtab_free(struct hashtable *s) |
|
376 |
+{ |
|
377 |
+ hashtab_clear(s); |
|
378 |
+ free(s->htable); |
|
379 |
+ s->htable = NULL; |
|
380 |
+ s->capacity = 0; |
|
381 |
+} |
|
374 | 382 |
|
375 | 383 |
int hashtab_store(const struct hashtable *s,FILE* out) |
376 | 384 |
{ |
... | ... |
@@ -82,7 +82,7 @@ int hashtab_init(struct hashtable *s,size_t capacity); |
82 | 82 |
const struct element* hashtab_insert(struct hashtable *s, const char* key, const size_t len, const element_data data); |
83 | 83 |
void hashtab_delete(struct hashtable *s,const char* key,const size_t len); |
84 | 84 |
void hashtab_clear(struct hashtable *s); |
85 |
- |
|
85 |
+void hashtab_free(struct hashtable *s); |
|
86 | 86 |
int hashtab_load(FILE* in, struct hashtable *s); |
87 | 87 |
int hashtab_store(const struct hashtable *s,FILE* out); |
88 | 88 |
|
89 | 89 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,505 @@ |
0 |
+/* ANSI-C code produced by gperf version 3.0.3 */ |
|
1 |
+/* Command-line: gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set */ |
|
2 |
+/* Computed positions: -k'1-2' */ |
|
3 |
+ |
|
4 |
+#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ |
|
5 |
+ && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ |
|
6 |
+ && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ |
|
7 |
+ && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ |
|
8 |
+ && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ |
|
9 |
+ && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ |
|
10 |
+ && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ |
|
11 |
+ && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ |
|
12 |
+ && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ |
|
13 |
+ && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ |
|
14 |
+ && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ |
|
15 |
+ && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ |
|
16 |
+ && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ |
|
17 |
+ && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ |
|
18 |
+ && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ |
|
19 |
+ && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ |
|
20 |
+ && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ |
|
21 |
+ && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ |
|
22 |
+ && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ |
|
23 |
+ && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ |
|
24 |
+ && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ |
|
25 |
+ && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ |
|
26 |
+ && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) |
|
27 |
+/* The character set is not based on ISO-646. */ |
|
28 |
+#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>." |
|
29 |
+#endif |
|
30 |
+ |
|
31 |
+/* maximum key range = 472, duplicates = 0 */ |
|
32 |
+ |
|
33 |
+#ifdef __GNUC__ |
|
34 |
+__inline |
|
35 |
+#else |
|
36 |
+#ifdef __cplusplus |
|
37 |
+inline |
|
38 |
+#endif |
|
39 |
+#endif |
|
40 |
+static unsigned int |
|
41 |
+cctld_hash (const char *str, unsigned int len) |
|
42 |
+{ |
|
43 |
+ static const unsigned short asso_values[] = |
|
44 |
+ { |
|
45 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
46 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
47 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
48 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
49 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
50 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
51 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
52 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
53 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
54 |
+ 476, 476, 476, 476, 476, 476, 476, 119, 97, 33, |
|
55 |
+ 103, 4, 59, 115, 210, 149, 169, 143, 175, 55, |
|
56 |
+ 145, 89, 178, 37, 85, 18, 34, 239, 2, 73, |
|
57 |
+ 112, 3, 25, 10, 15, 117, 209, 229, 150, 223, |
|
58 |
+ 200, 78, 225, 54, 5, 215, 215, 190, 25, 23, |
|
59 |
+ 0, 20, 233, 234, 14, 476, 33, 204, 476, 476, |
|
60 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
61 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
62 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
63 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
64 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
65 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
66 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
67 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
68 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
69 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
70 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
71 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
72 |
+ 476, 476, 476, 476, 476, 476, 476, 476, 476, 476, |
|
73 |
+ 476 |
|
74 |
+ }; |
|
75 |
+ return len + asso_values[(unsigned char)str[1]] + asso_values[(unsigned char)str[0]+25]; |
|
76 |
+} |
|
77 |
+ |
|
78 |
+#ifdef __GNUC__ |
|
79 |
+__inline |
|
80 |
+#ifdef __GNUC_STDC_INLINE__ |
|
81 |
+__attribute__ ((__gnu_inline__)) |
|
82 |
+#endif |
|
83 |
+#endif |
|
84 |
+const char * |
|
85 |
+in_cctld_set (const char *str, unsigned int len) |
|
86 |
+{ |
|
87 |
+ enum |
|
88 |
+ { |
|
89 |
+ TOTAL_KEYWORDS = 252, |
|
90 |
+ MIN_WORD_LENGTH = 2, |
|
91 |
+ MAX_WORD_LENGTH = 2, |
|
92 |
+ MIN_HASH_VALUE = 4, |
|
93 |
+ MAX_HASH_VALUE = 475 |
|
94 |
+ }; |
|
95 |
+ |
|
96 |
+ static const unsigned char lengthtable[] = |
|
97 |
+ { |
|
98 |
+ 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, |
|
99 |
+ 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2, |
|
100 |
+ 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, |
|
101 |
+ 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, |
|
102 |
+ 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, |
|
103 |
+ 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, |
|
104 |
+ 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, |
|
105 |
+ 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, |
|
106 |
+ 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, |
|
107 |
+ 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, |
|
108 |
+ 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, |
|
109 |
+ 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, |
|
110 |
+ 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, |
|
111 |
+ 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, |
|
112 |
+ 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, |
|
113 |
+ 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, |
|
114 |
+ 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, |
|
115 |
+ 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, |
|
116 |
+ 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, |
|
117 |
+ 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, |
|
118 |
+ 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, |
|
119 |
+ 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, |
|
120 |
+ 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, |
|
121 |
+ 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, |
|
122 |
+ 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, |
|
123 |
+ 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, |
|
124 |
+ 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, |
|
125 |
+ 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, |
|
126 |
+ 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, |
|
127 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
128 |
+ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
129 |
+ 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
130 |
+ 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, |
|
131 |
+ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 |
|
132 |
+ }; |
|
133 |
+ static const char * const wordlist[] = |
|
134 |
+ { |
|
135 |
+ "", "", "", "", |
|
136 |
+ "sv", |
|
137 |
+ "sy", |
|
138 |
+ "se", |
|
139 |
+ "", "", |
|
140 |
+ "mv", |
|
141 |
+ "my", |
|
142 |
+ "me", |
|
143 |
+ "", "", |
|
144 |
+ "bv", |
|
145 |
+ "by", |
|
146 |
+ "be", |
|
147 |
+ "", "", |
|
148 |
+ "cv", |
|
149 |
+ "cy", |
|
150 |
+ "", "", "", |
|
151 |
+ "tv", |
|
152 |
+ "ms", |
|
153 |
+ "", |
|
154 |
+ "sz", |
|
155 |
+ "", |
|
156 |
+ "re", |
|
157 |
+ "bs", |
|
158 |
+ "ae", |
|
159 |
+ "mz", |
|
160 |
+ "", |
|
161 |
+ "ws", |
|
162 |
+ "sc", |
|
163 |
+ "st", |
|
164 |
+ "bz", |
|
165 |
+ "", |
|
166 |
+ "ye", |
|
167 |
+ "mc", |
|
168 |
+ "mt", |
|
169 |
+ "cz", |
|
170 |
+ "rs", |
|
171 |
+ "mq", |
|
172 |
+ "as", |
|
173 |
+ "bt", |
|
174 |
+ "tz", |
|
175 |
+ "", "", |
|
176 |
+ "cc", |
|
177 |
+ "", |
|
178 |
+ "az", |
|
179 |
+ "", "", |
|
180 |
+ "tc", |
|
181 |
+ "tt", |
|
182 |
+ "sm", |
|
183 |
+ "lv", |
|
184 |
+ "ly", |
|
185 |
+ "ac", |
|
186 |
+ "at", |
|
187 |
+ "mm", |
|
188 |
+ "", |
|
189 |
+ "aq", |
|
190 |
+ "", |
|
191 |
+ "mf", |
|
192 |
+ "bm", |
|
193 |
+ "", |
|
194 |
+ "yt", |
|
195 |
+ "", |
|
196 |
+ "bf", |
|
197 |
+ "cm", |
|
198 |
+ "", |
|
199 |
+ "ls", |
|
200 |
+ "wf", |
|
201 |
+ "cf", |
|
202 |
+ "tm", |
|
203 |
+ "", "", |
|
204 |
+ "mw", |
|
205 |
+ "tf", |
|
206 |
+ "am", |
|
207 |
+ "", |
|
208 |
+ "je", |
|
209 |
+ "bw", |
|
210 |
+ "af", |
|
211 |
+ "sr", |
|
212 |
+ "", |
|
213 |
+ "lc", |
|
214 |
+ "lt", |
|
215 |
+ "so", |
|
216 |
+ "mr", |
|
217 |
+ "", "", |
|
218 |
+ "tw", |
|
219 |
+ "mo", |
|
220 |
+ "br", |
|
221 |
+ "rw", |
|
222 |
+ "sb", |
|
223 |
+ "aw", |
|
224 |
+ "bo", |
|
225 |
+ "cr", |
|
226 |
+ "", "", |
|
227 |
+ "sd", |
|
228 |
+ "co", |
|
229 |
+ "tr", |
|
230 |
+ "", |
|
231 |
+ "bb", |
|
232 |
+ "md", |
|
233 |
+ "to", |
|
234 |
+ "ar", |
|
235 |
+ "", |
|
236 |
+ "ro", |
|
237 |
+ "bd", |
|
238 |
+ "ao", |
|
239 |
+ "sg", |
|
240 |
+ "", |
|
241 |
+ "mx", |
|
242 |
+ "cd", |
|
243 |
+ "sa", |
|
244 |
+ "mg", |
|
245 |
+ "de", |
|
246 |
+ "", |
|
247 |
+ "td", |
|
248 |
+ "ma", |
|
249 |
+ "bg", |
|
250 |
+ "", |
|
251 |
+ "cx", |
|
252 |
+ "ad", |
|
253 |
+ "ba", |
|
254 |
+ "cg", |
|
255 |
+ "", "", |
|
256 |
+ "jm", |
|
257 |
+ "ca", |
|
258 |
+ "tg", |
|
259 |
+ "", |
|
260 |
+ "ax", |
|
261 |
+ "", |
|
262 |
+ "lr", |
|
263 |
+ "ag", |
|
264 |
+ "", |
|
265 |
+ "dz", |
|
266 |
+ "sk", |
|
267 |
+ "qa", |
|
268 |
+ "sn", |
|
269 |
+ "", "", |
|
270 |
+ "mk", |
|
271 |
+ "si", |
|
272 |
+ "mn", |
|
273 |
+ "lb", |
|
274 |
+ "", |
|
275 |
+ "gy", |
|
276 |
+ "ge", |
|
277 |
+ "bn", |
|
278 |
+ "", "", |
|
279 |
+ "ck", |
|
280 |
+ "bi", |
|
281 |
+ "cn", |
|
282 |
+ "", "", |
|
283 |
+ "tk", |
|
284 |
+ "ci", |
|
285 |
+ "tn", |
|
286 |
+ "", |
|
287 |
+ "jo", |
|
288 |
+ "gs", |
|
289 |
+ "sj", |
|
290 |
+ "an", |
|
291 |
+ "", |
|
292 |
+ "dm", |
|
293 |
+ "la", |
|
294 |
+ "ai", |
|
295 |
+ "sl", |
|
296 |
+ "", "", "", |
|
297 |
+ "bj", |
|
298 |
+ "ml", |
|
299 |
+ "", "", |
|
300 |
+ "mp", |
|
301 |
+ "gt", |
|
302 |
+ "bl", |
|
303 |
+ "", |
|
304 |
+ "gq", |
|
305 |
+ "", |
|
306 |
+ "tj", |
|
307 |
+ "cl", |
|
308 |
+ "", "", |
|
309 |
+ "py", |
|
310 |
+ "pe", |
|
311 |
+ "tl", |
|
312 |
+ "", |
|
313 |
+ "lk", |
|
314 |
+ "tp", |
|
315 |
+ "", |
|
316 |
+ "al", |
|
317 |
+ "", "", |
|
318 |
+ "li", |
|
319 |
+ "ie", |
|
320 |
+ "gm", |
|
321 |
+ "do", |
|
322 |
+ "", |
|
323 |
+ "ps", |
|
324 |
+ "gf", |
|
325 |
+ "sh", |
|
326 |
+ "", "", |
|
327 |
+ "ee", |
|
328 |
+ "", |
|
329 |
+ "mh", |
|
330 |
+ "", "", |
|
331 |
+ "is", |
|
332 |
+ "ne", |
|
333 |
+ "bh", |
|
334 |
+ "", "", |
|
335 |
+ "gw", |
|
336 |
+ "pt", |
|
337 |
+ "ch", |
|
338 |
+ "", |
|
339 |
+ "es", |
|
340 |
+ "ky", |
|
341 |
+ "ke", |
|
342 |
+ "th", |
|
343 |
+ "", "", "", |
|
344 |
+ "it", |
|
345 |
+ "gr", |
|
346 |
+ "uy", |
|
347 |
+ "iq", |
|
348 |
+ "ve", |
|
349 |
+ "su", |
|
350 |
+ "nz", |
|
351 |
+ "", |
|
352 |
+ "ec", |
|
353 |
+ "et", |
|
354 |
+ "mu", |
|
355 |
+ "pm", |
|
356 |
+ "", |
|
357 |
+ "gb", |
|
358 |
+ "nc", |
|
359 |
+ "pf", |
|
360 |
+ "kz", |
|
361 |
+ "us", |
|
362 |
+ "", |
|
363 |
+ "gd", |
|
364 |
+ "cu", |
|
365 |
+ "im", |
|
366 |
+ "jp", |
|
367 |
+ "ht", |
|
368 |
+ "uz", |
|
369 |
+ "zm", |
|
370 |
+ "dk", |
|
371 |
+ "", |
|
372 |
+ "ru", |
|
373 |
+ "pw", |
|
374 |
+ "au", |
|
375 |
+ "gg", |
|
376 |
+ "", |
|
377 |
+ "vc", |
|
378 |
+ "", |
|
379 |
+ "ga", |
|
380 |
+ "om", |
|
381 |
+ "", |
|
382 |
+ "yu", |
|
383 |
+ "", |
|
384 |
+ "nf", |
|
385 |
+ "pr", |
|
386 |
+ "", |
|
387 |
+ "zw", |
|
388 |
+ "hm", |
|
389 |
+ "", |
|
390 |
+ "km", |
|
391 |
+ "", "", "", |
|
392 |
+ "fm", |
|
393 |
+ "ir", |
|
394 |
+ "dj", |
|
395 |
+ "", |
|
396 |
+ "um", |
|
397 |
+ "io", |
|
398 |
+ "", "", "", |
|
399 |
+ "lu", |
|
400 |
+ "er", |
|
401 |
+ "gn", |
|
402 |
+ "", "", |
|
403 |
+ "kw", |
|
404 |
+ "gi", |
|
405 |
+ "nr", |
|
406 |
+ "", "", |
|
407 |
+ "id", |
|
408 |
+ "no", |
|
409 |
+ "pg", |
|
410 |
+ "", "", |
|
411 |
+ "hr", |
|
412 |
+ "pa", |
|
413 |
+ "kr", |
|
414 |
+ "", "", "", |
|
415 |
+ "fr", |
|
416 |
+ "", "", "", |
|
417 |
+ "fo", |
|
418 |
+ "", "", "", "", |
|
419 |
+ "za", |
|
420 |
+ "eg", |
|
421 |
+ "gl", |
|
422 |
+ "", "", |
|
423 |
+ "gp", |
|
424 |
+ "", |
|
425 |
+ "ng", |
|
426 |
+ "", "", |
|
427 |
+ "pk", |
|
428 |
+ "na", |
|
429 |
+ "pn", |
|
430 |
+ "", "", "", "", |
|
431 |
+ "kg", |
|
432 |
+ "", "", "", "", |
|
433 |
+ "in", |
|
434 |
+ "", "", |
|
435 |
+ "ug", |
|
436 |
+ "vg", |
|
437 |
+ "", "", |
|
438 |
+ "ua", |
|
439 |
+ "va", |
|
440 |
+ "", "", "", "", "", "", |
|
441 |
+ "gh", |
|
442 |
+ "", "", "", |
|
443 |
+ "ni", |
|
444 |
+ "pl", |
|
445 |
+ "hk", |
|
446 |
+ "", |
|
447 |
+ "hn", |
|
448 |
+ "", |
|
449 |
+ "kn", |
|
450 |
+ "", |
|
451 |
+ "fk", |
|
452 |
+ "", |
|
453 |
+ "ki", |
|
454 |
+ "il", |
|
455 |
+ "uk", |
|
456 |
+ "", |
|
457 |
+ "fi", |
|
458 |
+ "vn", |
|
459 |
+ "", "", "", |
|
460 |
+ "vi", |
|
461 |
+ "", "", "", "", "", |
|
462 |
+ "gu", |
|
463 |
+ "nl", |
|
464 |
+ "", "", |
|
465 |
+ "np", |
|
466 |
+ "", "", "", "", |
|
467 |
+ "fj", |
|
468 |
+ "", |
|
469 |
+ "ph", |
|
470 |
+ "", "", |
|
471 |
+ "kp", |
|
472 |
+ "", "", "", "", "", "", "", "", "", |
|
473 |
+ "", "", "", "", "", "", |
|
474 |
+ "eh", |
|
475 |
+ "", "", "", "", "", "", "", "", "", |
|
476 |
+ "", "", "", "", "", "", |
|
477 |
+ "kh", |
|
478 |
+ "", "", "", "", "", "", "", "", "", |
|
479 |
+ "", "", "", |
|
480 |
+ "eu", |
|
481 |
+ "", "", "", "", "", |
|
482 |
+ "nu", |
|
483 |
+ "", "", "", "", "", "", "", |
|
484 |
+ "hu", |
|
485 |
+ "", "", "", "", "", "", "", "", "", |
|
486 |
+ "", |
|
487 |
+ "vu" |
|
488 |
+ }; |
|
489 |
+ |
|
490 |
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) |
|
491 |
+ { |
|
492 |
+ int key = cctld_hash (str, len); |
|
493 |
+ |
|
494 |
+ if (key <= MAX_HASH_VALUE && key >= 0) |
|
495 |
+ if (len == lengthtable[key]) |
|
496 |
+ { |
|
497 |
+ const char *s = wordlist[key]; |
|
498 |
+ |
|
499 |
+ if (*str == *s && !memcmp (str + 1, s + 1, len - 1)) |
|
500 |
+ return s; |
|
501 |
+ } |
|
502 |
+ } |
|
503 |
+ return 0; |
|
504 |
+} |
... | ... |
@@ -1,28 +1,746 @@ |
1 |
-/* |
|
2 |
- * Phishing module: iana tld list. |
|
3 |
- * |
|
4 |
- * Copyright (C) 2007-2008 Sourcefire, Inc. |
|
5 |
- * |
|
6 |
- * Authors: Török Edvin |
|
7 |
- * |
|
8 |
- * This program is free software; you can redistribute it and/or modify |
|
9 |
- * it under the terms of the GNU General Public License version 2 as |
|
10 |
- * published by the Free Software Foundation. |
|
11 |
- * |
|
12 |
- * This program is distributed in the hope that it will be useful, |
|
13 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
15 |
- * GNU General Public License for more details. |
|
16 |
- * |
|
17 |
- * You should have received a copy of the GNU General Public License |
|
18 |
- * along with this program; if not, write to the Free Software |
|
19 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
20 |
- * MA 02110-1301, USA. |
|
21 |
- */ |
|
22 |
- |
|
23 |
-#ifndef IANA_TLD_H |
|
24 |
-#define IANA_TLD_H |
|
25 |
-#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM|TRAVEL|XN--ZCKZAH|XN--0ZWM56D|XN--DEBA0AD|XN--G6W251D|XN--JXALPDLP|XN--KGBECHTV|XN--9T4B11YI5A|XN--80AKHBYKNJ4F|XN--11B5BS3A9AJ6G|XN--HGBK6AJ7F53BBA)" |
|
26 |
-#define iana_cctld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJLMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGHRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEFGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|IN[TT]|MIL|NET|ORG|PRO|TEL|AERO|ARP[AA]|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)" |
|
1 |
+/* ANSI-C code produced by gperf version 3.0.3 */ |
|
2 |
+/* Command-line: gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set */ |
|
3 |
+/* Computed positions: -k'1-2,6' */ |
|
4 |
+ |
|
5 |
+#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ |
|
6 |
+ && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ |
|
7 |
+ && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ |
|
8 |
+ && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ |
|
9 |
+ && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ |
|
10 |
+ && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ |
|
11 |
+ && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ |
|
12 |
+ && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ |
|
13 |
+ && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ |
|
14 |
+ && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ |
|
15 |
+ && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ |
|
16 |
+ && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ |
|
17 |
+ && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ |
|
18 |
+ && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ |
|
19 |
+ && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ |
|
20 |
+ && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ |
|
21 |
+ && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ |
|
22 |
+ && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ |
|
23 |
+ && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ |
|
24 |
+ && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ |
|
25 |
+ && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ |
|
26 |
+ && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ |
|
27 |
+ && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) |
|
28 |
+/* The character set is not based on ISO-646. */ |
|
29 |
+#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>." |
|
30 |
+#endif |
|
31 |
+ |
|
32 |
+/* maximum key range = 983, duplicates = 0 */ |
|
33 |
+ |
|
34 |
+#ifdef __GNUC__ |
|
35 |
+__inline |
|
36 |
+#else |
|
37 |
+#ifdef __cplusplus |
|
38 |
+inline |
|
39 |
+#endif |
|
40 |
+#endif |
|
41 |
+static unsigned int |
|
42 |
+tld_hash (const char *str, unsigned int len) |
|
43 |
+{ |
|
44 |
+ static const unsigned short asso_values[] = |
|
45 |
+ { |
|
46 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
47 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
48 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
49 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
50 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 0, 15, |
|
51 |
+ 988, 988, 988, 988, 0, 988, 988, 988, 988, 988, |
|
52 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
53 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
54 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
55 |
+ 988, 988, 988, 988, 988, 988, 988, 170, 328, 88, |
|
56 |
+ 3, 50, 293, 205, 123, 430, 500, 238, 115, 320, |
|
57 |
+ 375, 30, 413, 348, 70, 43, 475, 18, 6, 283, |
|
58 |
+ 95, 58, 10, 220, 5, 485, 480, 8, 190, 390, |
|
59 |
+ 225, 113, 420, 95, 0, 15, 50, 295, 20, 128, |
|
60 |
+ 130, 80, 405, 470, 340, 0, 305, 415, 988, 988, |
|
61 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
62 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
63 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
64 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
65 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
66 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
67 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
68 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
69 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
70 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
71 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
72 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
73 |
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988, |
|
74 |
+ 988 |
|
75 |
+ }; |
|
76 |
+ int hval = len; |
|
77 |
+ |
|
78 |
+ switch (hval) |
|
79 |
+ { |
|
80 |
+ default: |
|
81 |
+ hval += asso_values[(unsigned char)str[5]]; |
|
82 |
+ /*FALLTHROUGH*/ |
|
83 |
+ case 5: |
|
84 |
+ case 4: |
|
85 |
+ case 3: |
|
86 |
+ case 2: |
|
87 |
+ hval += asso_values[(unsigned char)str[1]]; |
|
88 |
+ /*FALLTHROUGH*/ |
|
89 |
+ case 1: |
|
90 |
+ hval += asso_values[(unsigned char)str[0]+25]; |
|
91 |
+ break; |
|
92 |
+ } |
|
93 |
+ return hval; |
|
94 |
+} |
|
95 |
+ |
|
96 |
+#ifdef __GNUC__ |
|
97 |
+__inline |
|
98 |
+#ifdef __GNUC_STDC_INLINE__ |
|
99 |
+__attribute__ ((__gnu_inline__)) |
|
100 |
+#endif |
|
27 | 101 |
#endif |
102 |
+const char * |
|
103 |
+in_tld_set (const char *str, unsigned int len) |
|
104 |
+{ |
|
105 |
+ enum |
|
106 |
+ { |
|
107 |
+ TOTAL_KEYWORDS = 280, |
|
108 |
+ MIN_WORD_LENGTH = 2, |
|
109 |
+ MAX_WORD_LENGTH = 18, |
|
110 |
+ MIN_HASH_VALUE = 5, |
|
111 |
+ MAX_HASH_VALUE = 987 |
|
112 |
+ }; |
|
113 |
+ |
|
114 |
+ static const unsigned char lengthtable[] = |
|
115 |
+ { |
|
116 |
+ 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, |
|
117 |
+ 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
118 |
+ 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 4, 2, 0, |
|
119 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, |
|
120 |
+ 0, 4, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 3, 0, |
|
121 |
+ 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, |
|
122 |
+ 4, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
123 |
+ 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 0, |
|
124 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 0, 2, |
|
125 |
+ 0, 2, 0, 0, 2, 0, 2, 3, 0, 2, 0, 0, 2, 0, |
|
126 |
+ 2, 0, 2, 0, 0, 2, 0, 4, 2, 0, 2, 0, 2, 0, |
|
127 |
+ 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
128 |
+ 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 3, 0, 2, 0, |
|
129 |
+ 2, 0, 0, 2, 0, 2, 0, 4, 2, 0, 2, 0, 0, 2, |
|
130 |
+ 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, |
|
131 |
+ 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 3, |
|
132 |
+ 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, |
|
133 |
+ 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, |
|
134 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, |
|
135 |
+ 0, 2, 0, 0, 2, 6, 2, 0, 0, 0, 0, 2, 0, 0, |
|
136 |
+ 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, |
|
137 |
+ 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
138 |
+ 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, |
|
139 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, |
|
140 |
+ 0, 2, 0, 0, 2, 0, 2, 0, 6, 2, 0, 2, 0, 0, |
|
141 |
+ 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, |
|
142 |
+ 0, 2, 0, 2, 3, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
143 |
+ 0, 0, 0, 0, 2, 0, 0, 2, 11, 2, 0, 0, 0, 16, |
|
144 |
+ 2, 0, 0, 0, 11, 2, 0, 0, 0, 0, 2, 0, 0, 0, |
|
145 |
+ 0, 17, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, |
|
146 |
+ 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 3, |
|
147 |
+ 0, 2, 11, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, |
|
148 |
+ 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, |
|
149 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 10, 0, 2, |
|
150 |
+ 0, 2, 0, 0, 2, 0, 12, 0, 0, 2, 3, 2, 0, 0, |
|
151 |
+ 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, |
|
152 |
+ 0, 2, 0, 2, 18, 0, 2, 0, 2, 0, 0, 2, 0, 2, |
|
153 |
+ 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, |
|
154 |
+ 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, |
|
155 |
+ 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, |
|
156 |
+ 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, |
|
157 |
+ 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, |
|
158 |
+ 0, 0, 2, 0, 12, 0, 0, 0, 0, 2, 18, 0, 0, 0, |
|
159 |
+ 2, 3, 4, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, |
|
160 |
+ 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, |
|
161 |
+ 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, |
|
162 |
+ 0, 2, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, |
|
163 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, |
|
164 |
+ 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, |
|
165 |
+ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, |
|
166 |
+ 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, |
|
167 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, |
|
168 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, |
|
169 |
+ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
170 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, |
|
171 |
+ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, |
|
172 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, |
|
173 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, |
|
174 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
175 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, |
|
176 |
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, |
|
177 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 2, |
|
178 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
179 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
180 |
+ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, |
|
181 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
182 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
183 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
184 |
+ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, |
|
185 |
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
186 |
+ 0, 0, 0, 0, 0, 0, 0, 2 |
|
187 |
+ }; |
|
188 |
+ static const char * const wordlist[] = |
|
189 |
+ { |
|
190 |
+ "", "", "", "", "", |
|
191 |
+ "md", |
|
192 |
+ "", "", |
|
193 |
+ "mv", |
|
194 |
+ "", |
|
195 |
+ "cd", |
|
196 |
+ "", |
|
197 |
+ "mz", |
|
198 |
+ "cv", |
|
199 |
+ "", |
|
200 |
+ "ad", |
|
201 |
+ "", |
|
202 |
+ "cz", |
|
203 |
+ "", "", |
|
204 |
+ "mu", |
|
205 |
+ "", |
|
206 |
+ "az", |
|
207 |
+ "", "", |
|
208 |
+ "cu", |
|
209 |
+ "", |
|
210 |
+ "nz", |
|
211 |
+ "", "", |
|
212 |
+ "au", |
|
213 |
+ "", |
|
214 |
+ "mo", |
|
215 |
+ "", |
|
216 |
+ "mobi", |
|
217 |
+ "nu", |
|
218 |
+ "", |
|
219 |
+ "co", |
|
220 |
+ "com", |
|
221 |
+ "coop", |
|
222 |
+ "fo", |
|
223 |
+ "", |
|
224 |
+ "ao", |
|
225 |
+ "", "", |
|
226 |
+ "ms", |
|
227 |
+ "", |
|
228 |
+ "no", |
|
229 |
+ "", "", "", "", |
|
230 |
+ "me", |
|
231 |
+ "", "", |
|
232 |
+ "as", |
|
233 |
+ "", |
|
234 |
+ "asia", |
|
235 |
+ "", "", |
|
236 |
+ "my", |
|
237 |
+ "", |
|
238 |
+ "ae", |
|
239 |
+ "", |
|
240 |
+ "aero", |
|
241 |
+ "cy", |
|
242 |
+ "", |
|
243 |
+ "ne", |
|
244 |
+ "net", |
|
245 |
+ "", "", "", |
|
246 |
+ "mr", |
|
247 |
+ "", "", "", "", |
|
248 |
+ "cr", |
|
249 |
+ "", "", |
|
250 |
+ "fr", |
|
251 |
+ "", |
|
252 |
+ "ar", |
|
253 |
+ "", |
|
254 |
+ "arpa", |
|
255 |
+ "td", |
|
256 |
+ "", |
|
257 |
+ "nr", |
|
258 |
+ "tv", |
|
259 |
+ "", |
|
260 |
+ "mc", |
|
261 |
+ "", |
|
262 |
+ "tz", |
|
263 |
+ "", "", |
|
264 |
+ "cc", |
|
265 |
+ "", |
|
266 |
+ "mx", |
|
267 |
+ "", "", |
|
268 |
+ "ac", |
|
269 |
+ "", |
|
270 |
+ "cx", |
|
271 |
+ "lv", |
|
272 |
+ "", |
|
273 |
+ "nc", |
|
274 |
+ "", |
|
275 |
+ "ax", |
|
276 |
+ "", "", "", "", |
|
277 |
+ "to", |
|
278 |
+ "", "", |
|
279 |
+ "lu", |
|
280 |
+ "", |
|
281 |
+ "ml", |
|
282 |
+ "", "", "", "", |
|
283 |
+ "cl", |
|
284 |
+ "org", |
|
285 |
+ "", |
|
286 |
+ "mh", |
|
287 |
+ "", |
|
288 |
+ "al", |
|
289 |
+ "", "", |
|
290 |
+ "ch", |
|
291 |
+ "", |
|
292 |
+ "nl", |
|
293 |
+ "tel", |
|
294 |
+ "", |
|
295 |
+ "sd", |
|
296 |
+ "", "", |
|
297 |
+ "sv", |
|
298 |
+ "", |
|
299 |
+ "ls", |
|
300 |
+ "", |
|
301 |
+ "sz", |
|
302 |
+ "", "", |
|
303 |
+ "jo", |
|
304 |
+ "", |
|
305 |
+ "jobs", |
|
306 |
+ "ru", |
|
307 |
+ "", |
|
308 |
+ "su", |
|
309 |
+ "", |
|
310 |
+ "tr", |
|
311 |
+ "", "", |
|
312 |
+ "ly", |
|
313 |
+ "", "", "", "", |
|
314 |
+ "ro", |
|
315 |
+ "", |
|
316 |
+ "so", |
|
317 |
+ "", "", |
|
318 |
+ "je", |
|
319 |
+ "", |
|
320 |
+ "lr", |
|
321 |
+ "", "", |
|
322 |
+ "tc", |
|
323 |
+ "", |
|
324 |
+ "ma", |
|
325 |
+ "rs", |
|
326 |
+ "", "", "", |
|
327 |
+ "ca", |
|
328 |
+ "cat", |
|
329 |
+ "", |
|
330 |
+ "re", |
|
331 |
+ "", |
|
332 |
+ "se", |
|
333 |
+ "", "", |
|
334 |
+ "lc", |
|
335 |
+ "", |
|
336 |
+ "na", |
|
337 |
+ "", |
|
338 |
+ "name", |
|
339 |
+ "sy", |
|
340 |
+ "", |
|
341 |
+ "qa", |
|
342 |
+ "", "", |
|
343 |
+ "gd", |
|
344 |
+ "", |
|
345 |
+ "tl", |
|
346 |
+ "", "", "", "", |
|
347 |
+ "sr", |
|
348 |
+ "", "", |
|
349 |
+ "th", |
|
350 |
+ "", |
|
351 |
+ "mg", |
|
352 |
+ "", "", |
|
353 |
+ "gu", |
|
354 |
+ "", |
|
355 |
+ "cg", |
|
356 |
+ "", "", "", "", |
|
357 |
+ "ag", |
|
358 |
+ "", "", |
|
359 |
+ "sc", |
|
360 |
+ "", |
|
361 |
+ "ng", |
|
362 |
+ "gov", |
|
363 |
+ "", |
|
364 |
+ "bd", |
|
365 |
+ "", "", |
|
366 |
+ "bv", |
|
367 |
+ "", |
|
368 |
+ "id", |
|
369 |
+ "", |
|
370 |
+ "bz", |
|
371 |
+ "", "", |
|
372 |
+ "gs", |
|
373 |
+ "", "", "", "", |
|
374 |
+ "mk", |
|
375 |
+ "", |
|
376 |
+ "ge", |
|
377 |
+ "", "", |
|
378 |
+ "ck", |
|
379 |
+ "", |
|
380 |
+ "sl", |
|
381 |
+ "fk", |
|
382 |
+ "", |
|
383 |
+ "gy", |
|
384 |
+ "", |
|
385 |
+ "bo", |
|
386 |
+ "", "", |
|
387 |
+ "sh", |
|
388 |
+ "", |
|
389 |
+ "io", |
|
390 |
+ "", "", "", "", |
|
391 |
+ "gr", |
|
392 |
+ "", "", |
|
393 |
+ "bs", |
|
394 |
+ "", |
|
395 |
+ "la", |
|
396 |
+ "", "", |
|
397 |
+ "is", |
|
398 |
+ "travel", |
|
399 |
+ "be", |
|
400 |
+ "", "", "", "", |
|
401 |
+ "ie", |
|
402 |
+ "", "", |
|
403 |
+ "by", |
|
404 |
+ "", "", "", "", |
|
405 |
+ "mw", |
|
406 |
+ "", |
|
407 |
+ "tg", |
|
408 |
+ "", "", "", "", |
|
409 |
+ "br", |
|
410 |
+ "", "", |
|
411 |
+ "aw", |
|
412 |
+ "", |
|
413 |
+ "ir", |
|
414 |
+ "", "", |
|
415 |
+ "cf", |
|
416 |
+ "", |
|
417 |
+ "sa", |
|
418 |
+ "", "", |
|
419 |
+ "af", |
|
420 |
+ "", |
|
421 |
+ "gl", |
|
422 |
+ "", "", |
|
423 |
+ "nf", |
|
424 |
+ "", "", "", "", |
|
425 |
+ "gh", |
|
426 |
+ "", "", "", "", |
|
427 |
+ "tk", |
|
428 |
+ "", |
|
429 |
+ "mm", |
|
430 |
+ "", "", |
|
431 |
+ "yu", |
|
432 |
+ "", |
|
433 |
+ "cm", |
|
434 |
+ "", "", |
|
435 |
+ "fm", |
|
436 |
+ "", |
|
437 |
+ "am", |
|
438 |
+ "", "", |
|
439 |
+ "lk", |
|
440 |
+ "", |
|
441 |
+ "sg", |
|
442 |
+ "", "", |
|
443 |
+ "ps", |
|
444 |
+ "", |
|
445 |
+ "il", |
|
446 |
+ "", |
|
447 |
+ "museum", |
|
448 |
+ "bh", |
|
449 |
+ "", |
|
450 |
+ "pe", |
|
451 |
+ "", "", |
|
452 |
+ "mq", |
|
453 |
+ "", "", "", "", |
|
454 |
+ "py", |
|
455 |
+ "", |
|
456 |
+ "ye", |
|
457 |
+ "", "", |
|
458 |
+ "aq", |
|
459 |
+ "", |
|
460 |
+ "ga", |
|
461 |
+ "", "", |
|
462 |
+ "tw", |
|
463 |
+ "", |
|
464 |
+ "pr", |
|
465 |
+ "pro", |
|
466 |
+ "", |
|
467 |
+ "sk", |
|
468 |
+ "", |
|
469 |
+ "om", |
|
470 |
+ "", "", |
|
471 |
+ "tf", |
|
472 |
+ "", |
|
473 |
+ "mn", |
|
474 |
+ "", "", "", "", |
|
475 |
+ "cn", |
|
476 |
+ "", "", |
|
477 |
+ "ws", |
|
478 |
+ "xn--g6w251d", |
|
479 |
+ "an", |
|
480 |
+ "", "", "", |
|
481 |
+ "xn--80akhbyknj4f", |
|
482 |
+ "ba", |
|
483 |
+ "", "", "", |
|
484 |
+ "xn--0zwm56d", |
|
485 |
+ "gg", |
|
486 |
+ "", "", "", "", |
|
487 |
+ "tm", |
|
488 |
+ "", "", "", "", |
|
489 |
+ "xn--11b5bs3a9aj6g", |
|
490 |
+ "", "", |
|
491 |
+ "hu", |
|
492 |
+ "", |
|
493 |
+ "pl", |
|
494 |
+ "rw", |
|
495 |
+ "", |
|
496 |
+ "mp", |
|
497 |
+ "", |
|
498 |
+ "uz", |
|
499 |
+ "", "", |
|
500 |
+ "ph", |
|
501 |
+ "", "", "", "", |
|
502 |
+ "lb", |
|
503 |
+ "", |
|
504 |
+ "bg", |
|
505 |
+ "", "", |
|
506 |
+ "np", |
|
507 |
+ "", |
|
508 |
+ "kz", |
|
509 |
+ "mil", |
|
510 |
+ "", |
|
511 |
+ "jm", |
|
512 |
+ "xn--deba0ad", |
|
513 |
+ "ci", |
|
514 |
+ "", "", |
|
515 |
+ "fi", |
|
516 |
+ "", |
|
517 |
+ "ai", |
|
518 |
+ "", "", "", "", |
|
519 |
+ "ni", |
|
520 |
+ "", "", |
|
521 |
+ "us", |
|
522 |
+ "", |
|
523 |
+ "sm", |
|
524 |
+ "", "", "", "", |
|
525 |
+ "tn", |
|
526 |
+ "", "", |
|
527 |
+ "sb", |
|
528 |
+ "", |
|
529 |
+ "hr", |
|
530 |
+ "", "", |
|
531 |
+ "uy", |
|
532 |
+ "", |
|
533 |
+ "pa", |
|
534 |
+ "", "", "", "", |
|
535 |
+ "ke", |
|
536 |
+ "xn--zckzah", |
|
537 |
+ "", |
|
538 |
+ "gw", |
|
539 |
+ "", |
|
540 |
+ "mt", |
|
541 |
+ "", "", |
|
542 |
+ "ky", |
|
543 |
+ "", |
|
544 |
+ "xn--jxalpdlp", |
|
545 |
+ "", "", |
|
546 |
+ "gf", |
|
547 |
+ "edu", |
|
548 |
+ "at", |
|
549 |
+ "", "", |
|
550 |
+ "vu", |
|
551 |
+ "", |
|
552 |
+ "kr", |
|
553 |
+ "", "", |
|
554 |
+ "tp", |
|
555 |
+ "", |
|
556 |
+ "dz", |
|
557 |
+ "", "", |
|
558 |
+ "eu", |
|
559 |
+ "", |
|
560 |
+ "pg", |
|
561 |
+ "", "", |
|
562 |
+ "bw", |
|
563 |
+ "", |
|
564 |
+ "sn", |
|
565 |
+ "xn--hlcj6aya9esc7a", |
|
566 |
+ "", |
|
567 |
+ "fj", |
|
568 |
+ "", |
|
569 |
+ "gm", |
|
570 |
+ "", "", |
|
571 |
+ "bf", |
|
572 |
+ "", |
|
573 |
+ "do", |
|
574 |
+ "", "", |
|
575 |
+ "gb", |
|
576 |
+ "", |
|
577 |
+ "ve", |
|
578 |
+ "", "", |
|
579 |
+ "es", |
|
580 |
+ "", |
|
581 |
+ "li", |
|
582 |
+ "jp", |
|
583 |
+ "", "", "", |
|
584 |
+ "ee", |
|
585 |
+ "", "", |
|
586 |
+ "pk", |
|
587 |
+ "", |
|
588 |
+ "de", |
|
589 |
+ "", "", |
|
590 |
+ "gq", |
|
591 |
+ "", |
|
592 |
+ "bm", |
|
593 |
+ "", "", |
|
594 |
+ "kh", |
|
595 |
+ "", |
|
596 |
+ "im", |
|
597 |
+ "", "", |
|
598 |
+ "bb", |
|
599 |
+ "", |
|
600 |
+ "er", |
|
601 |
+ "", "", "", "", |
|
602 |
+ "tt", |
|
603 |
+ "", "", |
|
604 |
+ "vc", |
|
605 |
+ "", |
|
606 |
+ "si", |
|
607 |
+ "", "", "", "", |
|
608 |
+ "gn", |
|
609 |
+ "", "", |
|
610 |
+ "ec", |
|
611 |
+ "", |
|
612 |
+ "lt", |
|
613 |
+ "", "", |
|
614 |
+ "iq", |
|
615 |
+ "", |
|
616 |
+ "ua", |
|
617 |
+ "", "", |
|
618 |
+ "pw", |
|
619 |
+ "", |
|
620 |
+ "tj", |
|
621 |
+ "", "", "", "", |
|
622 |
+ "za", |
|
623 |
+ "", "", |
|
624 |
+ "pf", |
|
625 |
+ "", |
|
626 |
+ "xn--kgbechtv", |
|
627 |
+ "", "", "", "", |
|
628 |
+ "bn", |
|
629 |
+ "xn--hgbk6aj7f53bba", |
|
630 |
+ "", "", "", |
|
631 |
+ "in", |
|
632 |
+ "int", |
|
633 |
+ "info", |
|
634 |
+ "gp", |
|
635 |
+ "", |
|
636 |
+ "st", |
|
637 |
+ "", "", "", "", |
|
638 |
+ "ug", |
|
639 |
+ "", "", "", "", |
|
640 |
+ "pm", |
|
641 |
+ "", "", "", "", |
|
642 |
+ "gi", |
|
643 |
+ "", "", "", "", |
|
644 |
+ "kg", |
|
645 |
+ "", "", |
|
646 |
+ "hk", |
|
647 |
+ "", |
|
648 |
+ "sj", |
|
649 |
+ "", "", |
|
650 |
+ "wf", |
|
651 |
+ "", "", "", "", "", "", |
|
652 |
+ "va", |
|
653 |
+ "", "", |
|
654 |
+ "uk", |
|
655 |
+ "", "", "", "", "", "", |
|
656 |
+ "bi", |
|
657 |
+ "biz", |
|
658 |
+ "", "", "", "", "", "", "", "", "", |
|
659 |
+ "", "", "", "", |
|
660 |
+ "gt", |
|
661 |
+ "", "", "", "", |
|
662 |
+ "pn", |
|
663 |
+ "", "", "", "", |
|
664 |
+ "vg", |
|
665 |
+ "", "", "", "", "", "", "", "", "", |
|
666 |
+ "eg", |
|
667 |
+ "", "", "", "", "", "", "", "", "", |
|
668 |
+ "bt", |
|
669 |
+ "", "", |
|
670 |
+ "zw", |
|
671 |
+ "", |
|
672 |
+ "it", |
|
673 |
+ "", "", |
|
674 |
+ "kw", |
|
675 |
+ "", "", "", "", "", "", |
|
676 |
+ "hm", |
|
677 |
+ "", "", "", "", "", "", "", "", "", |
|
678 |
+ "bj", |
|
679 |
+ "", "", |
|
680 |
+ "dk", |
|
681 |
+ "", "", "", "", "", "", "", "", "", |
|
682 |
+ "", "", |
|
683 |
+ "zm", |
|
684 |
+ "", "", "", "", |
|
685 |
+ "km", |
|
686 |
+ "", "", "", "", "", "", "", "", "", |
|
687 |
+ "", "", "", "", "", "", "", "", "", |
|
688 |
+ "", "", "", "", "", "", |
|
689 |
+ "hn", |
|
690 |
+ "", "", "", "", |
|
691 |
+ "pt", |
|
692 |
+ "", "", "", "", "", "", "", "", "", |
|
693 |
+ "yt", |
|
694 |
+ "", "", "", "", "", "", "", "", "", |
|
695 |
+ "", "", "", "", "", |
|
696 |
+ "kn", |
|
697 |
+ "", "", "", "", "", "", "", "", "", |
|
698 |
+ "dm", |
|
699 |
+ "", "", "", "", "", "", "", "", "", |
|
700 |
+ "", "", "", "", "", "", "", "", "", |
|
701 |
+ "", "", "", "", "", "", "", "", "", |
|
702 |
+ "kp", |
|
703 |
+ "", "", "", "", "", "", "", "", "", |
|
704 |
+ "", "", |
|
705 |
+ "vn", |
|
706 |
+ "", "", "", "", |
|
707 |
+ "ki", |
|
708 |
+ "", "", "", "", "", "", "", "", "", |
|
709 |
+ "", "", |
|
710 |
+ "xn--9t4b11yi5a", |
|
711 |
+ "", "", |
|
712 |
+ "ht", |
|
713 |
+ "", "", "", "", "", "", "", "", "", |
|
714 |
+ "", "", "", "", "", "", "", "", "", |
|
715 |
+ "", "", "", "", "", "", "", "", "", |
|
716 |
+ "", "", "", "", "", "", "", |
|
717 |
+ "vi", |
|
718 |
+ "", "", "", "", "", "", "", "", "", |
|
719 |
+ "", "", "", "", "", "", "", "", "", |
|
720 |
+ "", "", "", "", "", "", "", "", "", |
|
721 |
+ "", "", "", "", "", "", "", "", "", |
|
722 |
+ "", "", "", "", "", "", "", "", "", |
|
723 |
+ "", "", "", "", "", "", "", "", "", |
|
724 |
+ "et", |
|
725 |
+ "", "", "", "", "", "", "", "", "", |
|
726 |
+ "", "", "", "", "", "", "", "", "", |
|
727 |
+ "", "", "", "", "", "", "", "", "", |
|
728 |
+ "", "", |
|
729 |
+ "dj" |
|
730 |
+ }; |
|
731 |
+ |
|
732 |
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) |
|
733 |
+ { |
|
734 |
+ int key = tld_hash (str, len); |
|
735 |
+ |
|
736 |
+ if (key <= MAX_HASH_VALUE && key >= 0) |
|
737 |
+ if (len == lengthtable[key]) |
|
738 |
+ { |
|
739 |
+ const char *s = wordlist[key]; |
|
28 | 740 |
|
741 |
+ if (*str == *s && !memcmp (str + 1, s + 1, len - 1)) |
|
742 |
+ return s; |
|
743 |
+ } |
|
744 |
+ } |
|
745 |
+ return 0; |
|
746 |
+} |
... | ... |
@@ -49,16 +49,6 @@ int domainlist_match(const struct cl_engine* engine,char* real_url,const char* d |
49 | 49 |
{ |
50 | 50 |
const char* info; |
51 | 51 |
int rc = engine->domainlist_matcher ? regex_list_match(engine->domainlist_matcher,real_url,display_url,hostOnly ? pre_fixup : NULL,hostOnly,&info,0) : 0; |
52 |
- if(rc && info && info[0] && info[0] != ':') {/*match successful, and has custom flags*/ |
|
53 |
- if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) { |
|
54 |
- unsigned short notwantedflags=0; |
|
55 |
- sscanf(info,"%hx",&notwantedflags); |
|
56 |
- *flags &= ~notwantedflags;/* filter unwanted phishcheck flags */ |
|
57 |
- } |
|
58 |
- else { |
|
59 |
- cli_warnmsg("Phishcheck:Unknown flag format in domain-list, 3 hex digits expected"); |
|
60 |
- } |
|
61 |
- } |
|
62 | 52 |
return rc; |
63 | 53 |
} |
64 | 54 |
|
... | ... |
@@ -79,13 +69,6 @@ int is_domainlist_ok(const struct cl_engine* engine) |
79 | 79 |
return (engine && engine->domainlist_matcher) ? is_regex_ok(engine->domainlist_matcher) : 1; |
80 | 80 |
} |
81 | 81 |
|
82 |
-void domainlist_cleanup(const struct cl_engine* engine) |
|
83 |
-{ |
|
84 |
- if(engine && engine->domainlist_matcher) { |
|
85 |
- regex_list_cleanup(engine->domainlist_matcher); |
|
86 |
- } |
|
87 |
-} |
|
88 |
- |
|
89 | 82 |
void domainlist_done(struct cl_engine* engine) |
90 | 83 |
{ |
91 | 84 |
if(engine && engine->domainlist_matcher) { |
... | ... |
@@ -69,13 +69,6 @@ int is_whitelist_ok(const struct cl_engine* engine) |
69 | 69 |
return (engine && engine->whitelist_matcher) ? is_regex_ok(engine->whitelist_matcher) : 1; |
70 | 70 |
} |
71 | 71 |
|
72 |
-void whitelist_cleanup(const struct cl_engine* engine) |
|
73 |
-{ |
|
74 |
- if(engine && engine->whitelist_matcher) { |
|
75 |
- regex_list_cleanup(engine->whitelist_matcher); |
|
76 |
- } |
|
77 |
-} |
|
78 |
- |
|
79 | 72 |
void whitelist_done(struct cl_engine* engine) |
80 | 73 |
{ |
81 | 74 |
if(engine && engine->whitelist_matcher) { |
... | ... |
@@ -39,6 +39,7 @@ |
39 | 39 |
#include <ctype.h> |
40 | 40 |
|
41 | 41 |
#include "clamav.h" |
42 |
+#include "cltypes.h" |
|
42 | 43 |
#include "others.h" |
43 | 44 |
#include "mbox.h" |
44 | 45 |
#include "message.h" |
... | ... |
@@ -47,6 +48,7 @@ |
47 | 47 |
#include "phish_domaincheck_db.h" |
48 | 48 |
#include "phish_whitelist.h" |
49 | 49 |
#include "iana_tld.h" |
50 |
+#include "iana_cctld.h" |
|
50 | 51 |
|
51 | 52 |
|
52 | 53 |
#define DOMAIN_REAL 1 |
... | ... |
@@ -140,8 +142,6 @@ static char empty_string[]=""; |
140 | 140 |
#define CLOAKED_URL "^"ANY_CLOAK"(\\."ANY_CLOAK"){0,3}$" |
141 | 141 |
|
142 | 142 |
static const char cloaked_host_regex[] = CLOAKED_URL; |
143 |
-static const char tld_regex[] = "^"iana_tld"$"; |
|
144 |
-static const char cctld_regex[] = "^"iana_cctld"$"; |
|
145 | 143 |
static const char dotnet[] = ".net"; |
146 | 144 |
static const char adonet[] = "ado.net"; |
147 | 145 |
static const char aspnet[] = "asp.net"; |
... | ... |
@@ -151,7 +151,10 @@ static const char gt[]=">"; |
151 | 151 |
static const char src_text[] = "src"; |
152 | 152 |
static const char href_text[] = "href"; |
153 | 153 |
static const char mailto[] = "mailto:"; |
154 |
+static const char mailto_proto[] = "mailto://"; |
|
154 | 155 |
static const char https[]="https://"; |
156 |
+static const char http[]="http://"; |
|
157 |
+static const char ftp[] = "ftp://"; |
|
155 | 158 |
|
156 | 159 |
static const size_t href_text_len = sizeof(href_text); |
157 | 160 |
static const size_t src_text_len = sizeof(src_text); |
... | ... |
@@ -161,7 +164,10 @@ static const size_t aspnet_len = sizeof(aspnet)-1; |
161 | 161 |
static const size_t lt_len = sizeof(lt)-1; |
162 | 162 |
static const size_t gt_len = sizeof(gt)-1; |
163 | 163 |
static const size_t mailto_len = sizeof(mailto)-1; |
164 |
+static const size_t mailto_proto_len = sizeof(mailto_proto)-1; |
|
164 | 165 |
static const size_t https_len = sizeof(https)-1; |
166 |
+static const size_t http_len = sizeof(http)-1; |
|
167 |
+static const size_t ftp_len = sizeof(ftp)-1; |
|
165 | 168 |
|
166 | 169 |
/* for urls, including mailto: urls, and (broken) http:www... style urls*/ |
167 | 170 |
/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html |
... | ... |
@@ -169,41 +175,13 @@ static const size_t https_len = sizeof(https)-1; |
169 | 169 |
* So the 'safe' char class has been split up |
170 | 170 |
* */ |
171 | 171 |
/* character classes */ |
172 |
-#define URI_alpha "a-zA-Z" |
|
173 | 172 |
#define URI_digit "0-9" |
174 |
-#define URI_safe_nodot "-$_@&" |
|
175 |
-#define URI_safe "-$_@.&" |
|
176 |
-#define URI_extra "!*\"'()," |
|
177 |
- |
|
178 |
-#define URI_hex "[0-9a-fA-f]" |
|
179 |
-#define URI_escape "%"URI_hex"{2}" |
|
180 |
-#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */ |
|
181 |
-#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")" |
|
182 |
- |
|
183 |
-#define URI_xalphas_nodot URI_xalpha_nodot"*" |
|
184 |
- |
|
185 |
-#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot"" |
|
186 |
-#define URI_xpalpha URI_xalpha"|\\+" |
|
187 |
-#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+" |
|
188 |
-#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+" |
|
189 |
- |
|
190 |
-#define URI_scheme URI_ialpha |
|
191 |
-#define URI_tld iana_tld |
|
192 |
-#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*" |
|
193 |
- |
|
194 | 173 |
#define URI_IP_digits "["URI_digit"]{1,3}" |
195 | 174 |
#define URI_path_start "[/?:]?" |
196 | 175 |
#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start |
197 |
-#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path |
|
176 |
+#define URI_numeric_URI "(http|https|ftp:(//)?)?"URI_numeric_path |
|
198 | 177 |
#define URI_numeric_fragmentaddress URI_numeric_URI |
199 | 178 |
|
200 |
-#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1 |
|
201 |
-#define URI_URI2 URI_tld |
|
202 |
- |
|
203 |
-#define URI_fragmentaddress1 URI_URI1 |
|
204 |
-#define URI_fragmentaddress2 URI_URI2""URI_path_start |
|
205 |
- |
|
206 |
-#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+" |
|
207 | 179 |
|
208 | 180 |
/*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please. |
209 | 181 |
* there is fragmentaddress1, and 2 to work around the ISO limitation of 509 bytes max length for string constants*/ |
... | ... |
@@ -235,7 +213,6 @@ static int string_assign_concatenated(struct string* dest, const char* prefix, c |
235 | 235 |
static void string_assign_null(struct string* dest); |
236 | 236 |
static char *rfind(char *start, char c, size_t len); |
237 | 237 |
static char hex2int(const unsigned char* src); |
238 |
-static int isTLD(const struct phishcheck* pchk,const char* str,int len); |
|
239 | 238 |
static enum phish_status phishingCheck(const struct cl_engine* engine,struct url_check* urls); |
240 | 239 |
static const char* phishing_ret_toString(enum phish_status rc); |
241 | 240 |
|
... | ... |
@@ -416,7 +393,7 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p |
416 | 416 |
} |
417 | 417 |
|
418 | 418 |
tld = strrchr(realhost,'.'); |
419 |
- rc = tld ? isTLD(s,tld,tld-realhost-1) : 0; |
|
419 |
+ rc = tld ? !!in_tld_set(tld,tld-realhost-1) : 0; |
|
420 | 420 |
if(rc < 0) |
421 | 421 |
return rc; |
422 | 422 |
if(rc) |
... | ... |
@@ -438,28 +415,6 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p |
438 | 438 |
return 0; |
439 | 439 |
} |
440 | 440 |
|
441 |
-static int isCountryCode(const struct phishcheck* s,const char* str) |
|
442 |
-{ |
|
443 |
- return str ? !cli_regexec(&s->preg_cctld,str,0,NULL,0) : 0; |
|
444 |
-} |
|
445 |
- |
|
446 |
-static int isTLD(const struct phishcheck* pchk,const char* str,int len) |
|
447 |
-{ |
|
448 |
- if (!str) |
|
449 |
- return 0; |
|
450 |
- else { |
|
451 |
- char* s = cli_malloc(len+1); |
|
452 |
- int rc; |
|
453 |
- |
|
454 |
- if(!s) |
|
455 |
- return CL_EMEM; |
|
456 |
- strncpy(s,str,len); |
|
457 |
- s[len]='\0'; |
|
458 |
- rc = !cli_regexec(&pchk->preg_tld,s,0,NULL,0); |
|
459 |
- free(s); |
|
460 |
- return rc ? 1 : 0; |
|
461 |
- } |
|
462 |
-} |
|
463 | 441 |
|
464 | 442 |
/* |
465 | 443 |
* memrchr isn't standard, so I use this |
... | ... |
@@ -486,7 +441,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct |
486 | 486 |
string_assign(dest,host); |
487 | 487 |
return; |
488 | 488 |
} |
489 |
- if(isCountryCode(pchk,tld+1)) { |
|
489 |
+ if(in_cctld_set(tld+1, strlen(tld+1))) { |
|
490 | 490 |
const char* countrycode = tld+1; |
491 | 491 |
tld = rfind(host->data,'.',tld-host->data-1); |
492 | 492 |
if(!tld) { |
... | ... |
@@ -495,7 +450,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct |
495 | 495 |
string_assign(dest,host); |
496 | 496 |
return; |
497 | 497 |
} |
498 |
- if(!isTLD(pchk,tld+1,countrycode-tld-2)) { |
|
498 |
+ if(!in_tld_set(tld+1, countrycode-tld-2)) { |
|
499 | 499 |
string_assign_ref(dest,host,tld+1); |
500 | 500 |
return;/*it was a name like: subdomain.domain.uk, return domain.uk*/ |
501 | 501 |
} |
... | ... |
@@ -737,11 +692,7 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) |
737 | 737 |
/* @end points to last character we want to be part of the URL */ |
738 | 738 |
end = host_begin + host_len - 1; |
739 | 739 |
} |
740 |
- /* terminate URL with a slash, except when we're at end of string */ |
|
741 |
- if(host_begin[host_len]) { |
|
742 |
- host_begin[host_len] = '/'; |
|
743 |
- end++; |
|
744 |
- } |
|
740 |
+ host_begin[host_len] = '\0'; |
|
745 | 741 |
/* convert hostname to lowercase, but only hostname! */ |
746 | 742 |
str_make_lowercase(host_begin, host_len); |
747 | 743 |
/* some broken MUAs put > in the href, and then |
... | ... |
@@ -797,6 +748,40 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs) |
797 | 797 |
|
798 | 798 |
if(!ctx->found_possibly_unwanted) |
799 | 799 |
*ctx->virname=NULL; |
800 |
+#if 0 |
|
801 |
+ FILE *f = fopen("/home/edwin/quarantine/urls","r"); |
|
802 |
+ if(!f) |
|
803 |
+ abort(); |
|
804 |
+ while(!feof(f)) { |
|
805 |
+ struct url_check urls; |
|
806 |
+ char line1[4096]; |
|
807 |
+ char line2[4096]; |
|
808 |
+ char line3[4096]; |
|
809 |
+ |
|
810 |
+ fgets(line1, sizeof(line1), f); |
|
811 |
+ fgets(line2, sizeof(line2), f); |
|
812 |
+ fgets(line3, sizeof(line3), f); |
|
813 |
+ if(strcmp(line3, "\n") != 0) { |
|
814 |
+ strcpy(line1, line2); |
|
815 |
+ strcpy(line2, line3); |
|
816 |
+ fgets(line3, sizeof(line3), f); |
|
817 |
+ while(strcmp(line3, "\n") != 0) { |
|
818 |
+ fgets(line3, sizeof(line3),f); |
|
819 |
+ } |
|
820 |
+ } |
|
821 |
+ urls.flags = CL_PHISH_ALL_CHECKS; |
|
822 |
+ urls.link_type = 0; |
|
823 |
+ string_init_c(&urls.realLink, line1); |
|
824 |
+ string_init_c(&urls.displayLink, line2); |
|
825 |
+ string_init_c(&urls.pre_fixup.pre_displayLink, NULL); |
|
826 |
+ urls.realLink.refcount=-1; |
|
827 |
+ urls.displayLink.refcount=-1; |
|
828 |
+ int rc = phishingCheck(ctx->engine, &urls); |
|
829 |
+ //printf("%d\n",rc); |
|
830 |
+ } |
|
831 |
+ fclose(f); |
|
832 |
+ return 0; |
|
833 |
+#endif |
|
800 | 834 |
for(i=0;i<hrefs->count;i++) |
801 | 835 |
if(hrefs->contents[i]) { |
802 | 836 |
struct url_check urls; |
... | ... |
@@ -928,44 +913,7 @@ int phishing_init(struct cl_engine* engine) |
928 | 928 |
return CL_EFORMAT; |
929 | 929 |
} |
930 | 930 |
|
931 |
- if(build_regex(&pchk->preg_cctld,cctld_regex,1)) { |
|
932 |
- free(pchk); |
|
933 |
- engine->phishcheck = NULL; |
|
934 |
- return CL_EFORMAT; |
|
935 |
- } |
|
936 |
- if(build_regex(&pchk->preg_tld,tld_regex,1)) { |
|
937 |
- free_regex(&pchk->preg_cctld); |
|
938 |
- free(pchk); |
|
939 |
- engine->phishcheck = NULL; |
|
940 |
- return CL_EFORMAT; |
|
941 |
- } |
|
942 |
- url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$"); |
|
943 |
- if(!url_regex || build_regex(&pchk->preg,url_regex,1)) { |
|
944 |
- free_regex(&pchk->preg_cctld); |
|
945 |
- free_regex(&pchk->preg_tld); |
|
946 |
- free(url_regex); |
|
947 |
- free(pchk); |
|
948 |
- engine->phishcheck = NULL; |
|
949 |
- return CL_EFORMAT; |
|
950 |
- } |
|
951 |
- free(url_regex); |
|
952 |
- realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$"); |
|
953 |
- if(!realurl_regex || build_regex(&pchk->preg_realurl, realurl_regex,1)) { |
|
954 |
- free_regex(&pchk->preg_cctld); |
|
955 |
- free_regex(&pchk->preg_tld); |
|
956 |
- free_regex(&pchk->preg); |
|
957 |
- free(url_regex); |
|
958 |
- free(realurl_regex); |
|
959 |
- free(pchk); |
|
960 |
- engine->phishcheck = NULL; |
|
961 |
- return CL_EFORMAT; |
|
962 |
- } |
|
963 |
- free(realurl_regex); |
|
964 | 931 |
if(build_regex(&pchk->preg_numeric,numeric_url_regex,1)) { |
965 |
- free_regex(&pchk->preg_cctld); |
|
966 |
- free_regex(&pchk->preg_tld); |
|
967 |
- free_regex(&pchk->preg); |
|
968 |
- free_regex(&pchk->preg_realurl); |
|
969 | 932 |
free(pchk); |
970 | 933 |
engine->phishcheck = NULL; |
971 | 934 |
return CL_EFORMAT; |
... | ... |
@@ -980,12 +928,8 @@ void phishing_done(struct cl_engine* engine) |
980 | 980 |
struct phishcheck* pchk = engine->phishcheck; |
981 | 981 |
cli_dbgmsg("Cleaning up phishcheck\n"); |
982 | 982 |
if(pchk && !pchk->is_disabled) { |
983 |
- free_regex(&pchk->preg); |
|
984 | 983 |
free_regex(&pchk->preg_hexurl); |
985 |
- free_regex(&pchk->preg_cctld); |
|
986 |
- free_regex(&pchk->preg_tld); |
|
987 | 984 |
free_regex(&pchk->preg_numeric); |
988 |
- free_regex(&pchk->preg_realurl); |
|
989 | 985 |
pchk->is_disabled = 1; |
990 | 986 |
} |
991 | 987 |
whitelist_done(engine); |
... | ... |
@@ -998,22 +942,165 @@ void phishing_done(struct cl_engine* engine) |
998 | 998 |
cli_dbgmsg("Phishcheck cleaned up\n"); |
999 | 999 |
} |
1000 | 1000 |
|
1001 |
+ |
|
1002 |
/*
 * Character-class lookup tables for URI validation, indexed by unsigned
 * byte value: a non-zero entry means the byte belongs to the class.
 */

/* URI_alpha: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz */
static const uint8_t URI_alpha[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* URI_xalpha_nodot: !"$%&'()*,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz */
static const uint8_t URI_xalpha_nodot[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* URI_xpalpha_nodot: same as URI_xalpha_nodot, plus '+' */
static const uint8_t URI_xpalpha_nodot[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
 * Check that every byte in [start, end) is in URI_xalpha_nodot.
 * An empty range is accepted. Returns 1 when valid, 0 otherwise.
 */
static inline int validate_uri_xalphas_nodot(const char *start, const char *end)
{
	/* bytes are read as unsigned so values >= 0x80 index the table correctly */
	const unsigned char *p = (const unsigned char *)start;
	const unsigned char *stop = (const unsigned char *)end;

	for(; p < stop; p++) {
		if(!URI_xalpha_nodot[*p])
			return 0;
	}
	return 1;
}

/*
 * Check that every byte in [start, end) is in URI_xpalpha_nodot.
 * Unlike validate_uri_xalphas_nodot, an empty range is rejected.
 * Returns 1 when valid, 0 otherwise.
 */
static inline int validate_uri_xpalphas_nodot(const char *start, const char *end)
{
	const unsigned char *first = (const unsigned char *)start;
	const unsigned char *stop = (const unsigned char *)end;
	const unsigned char *p;

	for(p = first; p < stop; p++) {
		if(!URI_xpalpha_nodot[*p])
			return 0;
	}
	/* must have at least one char */
	return p > first;
}

/*
 * Check that [start, end) is non-empty, begins with a letter (URI_alpha)
 * and continues with URI_xalpha_nodot bytes only.
 * Returns 1 when valid, 0 otherwise.
 */
static inline int validate_uri_ialpha(const char *start, const char *end)
{
	const unsigned char *p = (const unsigned char *)start;

	if(start >= end || !URI_alpha[*p])
		return 0;
	return validate_uri_xalphas_nodot(start + 1, end);
}
|
1091 |
+ |
|
1001 | 1092 |
/* |
1002 | 1093 |
* Only those URLs are identified as URLs for which phishing detection can be performed. |
1003 | 1094 |
*/ |
1004 |
-static int isURL(const struct phishcheck* pchk,const char* URL) |
|
1095 |
+static int isURL(const struct phishcheck* pchk,const char* URL, int accept_anyproto) |
|
1005 | 1096 |
{ |
1006 |
- return URL ? !cli_regexec(&pchk->preg,URL,0,NULL,0) : 0; |
|
1097 |
+ const char *start = NULL, *p, *q; |
|
1098 |
+ if(!URL) |
|
1099 |
+ return 0; |
|
1100 |
+ |
|
1101 |
+ switch (URL[0]) { |
|
1102 |
+ case 'h': |
|
1103 |
+ if (strncmp(URL, https, https_len) == 0) |
|
1104 |
+ start = URL + https_len; |
|
1105 |
+ else if (strncmp(URL, http, http_len) == 0) |
|
1106 |
+ start = URL + http_len; |
|
1107 |
+ break; |
|
1108 |
+ case 'f': |
|
1109 |
+ if (strncmp(URL, ftp, ftp_len) == 0) |
|
1110 |
+ start = URL + ftp_len; |
|
1111 |
+ break; |
|
1112 |
+ case 'm': |
|
1113 |
+ if (strncmp(URL, mailto_proto, mailto_proto_len) == 0) |
|
1114 |
+ start = URL + mailto_proto_len; |
|
1115 |
+ break; |
|
1116 |
+ } |
|
1117 |
+ if(start) { |
|
1118 |
+ if(start[0] == '\0') |
|
1119 |
+ return 0;/* empty URL */ |
|
1120 |
+ /* has a valid protocol, it is a URL */ |
|
1121 |
+ return 1; |
|
1122 |
+ } |
|
1123 |
+ start = accept_anyproto ? strchr(URL, ':') : NULL; |
|
1124 |
+ if(start) { |
|
1125 |
+ /* validate URI scheme */ |
|
1126 |
+ if(validate_uri_ialpha(URL, start)) { |
|
1127 |
+ if(start[1] == '/' && start[2] == '/') |
|
1128 |
+ start += 3; /* skip :// */ |
|
1129 |
+ else |
|
1130 |
+ start++; |
|
1131 |
+ } |
|
1132 |
+ else |
|
1133 |
+ start = URL; /* scheme invalid */ |
|
1134 |
+ } else |
|
1135 |
+ start = URL; |
|
1136 |
+ p = start; |
|
1137 |
+ do { |
|
1138 |
+ q = strchr(p, '.'); |
|
1139 |
+ if(q) { |
|
1140 |
+ if(!validate_uri_xpalphas_nodot(p, q)) |
|
1141 |
+ return 0; |
|
1142 |
+ p = q+1; |
|
1143 |
+ } |
|
1144 |
+ } while(q); |
|
1145 |
+ if (p == start) /* must have at least one dot in the URL */ |
|
1146 |
+ return 0; |
|
1147 |
+ return !!in_tld_set(p, strlen(p)); |
|
1007 | 1148 |
} |
1008 | 1149 |
|
1009 | 1150 |
/*
 * Check if this is a real URL, i.e. one that has a known URL scheme
 * (http, https, ftp). This prevents false positives with outbind:// and
 * blocked:: links.
 *
 * Compiled out; phishingCheck() calls isURL(..., 0) on the real link instead.
 */
#if 0
static int isRealURL(const struct phishcheck* pchk,const char* URL)
{
	if(!URL)
		return 0;
	/* cli_regexec() returning 0 means the pattern matched */
	return cli_regexec(&pchk->preg_realurl,URL,0,NULL,0) == 0;
}
#endif
|
1017 | 1160 |
|
1018 | 1161 |
static int isNumericURL(const struct phishcheck* pchk,const char* URL) |
1019 | 1162 |
{ |
... | ... |
@@ -1139,7 +1226,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url |
1139 | 1139 |
cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data, |
1140 | 1140 |
urls->displayLink.data); |
1141 | 1141 |
|
1142 |
- if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) ) && |
|
1142 |
+ if((!isURL(pchk, urls->displayLink.data, 1) || !isURL(pchk, urls->realLink.data, 0) ) && |
|
1143 | 1143 |
( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) || |
1144 | 1144 |
!(phishy&PHISHY_NUMERIC_IP))) { |
1145 | 1145 |
cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data); |
... | ... |
@@ -1839,6 +1839,12 @@ int cl_build(struct cl_engine *engine) |
1839 | 1839 |
} |
1840 | 1840 |
} |
1841 | 1841 |
|
1842 |
+ if((ret = cli_build_regex_list(engine->whitelist_matcher))) { |
|
1843 |
+ return ret; |
|
1844 |
+ } |
|
1845 |
+ if((ret = cli_build_regex_list(engine->domainlist_matcher))) { |
|
1846 |
+ return ret; |
|
1847 |
+ } |
|
1842 | 1848 |
cli_md5db_build(engine->md5_mdb); |
1843 | 1849 |
cli_freeign(engine); |
1844 | 1850 |
cli_dconf_print(engine->dconf); |
... | ... |
@@ -42,6 +42,8 @@ |
42 | 42 |
|
43 | 43 |
#include <limits.h> |
44 | 44 |
#include <sys/types.h> |
45 |
+#include <assert.h> |
|
46 |
+ |
|
45 | 47 |
|
46 | 48 |
#include "regex/regex.h" |
47 | 49 |
|
... | ... |
@@ -53,152 +55,471 @@ |
53 | 53 |
#include "matcher.h" |
54 | 54 |
#include "str.h" |
55 | 55 |
#include "readdb.h" |
56 |
+#include "jsparse/textbuf.h" |
|
57 |
+ |
|
58 |
+/* ------- parse a regular expression, and extract a static suffix ------*/ |
|
59 |
/* Kinds of nodes in the parsed regular-expression tree. */
enum node_type {
	root = 0,      /* top of the tree */
	concat = 1,    /* left followed by right */
	alternate = 2, /* left | right */
	optional = 3,  /* operand followed by ? or * */
	leaf = 4,      /* a single literal character */
	leaf_class = 5 /* a character class, stored as a bitmap */
	/* note: (x)+ has no node type of its own, it is transformed into (x)*(x) */
};

/*
 * One node of the regular-expression tree.
 * Which member of `u` is valid depends on `type`:
 *   leaf        -> u.leaf_char
 *   leaf_class  -> u.leaf_class_bitmap
 *   other types -> u.children
 */
struct node {
	enum node_type type;
	struct node *parent; /* enclosing node, NULL when not attached */
	union {
		struct {
			struct node* left;
			struct node* right;
		} children;
		uint8_t* leaf_class_bitmap; /* 32 bytes, one bit per byte value */
		uint8_t leaf_char;
	} u;
};
81 | 81 |
|
82 |
-struct leaf_info { |
|
83 |
- char* info;/* what does it mean that we reached the leaf...*/ |
|
84 |
- regex_t* preg;/* this is NULL if leaf node, and non-regex*/ |
|
85 |
-}; |
|
82 |
+/* Prototypes */ |
|
83 |
+static size_t reverse_string(char *pattern); |
|
84 |
+static int add_pattern(struct regex_matcher *matcher, char *pattern); |
|
85 |
+static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex); |
|
86 |
+static int add_static_pattern(struct regex_matcher *matcher, char* pattern); |
|
87 |
+static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf); |
|
88 |
+/* ---------- */ |
|
86 | 89 |
|
87 |
-/* Character classes */ |
|
88 |
-static const char* std_class[] = { |
|
89 |
- "[:alnum:]", |
|
90 |
- "[:digit:]", |
|
91 |
- "[:punct:]", |
|
92 |
- "[:alpha:]", |
|
93 |
- "[:graph:]", |
|
94 |
- "[:space:]", |
|
95 |
- "[:blank:]", |
|
96 |
- "[:lower:]", |
|
97 |
- "[:upper:]", |
|
98 |
- "[:cntrl:]", |
|
99 |
- "[:print:]", |
|
100 |
- "[:xdigit:]" |
|
101 |
- /* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/ |
|
102 |
-}; |
|
90 |
+static uint8_t dot_bitmap[32]; |
|
103 | 91 |
|
92 |
+static struct node* make_node(enum node_type type, struct node *left, struct node *right) |
|
93 |
+{ |
|
94 |
+ struct node *n; |
|
95 |
+ if(type == concat) { |
|
96 |
+ if(left == NULL) |
|
97 |
+ return right; |
|
98 |
+ if(right == NULL) |
|
99 |
+ return left; |
|
100 |
+ } |
|
101 |
+ n = cli_malloc(sizeof(*n)); |
|
102 |
+ if(!n) |
|
103 |
+ return NULL; |
|
104 |
+ n->type = type; |
|
105 |
+ n->parent = NULL; |
|
106 |
+ n->u.children.left = left; |
|
107 |
+ n->u.children.right = right; |
|
108 |
+ if(left) |
|
109 |
+ left->parent = n; |
|
110 |
+ if(right) |
|
111 |
+ right->parent = n; |
|
112 |
+ return n; |
|
113 |
+} |
|
104 | 114 |
|
105 |
-#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0]) |
|
106 |
- |
|
107 |
-/* generated by contrib/phishing/generate_tables.c */ |
|
108 |
-static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = { |
|
109 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, |
|
110 |
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, |
|
111 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
112 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
113 |
- |
|
114 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, |
|
115 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
116 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
117 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
118 |
- |
|
119 |
- {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, |
|
120 |
- 0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, |
|
121 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
122 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
123 |
- |
|
124 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
125 |
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, |
|
126 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
127 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
128 |
- |
|
129 |
- {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, |
|
130 |
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, |
|
131 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
132 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
133 |
- |
|
134 |
- {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, |
|
135 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
136 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
137 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
138 |
- |
|
139 |
- {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, |
|
140 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
141 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
142 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
143 |
- |
|
144 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
145 |
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, |
|
146 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
147 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
148 |
- |
|
149 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
150 |
- 0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, |
|
151 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
152 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
153 |
- |
|
154 |
- {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, |
|
155 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, |
|
156 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
157 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
158 |
- |
|
159 |
- {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, |
|
160 |
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, |
|
161 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
162 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, |
|
163 |
- |
|
164 |
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, |
|
165 |
- 0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, |
|
166 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
167 |
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} |
|
168 |
-}; |
|
115 |
+static struct node *dup_node(struct node *p) |
|
116 |
+{ |
|
117 |
+ struct node *node_left, *node_right; |
|
118 |
+ struct node *d; |
|
169 | 119 |
|
170 |
-static const unsigned short int char_class[256] = { |
|
171 |
- 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, |
|
172 |
- 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, |
|
173 |
- 0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, |
|
174 |
- 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, |
|
175 |
- 0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, |
|
176 |
- 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, |
|
177 |
- 0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, |
|
178 |
- 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, |
|
179 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
180 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
181 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
182 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
183 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
184 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
185 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, |
|
186 |
- 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000 |
|
187 |
-}; |
|
120 |
+ if(!p) |
|
121 |
+ return NULL; |
|
122 |
+ d = cli_malloc(sizeof(*d)); |
|
123 |
+ if(!d) |
|
124 |
+ return NULL; |
|
125 |
+ d->type = p->type; |
|
126 |
+ d->parent = NULL; |
|
127 |
+ switch(p->type) { |
|
128 |
+ case leaf: |
|
129 |
+ d->u.leaf_char = p->u.leaf_char; |
|
130 |
+ break; |
|
131 |
+ case leaf_class: |
|
132 |
+ d->u.leaf_class_bitmap = cli_malloc(32); |
|
133 |
+ if(!d->u.leaf_class_bitmap) |
|
134 |
+ return NULL; |
|
135 |
+ memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32); |
|
136 |
+ break; |
|
137 |
+ default: |
|
138 |
+ node_left = dup_node(p->u.children.left); |
|
139 |
+ node_right = dup_node(p->u.children.right); |
|
140 |
+ d->u.children.left = node_left; |
|
141 |
+ d->u.children.right = node_right; |
|
142 |
+ if(node_left) |
|
143 |
+ node_left->parent = d; |
|
144 |
+ if(node_right) |
|
145 |
+ node_right->parent = d; |
|
146 |
+ break; |
|
147 |
+ } |
|
148 |
+ return d; |
|
149 |
+} |
|
188 | 150 |
|
189 |
-static const size_t std_class_cnt = sizeof(std_class)/sizeof(std_class[0]); |
|
151 |
+static struct node *make_charclass(uint8_t *bitmap) |
|
152 |
+{ |
|
153 |
+ struct node *v = cli_malloc(sizeof(*v)); |
|
154 |
+ if(!v) |
|
155 |
+ return NULL; |
|
156 |
+ v->type = leaf_class; |
|
157 |
+ v->parent = NULL; |
|
158 |
+ v->u.leaf_class_bitmap = bitmap; |
|
159 |
+ return v; |
|
160 |
+} |
|
161 |
+ |
|
162 |
+static struct node *make_leaf(char c) |
|
163 |
+{ |
|
164 |
+ struct node *v = cli_malloc(sizeof(*v)); |
|
165 |
+ if(!v) |
|
166 |
+ return NULL; |
|
167 |
+ v->type = leaf; |
|
168 |
+ v->parent = NULL; |
|
169 |
+ v->u.leaf_char = c; |
|
170 |
+ return v; |
|
171 |
+} |
|
172 |
+ |
|
173 |
+static void destroy_tree(struct node *n) |
|
174 |
+{ |
|
175 |
+ if(!n) |
|
176 |
+ return; |
|
177 |
+ switch(n->type) { |
|
178 |
+ case concat: |
|
179 |
+ case alternate: |
|
180 |
+ case optional: |
|
181 |
+ destroy_tree(n->u.children.left); |
|
182 |
+ destroy_tree(n->u.children.right); |
|
183 |
+ break; |
|
184 |
+ case leaf_class: |
|
185 |
+ if(n->u.leaf_class_bitmap != dot_bitmap) |
|
186 |
+ free(n->u.leaf_class_bitmap); |
|
187 |
+ break; |
|
188 |
+ case root: |
|
189 |
+ case leaf: |
|
190 |
+ break; |
|
191 |
+ } |
|
192 |
+ free(n); |
|
193 |
+} |
|
194 |
+ |
|
195 |
+static uint8_t* parse_char_class(const char *pat, size_t *pos) |
|
196 |
+{ |
|
197 |
+ unsigned char range_start=0; |
|
198 |
+ int hasprev = 0; |
|
199 |
+ uint8_t* bitmap = cli_malloc(32); |
|
200 |
+ if(!bitmap) |
|
201 |
+ return NULL; |
|
202 |
+ if (pat[*pos]=='^') { |
|
203 |
+ memset(bitmap,0xFF,32);/*match chars not in brackets*/ |
|
204 |
+ ++*pos; |
|
205 |
+ } |
|
206 |
+ else |
|
207 |
+ memset(bitmap,0x00,32); |
|
208 |
+ do { |
|
209 |
+ /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ |
|
210 |
+ if (pat[*pos]=='-' && hasprev) { |
|
211 |
+ /* it is a range*/ |
|
212 |
+ unsigned char range_end; |
|
213 |
+ unsigned int c; |
|
214 |
+ assert(range_start); |
|
215 |
+ ++*pos; |
|
216 |
+ if (pat[*pos]=='[') |
|
217 |
+ if (pat[*pos+1]=='.') { |
|
218 |
+ /* collating sequence not handled */ |
|
219 |
+ free(bitmap); |
|
220 |
+ /* we are parsing the regex for a |
|
221 |
+ * filter, be conservative and |
|
222 |
+ * tell the filter that anything could |
|
223 |
+ * match here */ |
|
224 |
+ while(pat[*pos] != ']') ++*pos; |
|
225 |
+ ++*pos; |
|
226 |
+ while(pat[*pos] != ']') ++*pos; |
|
227 |
+ return dot_bitmap; |
|
228 |
+ } |
|
229 |
+ else |
|
230 |
+ range_end = pat[*pos]; |
|
231 |
+ else |
|
232 |
+ range_end = pat[*pos]; |
|
233 |
+ for(c=range_start+1;c<=range_end;c++) |
|
234 |
+ bitmap[c>>3] ^= 1<<(c&0x7); |
|
235 |
+ hasprev = 0; |
|
236 |
+ } |
|
237 |
+ else if (pat[*pos]=='[' && pat[*pos]==':') { |
|
238 |
+ /* char class */ |
|
239 |
+ free(bitmap); |
|
240 |
+ while(pat[*pos] != ']') ++*pos; |
|
241 |
+ ++*pos; |
|
242 |
+ while(pat[*pos] != ']') ++*pos; |
|
243 |
+ return dot_bitmap; |
|
244 |
+ } else { |
|
245 |
+ bitmap[pat[*pos]>>3] ^= 1<<(pat[*pos]&0x7); |
|
246 |
+ ++*pos; |
|
247 |
+ range_start = pat[*pos]; |
|
248 |
+ hasprev = 1; |
|
249 |
+ } |
|
250 |
+ } while(pat[*pos]!=']'); |
|
251 |
+ return bitmap; |
|
252 |
+} |
|
253 |
+ |
|
254 |
+static struct node* parse_regex(const char *p, size_t *last) |
|
255 |
+{ |
|
256 |
+ struct node *v = NULL; |
|
257 |
+ struct node *right; |
|
258 |
+ struct node *tmp; |
|
259 |
+ |
|
260 |
+ while(p[*last] != '$' && p[*last] != '\0') { |
|
261 |
+ switch(p[*last]) { |
|
262 |
+ case '|': |
|
263 |
+ ++*last; |
|
264 |
+ right = parse_regex(p, last); |
|
265 |
+ v = make_node(alternate, v, right); |
|
266 |
+ if(!v) |
|
267 |
+ return NULL; |
|
268 |
+ break; |
|
269 |
+ case '*': |
|
270 |
+ case '?': |
|
271 |
+ v = make_node(optional, v, NULL); |
|
272 |
+ if(!v) |
|
273 |
+ return NULL; |
|
274 |
+ ++*last; |
|
275 |
+ break; |
|
276 |
+ case '+': |
|
277 |
+ /* (x)* */ |
|
278 |
+ tmp = make_node(optional, v, NULL); |
|
279 |
+ if(!tmp) |
|
280 |
+ return NULL; |
|
281 |
+ /* (x) */ |
|
282 |
+ right = dup_node(v); |
|
283 |
+ if(!right) |
|
284 |
+ return NULL; |
|
285 |
+ /* (x)*(x) => (x)+ */ |
|
286 |
+ v = make_node(concat, tmp, right); |
|
287 |
+ if(!v) |
|
288 |
+ return NULL; |
|
289 |
+ ++*last; |
|
290 |
+ break; |
|
291 |
+ case '(': |
|
292 |
+ ++*last; |
|
293 |
+ right = parse_regex(p, last); |
|
294 |
+ if(!right) |
|
295 |
+ return NULL; |
|
296 |
+ ++*last; |
|
297 |
+ v = make_node(concat, v, right); |
|
298 |
+ break; |
|
299 |
+ case ')': |
|
300 |
+ return v; |
|
301 |
+ case '.': |
|
302 |
+ right = make_charclass(dot_bitmap); |
|
303 |
+ if(!right) |
|
304 |
+ return NULL; |
|
305 |
+ v = make_node(concat, v, right); |
|
306 |
+ if(!v) |
|
307 |
+ return NULL; |
|
308 |
+ ++*last; |
|
309 |
+ break; |
|
310 |
+ case '[': |
|
311 |
+ right = make_charclass( parse_char_class(p, last) ); |
|
312 |
+ if(!right) |
|
313 |
+ return NULL; |
|
314 |
+ v = make_node(concat, v, right); |
|
315 |
+ if(!v) |
|
316 |
+ return NULL; |
|
317 |
+ case '\\': |
|
318 |
+ /* next char is escaped, advance pointer |
|
319 |
+ * and let fall-through handle it */ |
|
320 |
+ ++*last; |
|
321 |
+ default: |
|
322 |
+ right = make_leaf(p[*last]); |
|
323 |
+ v = make_node(concat, v, right); |
|
324 |
+ if(!v) |
|
325 |
+ return NULL; |
|
326 |
+ ++*last; |
|
327 |
+ break; |
|
328 |
+ } |
|
329 |
+ } |
|
330 |
+ return v; |
|
331 |
+} |
|
332 |
+ |
|
333 |
+#define BITMAP_HASSET(b, i) (b[i>>3] & (1<<(i&7))) |
|
334 |
+ |
|
335 |
+static int build_suffixtree_ascend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf, struct node *prev) |
|
336 |
+{ |
|
337 |
+ size_t i; |
|
338 |
+ while(n) { |
|
339 |
+ struct node *q = n; |
|
340 |
+ switch(n->type) { |
|
341 |
+ case root: |
|
342 |
+ textbuffer_putc(buf, '\0'); |
|
343 |
+ if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) |
|
344 |
+ return CL_EMEM; |
|
345 |
+ return 0; |
|
346 |
+ case leaf: |
|
347 |
+ textbuffer_putc(buf, n->u.leaf_char); |
|
348 |
+ n = n->parent; |
|
349 |
+ break; |
|
350 |
+ case leaf_class: |
|
351 |
+ if(memcmp(n->u.leaf_class_bitmap, dot_bitmap, sizeof(dot_bitmap)) == 0) { |
|
352 |
+ textbuffer_putc(buf, '\0'); |
|
353 |
+ if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) |
|
354 |
+ return CL_EMEM; |
|
355 |
+ return 0; |
|
356 |
+ } |
|
357 |
+ for(i=0;i<255;i++) { |
|
358 |
+ if(BITMAP_HASSET(n->u.leaf_class_bitmap, i)) { |
|
359 |
+ size_t pos; |
|
360 |
+ pos = buf->pos; |
|
361 |
+ textbuffer_putc(buf, i); |
|
362 |
+ if(build_suffixtree_ascend(matcher, regex, n->parent, buf, n) < 0) |
|
363 |
+ return CL_EMEM; |
|
364 |
+ buf->pos = pos; |
|
365 |
+ } |
|
366 |
+ } |
|
367 |
+ return 0; |
|
368 |
+ case concat: |
|
369 |
+ if(prev != n->u.children.left) { |
|
370 |
+ if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0) |
|
371 |
+ return CL_EMEM; |
|
372 |
+ /* we're done here, descend will call |
|
373 |
+ * ascend if needed */ |
|
374 |
+ return 0; |
|
375 |
+ } else { |
|
376 |
+ n = n->parent; |
|
377 |
+ } |
|
378 |
+ break; |
|
379 |
+ case alternate: |
|
380 |
+ n = n->parent; |
|
381 |
+ break; |
|
382 |
+ case optional: |
|
383 |
+ textbuffer_putc(buf, '\0'); |
|
384 |
+ if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) |
|
385 |
+ return CL_EMEM; |
|
386 |
+ return 0; |
|
387 |
+ } |
|
388 |
+ prev = q; |
|
389 |
+ } |
|
390 |
+ return 0; |
|
391 |
+} |
|
392 |
+ |
|
393 |
+static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf) |
|
394 |
+{ |
|
395 |
+ size_t pos; |
|
396 |
+ while(n && n->type == concat) { |
|
397 |
+ n = n->u.children.right; |
|
398 |
+ } |
|
399 |
+ if(!n) |
|
400 |
+ return 0; |
|
401 |
+ /* find out end of the regular expression, |
|
402 |
+ * if it ends with a static pattern */ |
|
403 |
+ switch(n->type) { |
|
404 |
+ case alternate: |
|
405 |
+ /* save pos as restart point */ |
|
406 |
+ pos = buf->pos; |
|
407 |
+ if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0) |
|
408 |
+ return CL_EMEM; |
|
409 |
+ buf->pos = pos; |
|
410 |
+ if(build_suffixtree_descend(matcher, regex, n->u.children.right, buf) < 0) |
|
411 |
+ return CL_EMEM; |
|
412 |
+ buf->pos = pos; |
|
413 |
+ break; |
|
414 |
+ case optional: |
|
415 |
+ textbuffer_putc(buf, '\0'); |
|
416 |
+ if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0) |
|
417 |
+ return CL_EMEM; |
|
418 |
+ return 0; |
|
419 |
+ case leaf: |
|
420 |
+ case leaf_class: |
|
421 |
+ if(build_suffixtree_ascend(matcher, regex, n, buf, NULL) < 0) |
|
422 |
+ return CL_EMEM; |
|
423 |
+ return 0; |
|
424 |
+ default: |
|
425 |
+ break; |
|
426 |
+ } |
|
427 |
+ return 0; |
|
428 |
+} |
|
429 |
+ |
|
430 |
+ |
|
431 |
+/* ----- shift-or filtering -------------- */ |
|
432 |
+ |
|
433 |
/*
 * Bit set stored in an array of 32-bit words: value `val` lives in word
 * val/32, bit val%32. The shifted constant is unsigned so that bit 31 can be
 * addressed without shifting into the sign bit of an int.
 * BITMAP_CONTAINS yields non-zero when val is present.
 */
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1U << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1U << ((val) & 0x1f)))
|
435 |
+ |
|
436 |
+static void SO_init(struct filter *m) |
|
437 |
+{ |
|
438 |
+ memset(m->B, ~0, sizeof(m->B)); |
|
439 |
+ memset(m->end, ~0, sizeof(m->end)); |
|
440 |
+ memset(m->end_fast, ~0, sizeof(m->end_fast)); |
|
441 |
+} |
|
442 |
+ |
|
443 |
+/* because we use uint32_t */ |
|
444 |
+#define MAXSOPATLEN 32 |
|
445 |
+ |
|
446 |
+/* merge another pattern into the filter |
|
447 |
+ * add('abc'); add('bcd'); will match [ab][bc][cd] */ |
|
448 |
+static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len) |
|
449 |
+{ |
|
450 |
+ uint16_t q; |
|
451 |
+ uint8_t j; |
|
452 |
+ |
|
453 |
+ /* cut length, and make it modulo 2 */ |
|
454 |
+ if(len > MAXSOPATLEN) { |
|
455 |
+ len = MAXSOPATLEN; |
|
456 |
+ } else { |
|
457 |
+ /* we use 2-grams, must be multiple of 2 */ |
|
458 |
+ len = len & ~1; |
|
459 |
+ } |
|
460 |
+ if(!len) |
|
461 |
+ return 0; |
|
462 |
+ |
|
463 |
+ /* Shift-Or like preprocessing */ |
|
464 |
+ for(j=0;j < len-1;j++) { |
|
465 |
+ /* use overlapping 2-grams. We need them overlapping because matching can start at any position */ |
|
466 |
+ q = cli_readint16( &pattern[j] ); |
|
467 |
+ m->B[q] &= ~(1 << j); |
|
468 |
+ } |
|
469 |
+ /* we use variable length patterns, use last character to mark pattern end, |
|
470 |
+ * can lead to false positives.*/ |
|
471 |
+ /* mark that at state j, the q-gram q can end the pattern */ |
|
472 |
+ if(j) { |
|
473 |
+ j--; |
|
474 |
+ m->end[q] &= ~(1 << j); |
|
475 |
+ m->end_fast[pattern[j]] &= (1<<j); |
|
476 |
+ } |
|
477 |
+ return 0; |
|
478 |
+} |
|
479 |
+ |
|
480 |
+/* this is like a FSM, with multiple active states at the same time. |
|
481 |
+ * each bit in "state" means an active state, when a char is encountered |
|
482 |
+ * we determine what states can remain active. |
|
483 |
+ * The FSM transition rules are expressed as bit-masks */ |
|
484 |
+static long SO_search(const struct filter *m, const unsigned char *data, unsigned long len) |
|
485 |
+{ |
|
486 |
+ size_t j; |
|
487 |
+ uint32_t state = ~0; |
|
488 |
+ const uint32_t *B = m->B; |
|
489 |
+ const uint32_t *End = m->end; |
|
490 |
+ const uint32_t *EndFast = m->end_fast; |
|
491 |
+ |
|
492 |
+ if(!len) return -1; |
|
493 |
+ /* Shift-Or like search algorithm */ |
|
494 |
+ for(j=0;j < len-1; j++) { |
|
495 |
+ const uint16_t q0 = cli_readint16( &data[j] ); |
|
496 |
+ uint32_t match_end; |
|
497 |
+ state = (state << 1) | B[q0]; |
|
498 |
+ /* state marks with a 0 bit all active states |
|
499 |
+ * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern |
|
500 |
+ * if we got two 0's at matching positions, it means we encountered a pattern's end */ |
|
501 |
+ match_end = state | EndFast[data[j+1]]; |
|
502 |
+ if((match_end != 0xffffffff) && (state | End[q0]) != 0xffffffff) { |
|
503 |
+ /* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if |
|
504 |
+ * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working |
|
505 |
+ * set inside L2 */ |
|
506 |
+ |
|
507 |
+ /* if state is reachable, and this character can finish a pattern, assume match */ |
|
508 |
+ /* to reduce false positives check if qgram can finish the pattern */ |
|
509 |
+ /* return position of probable match */ |
|
510 |
+ /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the |
|
511 |
+ * longest pattern that could match */ |
|
512 |
+ return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0; |
|
513 |
+ } |
|
514 |
+ } |
|
515 |
+ /* no match */ |
|
516 |
+ return -1; |
|
517 |
+} |
|
518 |
+ |
|
519 |
+/* ----------------------------------------------------------- */ |
|
190 | 520 |
|
191 |
-/* Prototypes */ |
|
192 |
-static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly); |
|
193 |
-static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info); |
|
194 |
-static void destroy_tree(struct regex_matcher* matcher); |
|
195 |
-static struct tree_node* tree_root_alloc(void); |
|
196 |
-static int build_regex_list(struct regex_matcher* matcher); |
|
197 |
-static void stack_destroy(struct node_stack* stack); |
|
198 |
- |
|
199 |
-#ifndef NDEBUG |
|
200 |
-void dump_tree(struct tree_node* root); |
|
201 |
-#endif |
|
202 | 521 |
|
203 | 522 |
#define MATCH_SUCCESS 0 |
204 | 523 |
#define MATCH_FAILED -1 |
... | ... |
@@ -233,6 +554,43 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info |
233 | 233 |
return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0]; |
234 | 234 |
} |
235 | 235 |
|
236 |
+static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url) |
|
237 |
+{ |
|
238 |
+ char c; |
|
239 |
+ const char *matched; |
|
240 |
+ size_t match_len; |
|
241 |
+ |
|
242 |
+ if(!regex || !regex->pattern) |
|
243 |
+ return 0; |
|
244 |
+ match_len = strlen(regex->pattern); |
|
245 |
+ if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') && |
|
246 |
+ (match_len == buffer_len || /* full match */ |
|
247 |
+ (match_len < buffer_len && |
|
248 |
+ ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) |
|
249 |
+ /* subdomain matched*/)) { |
|
250 |
+ cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern); |
|
251 |
+ cli_dbgmsg("Before inserting .: %s\n", orig_real_url); |
|
252 |
+ if(real_len >= match_len + 1) { |
|
253 |
+ const size_t pos = real_len - match_len - 1; |
|
254 |
+ if(real_url[pos] != '.') { |
|
255 |
+ /* we need to shift left, and insert a '.' |
|
256 |
+ * we have an extra '.' at the beginning inserted by get_host to have room, |
|
257 |
+ * orig_real_url has to be used here, |
|
258 |
+ * because we want to overwrite that extra '.' */ |
|
259 |
+ size_t orig_real_len = strlen(orig_real_url); |
|
260 |
+ cli_dbgmsg("No dot here:%s\n",real_url+pos); |
|
261 |
+ real_url = orig_real_url; |
|
262 |
+ memmove(real_url, real_url+1, orig_real_len-match_len-1); |
|
263 |
+ real_url[orig_real_len-match_len-1]='.'; |
|
264 |
+ cli_dbgmsg("After inserting .: %s\n", real_url); |
|
265 |
+ } |
|
266 |
+ } |
|
267 |
+ return 1; |
|
268 |
+ } |
|
269 |
+ cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c); |
|
270 |
+ return 0; |
|
271 |
+} |
|
272 |
+ |
|
236 | 273 |
/* |
237 | 274 |
* @matcher - matcher structure to use |
238 | 275 |
* @real_url - href target |
... | ... |
@@ -246,24 +604,28 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info |
246 | 246 |
* Do not send NULL pointers to this function!! |
247 | 247 |
* |
248 | 248 |
*/ |
249 |
-int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist) |
|
249 |
+int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist) |
|
250 | 250 |
{ |
251 | 251 |
char* orig_real_url = real_url; |
252 |
- massert(matcher); |
|
253 |
- massert(real_url); |
|
254 |
- massert(display_url); |
|
255 |
- massert(info); |
|
252 |
+ const char *vinfo; |
|
253 |
+ struct regex_list *regex; |
|
254 |
+ |
|
255 |
+ assert(matcher); |
|
256 |
+ assert(real_url); |
|
257 |
+ assert(display_url); |
|
258 |
+ *info = NULL; |
|
256 | 259 |
if(!matcher->list_inited) |
257 | 260 |
return 0; |
258 |
- massert(matcher->list_built); |
|
261 |
+ assert(matcher->list_built); |
|
259 | 262 |
/* skip initial '.' inserted by get_host */ |
260 | 263 |
if(real_url[0] == '.') real_url++; |
261 | 264 |
if(display_url[0] == '.') display_url++; |
262 | 265 |
{ |
263 | 266 |
size_t real_len = strlen(real_url); |
264 | 267 |
size_t display_len = strlen(display_url); |
265 |
- size_t buffer_len = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0); |
|
266 |
- char* buffer = cli_malloc(buffer_len+1); |
|
268 |
+ size_t buffer_len = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1; |
|
269 |
+ char *buffer = cli_malloc(buffer_len+1); |
|
270 |
+ char *bufrev; |
|
267 | 271 |
size_t i; |
268 | 272 |
int rc = 0; |
269 | 273 |
struct cli_ac_data mdata; |
... | ... |
@@ -272,61 +634,48 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di |
272 | 272 |
return CL_EMEM; |
273 | 273 |
|
274 | 274 |
strncpy(buffer,real_url,real_len); |
275 |
- buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':'; |
|
275 |
+ buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':'; |
|
276 | 276 |
if(!hostOnly || is_whitelist) { |
277 | 277 |
strncpy(buffer+real_len+1,display_url,display_len); |
278 |
- if(is_whitelist) |
|
279 |
- buffer[buffer_len - 1] = '/'; |
|
280 |
- buffer[buffer_len]=0; |
|
281 | 278 |
} |
279 |
+ buffer[buffer_len - 1] = '/'; |
|
280 |
+ buffer[buffer_len]=0; |
|
282 | 281 |
cli_dbgmsg("Looking up in regex_list: %s\n", buffer); |
283 | 282 |
|
284 |
- if(hostOnly) { |
|
285 |
- if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN))) |
|
286 |
- return rc; |
|
287 |
- rc = 0; |
|
288 |
- |
|
289 |
- for(i = 0; i < matcher->root_hosts_cnt; i++) { |
|
290 |
- /* doesn't need to match terminating \0*/ |
|
291 |
- rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL); |
|
292 |
- cli_ac_freedata(&mdata); |
|
293 |
- if(rc) { |
|
294 |
- char c; |
|
295 |
- const char* matched = strchr(*info,':'); |
|
296 |
- const size_t match_len = matched ? strlen(matched+1) : 0; |
|
297 |
- if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') && |
|
298 |
- (match_len == buffer_len || /* full match */ |
|
299 |
- (match_len < buffer_len && |
|
300 |
- ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) |
|
301 |
- /* subdomain matched*/)) { |
|
302 |
- |
|
303 |
- cli_dbgmsg("Got a match: %s with %s\n", buffer, *info); |
|
304 |
- cli_dbgmsg("Before inserting .: %s\n", orig_real_url); |
|
305 |
- if(real_len >= match_len + 1) { |
|
306 |
- const size_t pos = real_len - match_len - 1; |
|
307 |
- if(real_url[pos] != '.') { |
|
308 |
- /* we need to shift left, and insert a '.' |
|
309 |
- * we have an extra '.' at the beginning inserted by get_host to have room, |
|
310 |
- * orig_real_url has to be used here, |
|
311 |
- * because we want to overwrite that extra '.' */ |
|
312 |
- size_t orig_real_len = strlen(orig_real_url); |
|
313 |
- cli_dbgmsg("No dot here:%s\n",real_url+pos); |
|
314 |
- real_url = orig_real_url; |
|
315 |
- memmove(real_url, real_url+1, orig_real_len-match_len-1); |
|
316 |
- real_url[orig_real_len-match_len-1]='.'; |
|
317 |
- cli_dbgmsg("After inserting .: %s\n", real_url); |
|
318 |
- } |
|
319 |
- } |
|
320 |
- break; |
|
321 |
- } |
|
322 |
- cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c); |
|
323 |
- rc=0; |
|
283 |
+ if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN))) |
|
284 |
+ return rc; |
|
285 |
+ |
|
286 |
+ bufrev = cli_strdup(buffer); |
|
287 |
+ if(!bufrev) |
|
288 |
+ return CL_EMEM; |
|
289 |
+ reverse_string(bufrev); |
|
290 |
+ rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1; |
|
291 |
+ if(!rc) { |
|
292 |
+ /* filter says this suffix doesn't match. |
|
293 |
+ * The filter has false positives, but no false |
|
294 |
+ * negatives */ |
|
295 |
+ return 0; |
|
296 |
+ } |
|
297 |
+ |
|
298 |
+ rc = cli_ac_scanbuff((unsigned char*)bufrev,buffer_len, &vinfo, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL); |
|
299 |
+ cli_ac_freedata(&mdata); |
|
300 |
+ |
|
301 |
+ if(rc) { |
|
302 |
+ /* TODO loop over multiple virusnames here */ |
|
303 |
+ regex = (struct regex_list*)vinfo; |
|
304 |
+ do { |
|
305 |
+ /* loop over multiple regexes corresponding to |
|
306 |
+ * this suffix */ |
|
307 |
+ if (!regex->preg.re_magic) { |
|
308 |
+ /* we matched a static pattern */ |
|
309 |
+ rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url); |
|
310 |
+ } else { |
|
311 |
+ rc = !cli_regexec(&regex->preg, buffer, 0, NULL, 0); |
|
324 | 312 |
} |
325 |
- } |
|
326 |
- } else |
|
327 |
- rc = 0; |
|
328 |
- if(!rc) |
|
329 |
- rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS; |
|
313 |
+ if(rc) *info = regex->pattern; |
|
314 |
+ regex = regex->nxt; |
|
315 |
+ } while(!rc && regex); |
|
316 |
+ } |
|
330 | 317 |
free(buffer); |
331 | 318 |
if(!rc) |
332 | 319 |
cli_dbgmsg("Lookup result: not in regex list\n"); |
... | ... |
@@ -336,56 +685,6 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di |
336 | 336 |
} |
337 | 337 |
} |
338 | 338 |
|
339 |
-/* node stack */ |
|
340 |
-#define NODE_STACK_INITIAL 1024 |
|
341 |
-#define NODE_STACK_GROW 4096 |
|
342 |
-/* Initialize @stack */ |
|
343 |
-static int stack_init(struct node_stack* stack) |
|
344 |
-{ |
|
345 |
- massert(stack); |
|
346 |
- |
|
347 |
- stack->cnt = 0; |
|
348 |
- stack->capacity = NODE_STACK_INITIAL; |
|
349 |
- stack->data = cli_malloc(stack->capacity * sizeof(*stack->data)); |
|
350 |
- if(!stack->data) |
|
351 |
- return CL_EMEM; |
|
352 |
- else |
|
353 |
- return CL_SUCCESS; |
|
354 |
-} |
|
355 |
- |
|
356 |
-/* Reset @stack pointer, but don't realloc */ |
|
357 |
-static void stack_reset(struct node_stack* stack) |
|
358 |
-{ |
|
359 |
- massert(stack); |
|
360 |
- |
|
361 |
- stack->cnt = 0; |
|
362 |
-} |
|
363 |
- |
|
364 |
-/* Push @node on @stack, growing it if necessarry */ |
|
365 |
-static int stack_push(struct node_stack* stack,struct tree_node* node) |
|
366 |
-{ |
|
367 |
- massert(stack); |
|
368 |
- massert(stack->data); |
|
369 |
- |
|
370 |
- if(stack->cnt == stack->capacity) { |
|
371 |
- stack->capacity += NODE_STACK_GROW; |
|
372 |
- stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data)); |
|
373 |
- if(!stack->data) |
|
374 |
- return CL_EMEM; |
|
375 |
- } |
|
376 |
- stack->data[stack->cnt++] = node; |
|
377 |
- return CL_SUCCESS; |
|
378 |
-} |
|
379 |
- |
|
380 |
-/* Pops node from @stack, doesn't realloc */ |
|
381 |
-static struct tree_node* stack_pop(struct node_stack* stack) |
|
382 |
-{ |
|
383 |
- massert(stack); |
|
384 |
- massert(stack->data); |
|
385 |
- massert(stack->cnt);/*don't pop from empty stack */ |
|
386 |
- |
|
387 |
- return stack->cnt ? stack->data[--stack->cnt] : NULL; |
|
388 |
-} |
|
389 | 339 |
|
390 | 340 |
/* Initialization & loading */ |
391 | 341 |
/* Initializes @matcher, allocating necessary substructures */ |
... | ... |
@@ -393,90 +692,21 @@ int init_regex_list(struct regex_matcher* matcher) |
393 | 393 |
{ |
394 | 394 |
int rc; |
395 | 395 |
|
396 |
- massert(matcher); |
|
397 |
- matcher->list_inited = 0; |
|
398 |
- matcher->root_hosts_cnt = 0; |
|
399 |
- matcher->root_hosts = NULL; |
|
400 |
- matcher->root_hosts_cnt = 0; |
|
401 |
- |
|
402 |
- matcher->root_regex = tree_root_alloc(); |
|
403 |
- if(!matcher->root_regex) { |
|
404 |
- return CL_EMEM; |
|
405 |
- } |
|
406 |
- |
|
407 |
- matcher->root_regex_hostonly = tree_root_alloc(); |
|
408 |
- if(!matcher->root_regex_hostonly) { |
|
409 |
- free(matcher->root_regex); |
|
410 |
- return CL_EMEM; |
|
411 |
- } |
|
412 |
- |
|
413 |
- if(( rc = stack_init(&matcher->node_stack) )) { |
|
414 |
- free(matcher->root_regex_hostonly); |
|
415 |
- free(matcher->root_regex); |
|
416 |
- return rc; |
|
417 |
- } |
|
418 |
- if(( rc = stack_init(&matcher->node_stack_alt) )) { |
|
419 |
- free(matcher->root_regex_hostonly); |
|
420 |
- free(matcher->root_regex); |
|
421 |
- stack_destroy(&matcher->node_stack); |
|
422 |
- return rc; |
|
423 |
- } |
|
396 |
+ assert(matcher); |
|
397 |
+ memset(matcher, 0, sizeof(*matcher)); |
|
424 | 398 |
|
425 | 399 |
matcher->list_inited=1; |
426 |
- matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */ |
|
400 |
+ matcher->list_built=0; |
|
427 | 401 |
matcher->list_loaded=0; |
428 | 402 |
|
403 |
+ hashtab_init(&matcher->suffix_hash, 10); |
|
404 |
+ if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) { |
|
405 |
+ return rc; |
|
406 |
+ } |
|
407 |
+ SO_init(&matcher->filter); |
|
429 | 408 |
return CL_SUCCESS; |
430 | 409 |
} |
431 | 410 |
|
432 |
-/* inserts @pattern into @root, using ac-matcher |
|
433 |
- * although the name might be confusing, @pattern is not a regex!*/ |
|
434 |
-static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info) |
|
435 |
-{ |
|
436 |
- int ret; |
|
437 |
- struct cli_ac_patt *new = cli_calloc(1,sizeof(*new)); |
|
438 |
- size_t len,i; |
|
439 |
- |
|
440 |
- if(!new) |
|
441 |
- return CL_EMEM; |
|
442 |
- massert(root); |
|
443 |
- massert(pattern); |
|
444 |
- |
|
445 |
- len = strlen(pattern); |
|
446 |
- /* need not to match \0 too */ |
|
447 |
- new->rtype = 0; |
|
448 |
- new->type = 0; |
|
449 |
- new->sigid = 0; |
|
450 |
- new->parts = 0; |
|
451 |
- new->partno = 0; |
|
452 |
- new->mindist = 0; |
|
453 |
- new->maxdist = 0; |
|
454 |
- new->offset = 0; |
|
455 |
- new->target = 0; |
|
456 |
- new->length = len; |
|
457 |
- new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE; |
|
458 |
- if(new->length > root->maxpatlen) |
|
459 |
- root->maxpatlen = new->length; |
|
460 |
- |
|
461 |
- new->pattern = cli_malloc(sizeof(new->pattern[0])*len); |
|
462 |
- if(!new->pattern) { |
|
463 |
- free(new); |
|
464 |
- return CL_EMEM; |
|
465 |
- } |
|
466 |
- for(i=0;i<len;i++) |
|
467 |
- new->pattern[i]=pattern[i];/*new->pattern is short int* */ |
|
468 |
- |
|
469 |
- |
|
470 |
- new->virname = cli_strdup(info); |
|
471 |
- if((ret = cli_ac_addpatt(root,new))) { |
|
472 |
- free(new->virname); |
|
473 |
- free(new->pattern); |
|
474 |
- free(new); |
|
475 |
- return ret; |
|
476 |
- } |
|
477 |
- return CL_SUCCESS; |
|
478 |
-} |
|
479 |
- |
|
480 | 411 |
static int functionality_level_check(char* line) |
481 | 412 |
{ |
482 | 413 |
char* ptmin; |
... | ... |
@@ -527,14 +757,10 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio |
527 | 527 |
int rc,line=0; |
528 | 528 |
char buffer[FILEBUFF]; |
529 | 529 |
|
530 |
- massert(matcher); |
|
530 |
+ assert(matcher); |
|
531 | 531 |
|
532 | 532 |
if(matcher->list_inited==-1) |
533 | 533 |
return CL_EMALFDB; /* already failed to load */ |
534 |
-/* if(matcher->list_loaded) { |
|
535 |
- cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n"); |
|
536 |
- return CL_SUCCESS; |
|
537 |
- }*/ |
|
538 | 534 |
if(!fd && !dbio) { |
539 | 535 |
cli_errmsg("Unable to load regex list (null file)\n"); |
540 | 536 |
return CL_EIO; |
... | ... |
@@ -548,7 +774,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio |
548 | 548 |
fatal_error(matcher); |
549 | 549 |
return rc; |
550 | 550 |
} |
551 |
- /*atexit(regex_list_done); TODO: destroy this in manager.c */ |
|
552 | 551 |
} |
553 | 552 |
/* |
554 | 553 |
* Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files: |
... | ... |
@@ -573,11 +798,13 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio |
573 | 573 |
while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) { |
574 | 574 |
char* pattern; |
575 | 575 |
char* flags; |
576 |
+ size_t pattern_len; |
|
577 |
+ |
|
576 | 578 |
cli_chomp(buffer); |
577 | 579 |
if(!*buffer) |
578 | 580 |
continue;/* skip empty lines */ |
579 | 581 |
|
580 |
- if(functionality_level_check(buffer)) |
|
582 |
+ if(functionality_level_check(buffer)) |
|
581 | 583 |
continue; |
582 | 584 |
|
583 | 585 |
line++; |
... | ... |
@@ -591,83 +818,39 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio |
591 | 591 |
flags = buffer+1; |
592 | 592 |
pattern++; |
593 | 593 |
|
594 |
- if(is_whitelist) { |
|
595 |
- const size_t pattern_len = strlen(pattern); |
|
596 |
- if(pattern_len < FILEBUFF) { |
|
597 |
- pattern[pattern_len] = '/'; |
|
598 |
- pattern[pattern_len+1] = '\0'; |
|
599 |
- } |
|
600 |
- else { |
|
601 |
- cli_errmsg("Overlong regex line %d\n",line); |
|
602 |
- fatal_error(matcher); |
|
603 |
- return CL_EMALFDB; |
|
604 |
- } |
|
594 |
+ pattern_len = strlen(pattern); |
|
595 |
+ if(pattern_len < FILEBUFF) { |
|
596 |
+ pattern[pattern_len] = '/'; |
|
597 |
+ pattern[pattern_len+1] = '\0'; |
|
598 |
+ } |
|
599 |
+ else { |
|
600 |
+ cli_errmsg("Overlong regex line %d\n",line); |
|
601 |
+ fatal_error(matcher); |
|
602 |
+ return CL_EMALFDB; |
|
605 | 603 |
} |
606 | 604 |
|
607 |
- if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/ |
|
608 |
- if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') )) |
|
605 |
+ if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) { |
|
606 |
+ /* regex for hostname*/ |
|
607 |
+ if (( rc = add_pattern(matcher, pattern) )) |
|
609 | 608 |
return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; |
610 | 609 |
} |
611 |
- else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/ |
|
612 |
- struct cli_matcher* root; |
|
613 |
- if(matcher->list_built) { |
|
614 |
- struct cli_matcher* old_hosts = matcher->root_hosts; |
|
615 |
- matcher->root_hosts_cnt++; |
|
616 |
- |
|
617 |
- matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts)); |
|
618 |
- if(!matcher->root_hosts) { |
|
619 |
- matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/ |
|
620 |
- return CL_EMEM; |
|
621 |
- } |
|
622 |
- |
|
623 |
- root = &matcher->root_hosts[matcher->root_hosts_cnt-1]; |
|
624 |
- memset(root, 0, sizeof(struct cli_matcher)); |
|
625 |
- |
|
626 |
- cli_dbgmsg("regex_list: Initialising AC pattern matcher\n"); |
|
627 |
- if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) { |
|
628 |
- /* no need to free previously allocated memory here */ |
|
629 |
- cli_errmsg("regex_list: Can't initialise AC pattern matcher\n"); |
|
630 |
- return rc; |
|
631 |
- } |
|
632 |
- matcher->list_built = 0; |
|
633 |
- } |
|
634 |
- else { |
|
635 |
- root = &matcher->root_hosts[matcher->root_hosts_cnt-1]; |
|
636 |
- } |
|
637 |
- if(( rc = add_regex_list_element(root,pattern,flags) )) |
|
610 |
+ else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) { |
|
611 |
+ /*matches displayed host*/ |
|
612 |
+ if (( rc = add_static_pattern(matcher, pattern) )) |
|
638 | 613 |
return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; |
639 | 614 |
} |
640 | 615 |
else { |
641 | 616 |
return CL_EMALFDB; |
642 |
- /* this is useless, we have host, and regex matches |
|
643 |
- if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) )) |
|
644 |
- return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/ |
|
645 | 617 |
} |
646 | 618 |
} |
647 | 619 |
matcher->list_loaded = 1; |
648 |
- if(( rc = build_regex_list(matcher) )) |
|
649 |
- return rc; |
|
650 | 620 |
|
651 |
-#ifndef NDEBUG |
|
652 |
-/* dump_tree(matcher->root_regex);*/ |
|
653 |
-#endif |
|
654 |
- if(!matcher->list_built) { |
|
655 |
- cli_errmsg("Regex list not loaded: build failed!\n"); |
|
656 |
- fatal_error(matcher); |
|
657 |
- return CL_EMALFDB; |
|
658 |
- } |
|
659 |
- regex_list_cleanup(matcher); |
|
660 | 621 |
return CL_SUCCESS; |
661 | 622 |
} |
662 | 623 |
|
663 | 624 |
|
664 |
-static struct tree_node ** tree_node_get_children(const struct tree_node* node) |
|
665 |
-{ |
|
666 |
- return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children; |
|
667 |
-} |
|
668 |
- |
|
669 | 625 |
/* Build the matcher list */ |
670 |
-static int build_regex_list(struct regex_matcher* matcher) |
|
626 |
+int cli_build_regex_list(struct regex_matcher* matcher) |
|
671 | 627 |
{ |
672 | 628 |
int rc; |
673 | 629 |
if(!matcher->list_inited || !matcher->list_loaded) { |
... | ... |
@@ -675,9 +858,9 @@ static int build_regex_list(struct regex_matcher* matcher) |
675 | 675 |
return -1;/*TODO: better error code */ |
676 | 676 |
} |
677 | 677 |
cli_dbgmsg("Building regex list\n"); |
678 |
- if(matcher->root_hosts) |
|
679 |
- if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) )) |
|
680 |
- return rc; |
|
678 |
+ hashtab_free(&matcher->suffix_hash); |
|
679 |
+ if(( rc = cli_ac_buildtrie(&matcher->suffixes) )) |
|
680 |
+ return rc; |
|
681 | 681 |
matcher->list_built=1; |
682 | 682 |
|
683 | 683 |
return CL_SUCCESS; |
... | ... |
@@ -686,864 +869,193 @@ static int build_regex_list(struct regex_matcher* matcher) |
686 | 686 |
/* Done with this matcher, free resources */ |
687 | 687 |
void regex_list_done(struct regex_matcher* matcher) |
688 | 688 |
{ |
689 |
- massert(matcher); |
|
689 |
+ assert(matcher); |
|
690 | 690 |
|
691 |
- regex_list_cleanup(matcher); |
|
692 | 691 |
if(matcher->list_loaded) { |
693 |
- if(matcher->root_hosts) { |
|
694 |
- size_t i; |
|
695 |
- for(i=0;i<matcher->root_hosts_cnt;i++) |
|
696 |
- cli_ac_free(&matcher->root_hosts[i]); |
|
697 |
- free(matcher->root_hosts); |
|
698 |
- matcher->root_hosts=NULL; |
|
692 |
+ size_t i; |
|
693 |
+ /* TODO: call it, but be sure it won't free virname */ |
|
694 |
+ //cli_ac_free(&matcher->suffixes); |
|
695 |
+ if(matcher->suffix_regexes) { |
|
696 |
+ for(i=0;i<matcher->suffix_cnt;i++) { |
|
697 |
+ struct regex_list *r = matcher->suffix_regexes[i]; |
|
698 |
+ while(r) { |
|
699 |
+ cli_regfree(&r->preg); |
|
700 |
+ r = r->nxt; |
|
701 |
+ } |
|
702 |
+ } |
|
703 |
+ free(matcher->suffix_regexes); |
|
704 |
+ matcher->suffix_regexes = NULL; |
|
699 | 705 |
} |
700 |
- |
|
701 |
- matcher->root_hosts_cnt=0; |
|
706 |
+ hashtab_free(&matcher->suffix_hash); |
|
702 | 707 |
matcher->list_built=0; |
703 |
- destroy_tree(matcher); |
|
704 | 708 |
matcher->list_loaded=0; |
705 | 709 |
} |
706 | 710 |
if(matcher->list_inited) { |
707 | 711 |
matcher->list_inited=0; |
708 | 712 |
} |
709 |
- stack_destroy(&matcher->node_stack); |
|
710 |
- stack_destroy(&matcher->node_stack_alt); |
|
711 | 713 |
} |
712 | 714 |
|
713 |
-/* Tree matcher algorithm */ |
|
714 |
-struct token_t |
|
715 |
-{ |
|
716 |
- union { |
|
717 |
- const unsigned char* start; |
|
718 |
- char_bitmap_p bitmap; |
|
719 |
- unsigned char c; |
|
720 |
- } u; |
|
721 |
- size_t len; |
|
722 |
- char type; |
|
723 |
-}; |
|
724 |
- |
|
725 |
-enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE}; |
|
726 |
- |
|
727 |
-static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token) |
|
728 |
-{ |
|
729 |
- massert(pat); |
|
730 |
- massert(token); |
|
731 |
- |
|
732 |
- switch(*pat) { |
|
733 |
- case '\\': |
|
734 |
- token->type=TOKEN_CHAR; |
|
735 |
- token->u.c = *(++pat); |
|
736 |
- if(islower(token->u.c)) { |
|
737 |
- /* handle \n, \t, etc. */ |
|
738 |
- char fmt[3] = {'\\', '\0', '\0'}; |
|
739 |
- char c; |
|
740 |
- |
|
741 |
- fmt[1] = token->u.c; |
|
742 |
- if(snprintf(&c,1,fmt)!=1) { |
|
743 |
- token->type=TOKEN_REGEX; |
|
744 |
- token->u.start = pat; |
|
745 |
- } |
|
746 |
- else |
|
747 |
- token->u.c=c; |
|
748 |
- } |
|
749 |
- token->len = 1; |
|
750 |
- break; |
|
751 |
- case '|': |
|
752 |
- token->type=TOKEN_ALT; |
|
753 |
- break; |
|
754 |
- case '*': |
|
755 |
- case '+': |
|
756 |
- case '?': |
|
757 |
- case '{': |
|
758 |
- case '}': |
|
759 |
- token->type=TOKEN_REGEX; |
|
760 |
- break; |
|
761 |
- case '[': |
|
762 |
- { |
|
763 |
- /*TODO: implement*/ |
|
764 |
- /*see if it is something simple like a list of characters, a range, or negated ...*/ |
|
765 |
- const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/ |
|
766 |
- unsigned char range_start=0; |
|
767 |
- int hasprev = 0; |
|
768 |
- char_bitmap_p bitmap = cli_malloc(32); |
|
769 |
- if(!bitmap) |
|
770 |
- return NULL; |
|
771 |
- if (*pat=='^') { |
|
772 |
- memset(bitmap,0xFF,32);/*match chars not in brackets*/ |
|
773 |
- pat++; |
|
774 |
- } |
|
775 |
- else |
|
776 |
- memset(bitmap,0x00,32); |
|
777 |
- do { |
|
778 |
- /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ |
|
779 |
- if (*pat=='-' && hasprev) { |
|
780 |
- /* it is a range*/ |
|
781 |
- unsigned char range_end; |
|
782 |
- unsigned int c; |
|
783 |
- massert(range_start); |
|
784 |
- pat++; |
|
785 |
- if (pat[0]=='[') |
|
786 |
- if (pat[1]=='.') { |
|
787 |
- if(pat[2]=='-' && pat[3]=='.' && pat[4]==']') |
|
788 |
- range_end = '-'; |
|
789 |
- else { |
|
790 |
- /* this is getting complicated, bail out */ |
|
791 |
- cli_warnmsg("confused about collating sequences in regex,bailing out"); |
|
792 |
- pat=old; |
|
793 |
- token->type=TOKEN_REGEX; |
|
794 |
- break; |
|
795 |
- } |
|
796 |
- } |
|
797 |
- else |
|
798 |
- range_end = *pat; |
|
799 |
- else |
|
800 |
- range_end = *pat; |
|
801 |
- for(c=range_start+1;c<=range_end;c++) |
|
802 |
- bitmap[c>>3] ^= 1<<(c&0x7); |
|
803 |
- hasprev = 0; |
|
804 |
- } |
|
805 |
- else if (pat[0]=='[' && pat[1]==':') { |
|
806 |
- const unsigned char* end; |
|
807 |
- int len,found=-1; |
|
808 |
- size_t i; |
|
809 |
- |
|
810 |
- pat+=2; |
|
811 |
- end=(unsigned char*)strstr((const char*)pat,":]"); |
|
812 |
- if(!end) { |
|
813 |
- cli_warnmsg("confused about std char class syntax regex,bailing out"); |
|
814 |
- pat=old; |
|
815 |
- token->type=TOKEN_REGEX; |
|
816 |
- break; |
|
817 |
- } |
|
818 |
- |
|
819 |
- len = end-pat; |
|
820 |
- for(i=0;i<std_class_cnt;i++) |
|
821 |
- if(!strncmp((const char*)pat,std_class[i],len)) { |
|
822 |
- found=i; |
|
823 |
- break; |
|
824 |
- } |
|
825 |
- if(found!=-1) { |
|
826 |
- for(i=0;i<256;i++) |
|
827 |
- if(char_class[i]&(1<<found)) |
|
828 |
- bitmap[i>>3] ^= 1<<(i&0x7); |
|
829 |
- } |
|
830 |
- else { |
|
831 |
- /*unknown class*/ |
|
832 |
- cli_warnmsg("confused about regex bracket expression, bailing out"); |
|
833 |
- pat=old; |
|
834 |
- token->type=TOKEN_REGEX; |
|
835 |
- break; |
|
836 |
- } |
|
837 |
- } |
|
838 |
- else { |
|
839 |
- bitmap[*pat>>3] ^= 1<<(*pat&0x7); |
|
840 |
- pat++; |
|
841 |
- range_start = *pat; |
|
842 |
- hasprev = 1; |
|
843 |
- } |
|
844 |
- } while(*pat!=']'); |
|
845 |
- /*TODO: see if this bitmap already exists, then reuse*/ |
|
846 |
- token->type = TOKEN_BRACKET; |
|
847 |
- token->u.bitmap = bitmap; |
|
848 |
- break; |
|
849 |
- } |
|
850 |
- case ']': |
|
851 |
- massert(0 && "Encountered ] without matching ["); |
|
852 |
- /* bad state */ |
|
853 |
- break; |
|
854 |
- case '.': |
|
855 |
- token->type=TOKEN_DOT; |
|
856 |
- break; |
|
857 |
- case '(': |
|
858 |
- token->type=TOKEN_PAR_OPEN; |
|
859 |
- break; |
|
860 |
- case ')': |
|
861 |
- token->type=TOKEN_PAR_CLOSE; |
|
862 |
- break; |
|
863 |
- default: |
|
864 |
- token->type=TOKEN_CHAR; |
|
865 |
- token->u.c = *pat; |
|
866 |
- token->len=1; |
|
867 |
- break; |
|
868 |
- } |
|
869 |
- return ++pat; |
|
870 |
-} |
|
871 |
- |
|
872 |
-#define INITIAL_ALT_STACK 10 |
|
873 |
-#define ALT_STACK_GROW 20 |
|
874 |
- |
|
875 |
-static const unsigned char* find_regex_start(const unsigned char* pat) |
|
715 |
+int is_regex_ok(struct regex_matcher* matcher) |
|
876 | 716 |
{ |
877 |
- struct token_t token; |
|
878 |
- /*TODO: find where the regex part begins, for ex: |
|
879 |
- * abcd+, regex begins at 'd' |
|
880 |
- * */ |
|
881 |
- const unsigned char* last=NULL; |
|
882 |
- const unsigned char* tmp=NULL; |
|
883 |
- const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions)); |
|
884 |
- size_t altpositions_capacity = INITIAL_ALT_STACK; |
|
885 |
- size_t altpositions_cnt = 0; |
|
886 |
- char lasttype = -1; |
|
887 |
- if(!altpositions) |
|
888 |
- return NULL; |
|
889 |
- massert(pat); |
|
890 |
- |
|
891 |
- /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}. |
|
892 |
- * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to |
|
893 |
- * back up to the last known good position |
|
894 |
- * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (. |
|
895 |
- * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz |
|
896 |
- * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (? |
|
897 |
- * */ |
|
898 |
- do { |
|
899 |
- tmp = pat; |
|
900 |
- pat = getNextToken(pat,&token); |
|
901 |
- if(token.type!=TOKEN_REGEX) { |
|
902 |
- last = tmp; |
|
903 |
- lasttype = token.type; |
|
904 |
- if(token.type==TOKEN_BRACKET && token.u.bitmap) |
|
905 |
- free(token.u.bitmap); |
|
906 |
- if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) { |
|
907 |
- /* save this position on stack, succesfully parsed till here*/ |
|
908 |
- if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|') |
|
909 |
- /* encountered another alternate (|) operator, override previous | position stored */ |
|
910 |
- altpositions[altpositions_cnt-1]=last; |
|
911 |
- else { |
|
912 |
- altpositions[altpositions_cnt++] = last; |
|
913 |
- if(altpositions_cnt == altpositions_capacity) { |
|
914 |
- altpositions_capacity += ALT_STACK_GROW; |
|
915 |
- altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions)); |
|
916 |
- if(!altpositions) |
|
917 |
- return NULL; |
|
918 |
- } |
|
919 |
- } |
|
920 |
- } else if (lasttype==TOKEN_PAR_CLOSE) { |
|
921 |
- /* remove last stored position from stack, succesfully this last group */ |
|
922 |
- altpositions_cnt--; |
|
923 |
- massert(altpositions_cnt>0); |
|
924 |
- } |
|
925 |
- } |
|
926 |
- else { |
|
927 |
- if(altpositions_cnt) |
|
928 |
- last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */ |
|
929 |
- /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/ |
|
930 |
- } |
|
931 |
- } while(*pat && token.type!=TOKEN_REGEX); |
|
932 |
- free(altpositions); |
|
933 |
- return *pat ? last : last+1; |
|
717 |
+ assert(matcher); |
|
718 |
+ return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */ |
|
934 | 719 |
} |
935 | 720 |
|
936 |
-static struct tree_node* tree_node_alloc(struct tree_node* next,char listend) |
|
721 |
+static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, char *suffix, size_t len) |
|
937 | 722 |
{ |
938 |
- struct tree_node* node = cli_malloc(sizeof(*node)); |
|
939 |
- if(node) { |
|
940 |
- node->alternatives=0; |
|
941 |
- node->next=next; |
|
942 |
- node->listend=listend; |
|
943 |
- node->u.children=NULL; |
|
944 |
- } |
|
945 |
- return node; |
|
946 |
-} |
|
723 |
+ struct cli_matcher *root = &matcher->suffixes; |
|
724 |
+ struct cli_ac_patt *new = cli_calloc(1,sizeof(*new)); |
|
725 |
+ size_t i; |
|
726 |
+ int ret; |
|
947 | 727 |
|
948 |
-static struct tree_node* tree_root_alloc(void) |
|
949 |
-{ |
|
950 |
- struct tree_node* root=tree_node_alloc(NULL,1); |
|
951 |
- if(root) { |
|
952 |
- root->op=OP_ROOT; |
|
953 |
- root->c=0; |
|
954 |
- root->next=NULL; |
|
955 |
- root->listend=1; |
|
728 |
+ if(!new) |
|
729 |
+ return CL_EMEM; |
|
730 |
+ assert(root && suffix); |
|
731 |
+ |
|
732 |
+ new->rtype = 0; |
|
733 |
+ new->type = 0; |
|
734 |
+ new->sigid = 0; |
|
735 |
+ new->parts = 0; |
|
736 |
+ new->partno = 0; |
|
737 |
+ new->mindist = 0; |
|
738 |
+ new->maxdist = 0; |
|
739 |
+ new->offset = 0; |
|
740 |
+ new->target = 0; |
|
741 |
+ new->length = len; |
|
742 |
+ |
|
743 |
+ new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE; |
|
744 |
+ if(new->length > root->maxpatlen) |
|
745 |
+ root->maxpatlen = new->length; |
|
746 |
+ |
|
747 |
+ new->pattern = cli_malloc(sizeof(new->pattern[0])*len); |
|
748 |
+ if(!new->pattern) { |
|
749 |
+ free(new); |
|
750 |
+ return CL_EMEM; |
|
956 | 751 |
} |
957 |
- return root; |
|
958 |
-} |
|
959 |
- |
|
960 |
-static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left) |
|
961 |
-{ |
|
962 |
- int right; |
|
963 |
- struct tree_node **children; |
|
964 |
- massert(node); |
|
965 |
- massert(left); |
|
966 |
- |
|
967 |
- children = tree_node_get_children(node); |
|
968 |
- right = node->alternatives-1; |
|
969 |
- *left = 0; |
|
970 |
- if(!node->alternatives) |
|
971 |
- return NULL; |
|
972 |
- massert(children); |
|
973 |
- while(*left<=right) { |
|
974 |
- int mid = *left+(right-*left)/2; |
|
975 |
- if(children[mid]->c == csearch) |
|
976 |
- return children[mid]; |
|
977 |
- else if(children[mid]->c < csearch) |
|
978 |
- *left=mid+1; |
|
979 |
- else |
|
980 |
- right=mid-1; |
|
752 |
+ for(i=0;i<len;i++) |
|
753 |
+ new->pattern[i] = suffix[i];/*new->pattern is short int* */ |
|
754 |
+ |
|
755 |
+ new->virname = (char*)info; |
|
756 |
+ if((ret = cli_ac_addpatt(root,new))) { |
|
757 |
+ free(new->pattern); |
|
758 |
+ free(new); |
|
759 |
+ return ret; |
|
981 | 760 |
} |
982 |
- return NULL; |
|
983 |
-} |
|
984 |
- |
|
985 |
-static struct tree_node* tree_get_next(struct tree_node* node) |
|
986 |
-{ |
|
987 |
- struct tree_node** children; |
|
988 |
- massert(node); |
|
989 |
- children = tree_node_get_children(node); |
|
990 |
- |
|
991 |
- if(!node->alternatives && children && children[0]) |
|
992 |
- return children[0]; |
|
993 |
- else if(node->alternatives<=1) |
|
994 |
- return node; |
|
995 |
- else |
|
996 |
- return children[0]->next; |
|
761 |
+ SO_preprocess_add(&matcher->filter, suffix, len); |
|
762 |
+ return CL_SUCCESS; |
|
997 | 763 |
} |
998 | 764 |
|
999 |
-static size_t tree_node_get_array_size(const struct tree_node* node) |
|
1000 |
-{ |
|
1001 |
- massert(node); |
|
1002 |
- /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */ |
|
1003 |
- return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]); |
|
1004 |
-} |
|
765 |
+#define MODULE "regex_list: " |
|
766 |
+/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */ |
|
1005 | 767 |
|
1006 |
-static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left) |
|
768 |
+/* returns 0 on success, clamav error code otherwise */ |
|
769 |
+static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex) |
|
1007 | 770 |
{ |
1008 |
- struct tree_node* new, *alt = tree_get_next(node); |
|
1009 |
- struct tree_node **children; |
|
1010 |
- node->alternatives++; |
|
1011 |
- node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node)); |
|
1012 |
- if(!node->u.children) |
|
1013 |
- return NULL; |
|
1014 |
- |
|
1015 |
- children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children; |
|
1016 |
- |
|
1017 |
- new = tree_node_alloc(alt , node == alt ); |
|
1018 |
- if(new) { |
|
1019 |
- new->op=OP_CHAR; |
|
1020 |
- new->c=c; |
|
1021 |
- } |
|
1022 |
- |
|
1023 |
- if(node->alternatives-left-1>0) |
|
1024 |
- memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0])); |
|
1025 |
- children[left] = new; |
|
1026 |
- |
|
1027 |
- return new; |
|
1028 |
-} |
|
1029 |
- |
|
1030 |
-static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new) |
|
1031 |
-{ |
|
1032 |
- struct tree_node **children; |
|
1033 |
- massert(node); |
|
1034 |
- massert(new); |
|
1035 |
- |
|
1036 |
- children = tree_node_get_children(node); |
|
1037 |
- if(node->alternatives) { |
|
1038 |
- massert(children); |
|
1039 |
- if(children[0]->next == node) { |
|
1040 |
- int i; |
|
1041 |
- new->listend = 1; |
|
1042 |
- for(i=0;i<node->alternatives;i++) { |
|
1043 |
- children[i]->next = new; |
|
1044 |
- children[i]->listend = 0; |
|
1045 |
- } |
|
1046 |
- } |
|
1047 |
- else { |
|
1048 |
- struct tree_node* p; |
|
1049 |
- for(p = children[0]->next ; p->next != node ; p = p->next) |
|
1050 |
- massert(!p->listend); |
|
1051 |
- new->listend = 1; |
|
1052 |
- p->listend = 0; |
|
1053 |
- p->next = new; |
|
1054 |
- } |
|
1055 |
- } |
|
1056 |
- else { |
|
1057 |
- int idx = node->op==OP_CUSTOMCLASS ? 1 : 0; |
|
1058 |
- if(node->u.children) |
|
1059 |
- if(node->u.children[idx]) { |
|
1060 |
- node = node->u.children[idx]; |
|
1061 |
- while(node->next && !node->listend) |
|
1062 |
- node = node->next; |
|
1063 |
- node->listend = 0; |
|
1064 |
- new->next = node->next; |
|
1065 |
- node->next = new; |
|
1066 |
- new->listend=1; |
|
1067 |
- return; |
|
1068 |
- } |
|
1069 |
- node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2)); |
|
1070 |
- if(node->u.children) { |
|
1071 |
- node->u.children[idx] = new; |
|
1072 |
- } |
|
771 |
+ const struct element *el; |
|
772 |
+ |
|
773 |
+ assert(matcher); |
|
774 |
+ el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len); |
|
775 |
+ /* TODO: what if suffixes are prefixes of eachother and only one will |
|
776 |
+ * match? */ |
|
777 |
+ if(el) { |
|
778 |
+ /* existing suffix */ |
|
779 |
+ assert(el->data < matcher->suffix_cnt); |
|
780 |
+ regex->nxt = matcher->suffix_regexes[el->data]; |
|
781 |
+ matcher->suffix_regexes[el->data] = regex; |
|
782 |
+ cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern); |
|
783 |
+ } else { |
|
784 |
+ /* new suffix */ |
|
785 |
+ size_t n = matcher->suffix_cnt++; |
|
786 |
+ el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n); |
|
787 |
+ matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes)); |
|
788 |
+ if(!matcher->suffix_regexes) |
|
789 |
+ return CL_EMEM; |
|
790 |
+ matcher->suffix_regexes[n] = regex; |
|
791 |
+ add_newsuffix(matcher, regex, suffix, suffix_len); |
|
792 |
+ cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern); |
|
1073 | 793 |
} |
794 |
+ return 0; |
|
1074 | 795 |
} |
1075 | 796 |
|
1076 |
-static unsigned char char_getclass(const unsigned char* bitmap) |
|
797 |
+static size_t reverse_string(char *pattern) |
|
1077 | 798 |
{ |
799 |
+ size_t len = strlen(pattern); |
|
1078 | 800 |
size_t i; |
1079 |
- massert(bitmap); |
|
1080 |
- |
|
1081 |
- for(i=0;i<std_class_cnt;i++) |
|
1082 |
- if(!memcmp(bitmap,char_class_bitmap[i],256>>3)) |
|
1083 |
- return i; |
|
1084 |
- return std_class_cnt; |
|
1085 |
-} |
|
1086 |
- |
|
1087 |
-static void stack_destroy(struct node_stack* stack) |
|
1088 |
-{ |
|
1089 |
- massert(stack); |
|
1090 |
- if(stack->data) |
|
1091 |
- free(stack->data); |
|
1092 |
- stack->data = NULL; |
|
1093 |
- stack->capacity = 0; |
|
1094 |
-} |
|
1095 |
- |
|
1096 |
-/* call this after whitelist load is complete, and the tree is no longer going to be modified */ |
|
1097 |
-void regex_list_cleanup(struct regex_matcher* matcher) |
|
1098 |
-{ |
|
1099 |
- massert(matcher); |
|
1100 |
- |
|
1101 |
- stack_destroy(&matcher->node_stack); |
|
1102 |
- stack_destroy(&matcher->node_stack_alt); |
|
1103 |
- stack_init(&matcher->node_stack); |
|
1104 |
- stack_init(&matcher->node_stack_alt); |
|
1105 |
-} |
|
1106 |
- |
|
1107 |
-int is_regex_ok(struct regex_matcher* matcher) |
|
1108 |
-{ |
|
1109 |
- massert(matcher); |
|
1110 |
- return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */ |
|
801 |
+ for(i=0; i < (len/2); i++) { |
|
802 |
+ char aux = pattern[i]; |
|
803 |
+ pattern[i] = pattern[len-i-1]; |
|
804 |
+ pattern[len-i-1] = aux; |
|
805 |
+ } |
|
806 |
+ return len; |
|
1111 | 807 |
} |
1112 | 808 |
|
1113 |
-/* returns 0 on success, regexec error code otherwise */ |
|
1114 |
-static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly) |
|
809 |
+static int add_static_pattern(struct regex_matcher *matcher, char* pattern) |
|
1115 | 810 |
{ |
1116 |
- int bol=1; |
|
1117 |
- const unsigned char* pat_end = find_regex_start(pat); |
|
1118 |
- struct token_t token; |
|
1119 |
- struct tree_node* node; |
|
1120 |
- |
|
1121 |
- massert(matcher); |
|
1122 |
- |
|
1123 |
- node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex; |
|
1124 |
- |
|
1125 |
- stack_reset(&matcher->node_stack); |
|
1126 |
- stack_reset(&matcher->node_stack_alt); |
|
1127 |
- stack_push(&matcher->node_stack,node); |
|
1128 |
- |
|
1129 |
- for(;node->op!=OP_LEAF;){ |
|
1130 |
- if(pat<pat_end) |
|
1131 |
- pat = getNextToken(pat,&token); |
|
1132 |
- else if(*pat) { |
|
1133 |
- token.type = TOKEN_REGEX; |
|
1134 |
- token.u.start=pat; |
|
1135 |
- } |
|
1136 |
- else |
|
1137 |
- token.type = TOKEN_DONE; |
|
1138 |
- |
|
1139 |
- switch(token.type) { |
|
1140 |
- case TOKEN_CHAR: |
|
1141 |
- { |
|
1142 |
- /* search for char in tree */ |
|
1143 |
- int left; |
|
1144 |
- struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left); |
|
1145 |
- if(newnode) |
|
1146 |
- node = newnode; |
|
1147 |
- else { |
|
1148 |
- /* not found, insert it */ |
|
1149 |
- node = tree_node_char_insert(node,token.u.c,left); |
|
1150 |
- } |
|
1151 |
- break; |
|
1152 |
- } |
|
1153 |
- |
|
1154 |
- case TOKEN_PAR_OPEN: |
|
1155 |
- stack_push(&matcher->node_stack_alt,NULL);/* marker */ |
|
1156 |
- stack_push(&matcher->node_stack,node); |
|
1157 |
- break; |
|
1158 |
- |
|
1159 |
- case TOKEN_PAR_CLOSE: { |
|
1160 |
- /*TODO: test this!!!*/ |
|
1161 |
- struct tree_node* node_alt = node; |
|
1162 |
- node = tree_node_alloc(NULL,1); |
|
1163 |
- node->op=OP_PARCLOSE; |
|
1164 |
- node->c=0; |
|
1165 |
- node->listend=1; |
|
1166 |
- tree_node_insert_nonbin(node_alt,node); |
|
1167 |
- while (( node_alt = stack_pop(&matcher->node_stack_alt) )) { |
|
1168 |
- tree_node_insert_nonbin(node_alt,node); |
|
1169 |
- } |
|
1170 |
- stack_pop(&matcher->node_stack); |
|
1171 |
- break; |
|
1172 |
- } |
|
1173 |
- |
|
1174 |
- case TOKEN_ALT: |
|
1175 |
- stack_push(&matcher->node_stack_alt,node); |
|
1176 |
- node = stack_pop(&matcher->node_stack); |
|
1177 |
- stack_push(&matcher->node_stack,node); |
|
1178 |
- break; |
|
1179 |
- |
|
1180 |
- case TOKEN_BRACKET: |
|
1181 |
- { |
|
1182 |
- struct tree_node* new = tree_node_alloc(tree_get_next(node),1); |
|
1183 |
- unsigned char charclass = char_getclass(token.u.bitmap); |
|
1184 |
- if(charclass == std_class_cnt) {/*not a std char class*/ |
|
1185 |
- new->op = OP_CUSTOMCLASS; |
|
1186 |
- new->u.children = cli_malloc(sizeof(new->u.children[0])*2); |
|
1187 |
- if(!new->u.children) |
|
1188 |
- return CL_EMEM; |
|
1189 |
- new->u.bitmap[0] = token.u.bitmap; |
|
1190 |
- new->u.bitmap[1] = NULL; |
|
1191 |
- tree_node_insert_nonbin(node,new); |
|
1192 |
- node = new; |
|
1193 |
- } |
|
1194 |
- else { |
|
1195 |
- new->op = OP_STDCLASS; |
|
1196 |
- new->c = charclass; |
|
1197 |
- tree_node_insert_nonbin(node,new); |
|
1198 |
- node=new; |
|
1199 |
- } |
|
1200 |
- break; |
|
1201 |
- } |
|
1202 |
- |
|
1203 |
- case TOKEN_DOT: |
|
1204 |
- { |
|
1205 |
- struct tree_node* new = tree_node_alloc(tree_get_next(node),1); |
|
1206 |
- new->op = OP_DOT; |
|
1207 |
- tree_node_insert_nonbin(node,new); |
|
1208 |
- node=new; |
|
1209 |
- break; |
|
1210 |
- } |
|
1211 |
- |
|
1212 |
- case TOKEN_REGEX: |
|
1213 |
- case TOKEN_DONE: { |
|
1214 |
- struct leaf_info* leaf=cli_malloc(sizeof(*leaf)); |
|
1215 |
- if(!leaf) |
|
1216 |
- return CL_EMEM; |
|
1217 |
- leaf->info = cli_strdup(info); |
|
1218 |
- if(token.type==TOKEN_REGEX) { |
|
1219 |
- int rc; |
|
1220 |
- struct tree_node* new; |
|
1221 |
- regex_t* preg; |
|
1222 |
- preg=cli_malloc(sizeof(*preg)); |
|
1223 |
- if(!preg) |
|
1224 |
- return CL_EMEM; |
|
1225 |
- rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL)); |
|
1226 |
- leaf->preg=preg; |
|
1227 |
- if(rc) |
|
1228 |
- return rc; |
|
1229 |
- new=cli_malloc(sizeof(*new)); |
|
1230 |
- if(!new) |
|
1231 |
- return CL_EMEM; |
|
1232 |
- new->op=OP_LEAF; |
|
1233 |
- new->next=node; |
|
1234 |
- new->alternatives=0; |
|
1235 |
- new->u.leaf=leaf; |
|
1236 |
- new->listend=1; |
|
1237 |
- tree_node_insert_nonbin(node,new); |
|
1238 |
- } |
|
1239 |
- else { |
|
1240 |
- leaf->preg=NULL; |
|
1241 |
- node->alternatives=0; |
|
1242 |
- node->u.leaf=leaf; |
|
1243 |
- node->op=OP_LEAF; |
|
1244 |
- } |
|
1245 |
- return 0; |
|
1246 |
- } |
|
1247 |
- } |
|
1248 |
- |
|
1249 |
- bol=0; |
|
1250 |
- } |
|
1251 |
- return 0; |
|
811 |
+ size_t len; |
|
812 |
+ struct regex_list *regex = cli_malloc(sizeof(*regex)); |
|
813 |
+ if(!regex) |
|
814 |
+ return CL_EMEM; |
|
815 |
+ len = reverse_string(pattern); |
|
816 |
+ regex->nxt = NULL; |
|
817 |
+ regex->pattern = cli_strdup(pattern); |
|
818 |
+ regex->preg.re_magic = 0; |
|
819 |
+ return add_pattern_suffix(matcher, pattern, len, regex); |
|
1252 | 820 |
} |
1253 | 821 |
|
1254 |
-/* c has to be unsigned char here!! */ |
|
1255 |
-static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info) |
|
822 |
+static int add_pattern(struct regex_matcher *matcher, char *pattern) |
|
1256 | 823 |
{ |
1257 |
- struct tree_node** children; |
|
824 |
+ struct text_buffer buf; |
|
825 |
+ struct node *n; |
|
826 |
+ size_t last=0; |
|
1258 | 827 |
int rc; |
828 |
+ struct regex_list *regex = cli_malloc(sizeof(*regex)); |
|
829 |
+ struct node root_node; |
|
830 |
+ size_t len; |
|
831 |
+ /* we only match the host, so remove useless stuff */ |
|
832 |
+ const char remove_end[] = "([/?].*)?/"; |
|
833 |
+ const char remove_end2[] = "([/?].*)/"; |
|
1259 | 834 |
|
1260 |
- massert(node); |
|
1261 |
- massert(c); |
|
1262 |
- massert(info); |
|
1263 |
- |
|
1264 |
- if(!node->u.children) |
|
1265 |
- return MATCH_FAILED;/* tree empty */ |
|
1266 |
- *info = NULL; |
|
1267 |
- len++; |
|
1268 |
- c--; |
|
1269 |
- for(;;) { |
|
1270 |
- massert(node); |
|
1271 |
- children = node->u.children; |
|
1272 |
- switch(node->op) { |
|
1273 |
- case OP_ROOT: |
|
1274 |
- rc=1; |
|
1275 |
- break; |
|
1276 |
- case OP_PARCLOSE: |
|
1277 |
- /*this isn't a real character, so don't move*/ |
|
1278 |
- c--; |
|
1279 |
- len++; |
|
1280 |
- rc=1; |
|
1281 |
- break; |
|
1282 |
- case OP_CHAR: |
|
1283 |
- massert(*c==node->c && "We know this has to match"); |
|
1284 |
- rc = 1;/* *c==node->c;- we know it has matched */ |
|
1285 |
- break; |
|
1286 |
- case OP_DOT: |
|
1287 |
- rc = *c!='\n'; |
|
1288 |
- break; |
|
1289 |
- case OP_STDCLASS: |
|
1290 |
- rc = char_class[*c]&(node->c); |
|
1291 |
- break; |
|
1292 |
- case OP_CUSTOMCLASS: |
|
1293 |
- { |
|
1294 |
- char_bitmap_p bitmap; |
|
1295 |
- massert(children); |
|
1296 |
- bitmap = (char_bitmap_p)node->u.bitmap[0]; |
|
1297 |
- children++; |
|
1298 |
- rc = bitmap[*c>>3]&(1<<(*c&0x7)); |
|
1299 |
- break; |
|
1300 |
- } |
|
1301 |
- case OP_LEAF: |
|
1302 |
- { |
|
1303 |
- const struct leaf_info* leaf = node->u.leaf; |
|
1304 |
- /*isleaf = 1;*/ |
|
1305 |
- if(leaf->preg) { |
|
1306 |
- rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0); |
|
1307 |
- } |
|
1308 |
- else { |
|
1309 |
- massert(*c==node->c && "We know this has to match[2]"); |
|
1310 |
- rc = 1; |
|
1311 |
- } |
|
1312 |
- if(rc) { |
|
1313 |
- *info = leaf->info; |
|
1314 |
- return MATCH_SUCCESS; |
|
1315 |
- } |
|
1316 |
- break; |
|
1317 |
- } |
|
1318 |
- default: |
|
1319 |
- /* impossible */ |
|
1320 |
- cli_errmsg("Encountered invalid operator in tree:%d\n",node->op); |
|
1321 |
- exit(1); |
|
1322 |
- } |
|
1323 |
- len--; |
|
1324 |
- if(!len) rc=0; |
|
1325 |
- c++; |
|
1326 |
- if(rc) { |
|
1327 |
- const char csearch = *c; |
|
1328 |
- int left = 0,right = node->alternatives-1; |
|
1329 |
- int mid; |
|
1330 |
- /*matched so far, go deeper*/ |
|
1331 |
- /*do a binary search between children */ |
|
1332 |
- massert(children); |
|
1333 |
- while(left<=right) { |
|
1334 |
- mid = left+(right-left)/2; |
|
1335 |
- if (children[mid]->c == csearch) |
|
1336 |
- break; |
|
1337 |
- else if(children[mid]->c < csearch) |
|
1338 |
- left=mid+1; |
|
1339 |
- else |
|
1340 |
- right=mid-1; |
|
1341 |
- } |
|
1342 |
- if(left<=right) { |
|
1343 |
- node = children[mid]; |
|
1344 |
- massert(node); |
|
1345 |
- } |
|
1346 |
- else { |
|
1347 |
- if(node->alternatives) { |
|
1348 |
- if(!children[0]->listend) { |
|
1349 |
- node = children[0]; |
|
1350 |
- c++; |
|
1351 |
- len--; |
|
1352 |
- } |
|
1353 |
- while(node && node->listend) { |
|
1354 |
- node = node->next;/* climb up */ |
|
1355 |
- c--; |
|
1356 |
- len++; |
|
1357 |
- } |
|
1358 |
- if(!node || !node->next) |
|
1359 |
- return MATCH_FAILED;/* reached root node */ |
|
1360 |
- node=node->next; |
|
1361 |
- c--; |
|
1362 |
- len++; |
|
1363 |
- } |
|
1364 |
- else if(node->u.children) { |
|
1365 |
- struct tree_node* rewrite_next = NULL; |
|
1366 |
- if(node->op==OP_PARCLOSE) |
|
1367 |
- rewrite_next = node; |
|
1368 |
- node = children[0]; |
|
1369 |
- massert(node); |
|
1370 |
- massert(node->op!=OP_CHAR); |
|
1371 |
- if(rewrite_next) |
|
1372 |
- node->next = rewrite_next;/* this node is pointed to by several parent nodes, |
|
1373 |
- we need to know |
|
1374 |
- from which one we came, so we can find out way back |
|
1375 |
- should we fail to match somewhere deeper*/ |
|
1376 |
- } |
|
1377 |
- } |
|
1378 |
- } |
|
1379 |
- else { |
|
1380 |
- /* this node didn't match, try sibling, or parent (if no more siblings) */ |
|
1381 |
- while(node && node->listend) { |
|
1382 |
- node = node->next;/* sibling of parent */ |
|
1383 |
- c--; |
|
1384 |
- len++; |
|
1385 |
- } |
|
1386 |
- if(!node || !node->next) /* reached root node, it has no next */ |
|
1387 |
- return MATCH_FAILED; |
|
1388 |
- else { |
|
1389 |
- c--; |
|
1390 |
- len++; |
|
1391 |
- node=node->next; |
|
1392 |
- } |
|
1393 |
- } |
|
1394 |
- } |
|
1395 |
- return MATCH_FAILED; |
|
1396 |
-} |
|
1397 |
- |
|
1398 |
-/* push node on stack, only if it isn't there already */ |
|
1399 |
-static void stack_push_once(struct node_stack* stack,struct tree_node* node) |
|
1400 |
-{ |
|
1401 |
- size_t i; |
|
1402 |
- massert(stack); |
|
1403 |
- massert(node); |
|
1404 | 835 |
|
1405 |
- for(i=0;i < stack->cnt;i++) |
|
1406 |
- if(stack->data[i]==node) |
|
1407 |
- return; |
|
1408 |
- stack_push(stack,node); |
|
1409 |
-} |
|
836 |
+ if(!regex) |
|
837 |
+ return CL_EMEM; |
|
1410 | 838 |
|
1411 |
-static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node) |
|
1412 |
-{ |
|
1413 |
- struct tree_node **children; |
|
1414 |
- massert(matcher); |
|
1415 |
- massert(node); |
|
1416 |
- |
|
1417 |
- children = tree_node_get_children(node); |
|
1418 |
- if(node->op==OP_LEAF) { |
|
1419 |
- struct leaf_info* leaf = node->u.leaf; |
|
1420 |
- if(node->next && !node->listend) |
|
1421 |
- destroy_tree_internal(matcher,node->next); |
|
1422 |
- stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */ |
|
1423 |
- stack_push_once(&matcher->node_stack,node); |
|
1424 |
- if(leaf->preg) { |
|
1425 |
- cli_regfree(leaf->preg); |
|
1426 |
- free(leaf->preg); |
|
1427 |
- leaf->preg=NULL; |
|
839 |
+ len = strlen(pattern); |
|
840 |
+ if(len > sizeof(remove_end)) { |
|
841 |
+ if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) { |
|
842 |
+ len -= sizeof(remove_end) - 1; |
|
1428 | 843 |
} |
1429 |
- if(leaf->info) { |
|
1430 |
- free(leaf->info); |
|
1431 |
- leaf->info=NULL; |
|
844 |
+ if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) { |
|
845 |
+ len -= sizeof(remove_end2) - 1; |
|
1432 | 846 |
} |
1433 |
- /* return;*/ |
|
1434 | 847 |
} |
1435 |
- if(node->alternatives) { |
|
1436 |
- int i; |
|
1437 |
- struct tree_node* p; |
|
1438 |
- massert(children); |
|
1439 |
- p = children[0]->op==OP_LEAF ? NULL : children[0]->next; |
|
1440 |
- for(i=0;i<node->alternatives;i++) |
|
1441 |
- destroy_tree_internal(matcher,children[i]); |
|
1442 |
- if(p && p!=node) |
|
1443 |
- destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/ |
|
1444 |
- } |
|
1445 |
- else { |
|
1446 |
- if(children) { |
|
1447 |
- if(children[0]) |
|
1448 |
- destroy_tree_internal(matcher,children[0]); |
|
848 |
+ pattern[len] = '\0'; |
|
849 |
+ |
|
850 |
+ |
|
851 |
+ rc = cli_regcomp(®ex->preg, pattern, REG_EXTENDED); |
|
852 |
+ if(rc) { |
|
853 |
+ size_t buflen = cli_regerror(rc, ®ex->preg, NULL, 0); |
|
854 |
+ char *errbuf = cli_malloc(buflen); |
|
855 |
+ if(errbuf) { |
|
856 |
+ cli_regerror(rc, ®ex->preg, errbuf, buflen); |
|
857 |
+ cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf); |
|
858 |
+ free(errbuf); |
|
859 |
+ } else { |
|
860 |
+ cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern); |
|
1449 | 861 |
} |
862 |
+ return rc; |
|
863 |
+ cli_regfree(®ex->preg); |
|
864 |
+ free(regex); |
|
865 |
+ return CL_EMALFDB; |
|
1450 | 866 |
} |
1451 |
- if(node->op!=OP_LEAF && node->next && !node->listend) |
|
1452 |
- destroy_tree_internal(matcher,node->next); |
|
1453 |
- if(node->u.children) |
|
1454 |
- stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */ |
|
1455 |
- if(node->op==OP_CUSTOMCLASS && node->u.children[0]) { |
|
1456 |
- free(node->u.children[0]); |
|
1457 |
- node->u.children[0]=NULL; |
|
1458 |
- } |
|
1459 |
- stack_push_once(&matcher->node_stack,node); |
|
1460 |
-} |
|
867 |
+ regex->pattern = cli_strdup(pattern); |
|
868 |
+ regex->nxt = NULL; |
|
1461 | 869 |
|
1462 |
-static void destroy_tree(struct regex_matcher* matcher) |
|
1463 |
-{ |
|
1464 |
- /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations, |
|
1465 |
- * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack, |
|
1466 |
- * (and push to stack only if it doesn't contain it already*/ |
|
1467 |
- massert(matcher); |
|
1468 |
- |
|
1469 |
- stack_reset(&matcher->node_stack); |
|
1470 |
- destroy_tree_internal(matcher,matcher->root_regex); |
|
1471 |
- destroy_tree_internal(matcher,matcher->root_regex_hostonly); |
|
1472 |
- while (matcher->node_stack.cnt) { |
|
1473 |
- struct tree_node* node = stack_pop(&matcher->node_stack); |
|
1474 |
- if(node) |
|
1475 |
- free(node); |
|
1476 |
- } |
|
1477 |
-} |
|
1478 |
-#ifndef NDEBUG |
|
1479 |
-static void dump_node(struct tree_node* node) |
|
1480 |
-{ |
|
1481 |
- int i; |
|
1482 |
- struct tree_node* p,**children; |
|
1483 |
- massert(node); |
|
1484 |
- if(node->op==OP_LEAF) { |
|
1485 |
- if(node->u.leaf->preg) |
|
1486 |
- printf("n%p [label=\"regex\\nleaf\"]",(void*)node); |
|
1487 |
- else |
|
1488 |
- printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c); |
|
1489 |
- if(node->next && !node->listend) { |
|
1490 |
- printf("n%p -> n%p;\n",(void*)node,(void*)node->next); |
|
1491 |
- dump_node(node->next); |
|
1492 |
- } |
|
1493 |
- return; |
|
1494 |
- } |
|
1495 |
- printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend); |
|
1496 |
- if(node->next) |
|
1497 |
- printf("n%p -> n%p;\n",(void*)node,(void*)node->next); |
|
1498 |
- printf("n%p -> {",(void*)node);/*using address of node as id*/ |
|
1499 |
- children = tree_node_get_children(node); |
|
1500 |
- if(node->alternatives) |
|
1501 |
- massert(children); |
|
1502 |
- for(i=0;i<node->alternatives;i++) |
|
1503 |
- printf("n%p ",(void*)children[i]); |
|
1504 |
- if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1505 |
- for(p=children[0]->next;p!=node;p=p->next) |
|
1506 |
- { |
|
1507 |
- massert(p); |
|
1508 |
- printf("n%p ",(void*)p); |
|
1509 |
- if(p->op==OP_LEAF || p->listend) |
|
1510 |
- break; |
|
1511 |
- } |
|
1512 |
- if(!node->alternatives && children && children[0]) |
|
1513 |
- printf("n%p ",(void*)children[0]); |
|
1514 |
- printf("};\n"); |
|
1515 |
- printf("{rank=same;"); |
|
1516 |
- for(i=0;i<node->alternatives;i++) |
|
1517 |
- printf("n%p ",(void*)node->u.children[i]); |
|
1518 |
- if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1519 |
- for(p=children[0]->next;p!=node;p=p->next) |
|
1520 |
- { |
|
1521 |
- printf("n%p ",(void*)p); |
|
1522 |
- if(p->op==OP_LEAF || p->listend) |
|
1523 |
- break; |
|
1524 |
- } |
|
1525 |
- if(!node->alternatives && children && children[0]) |
|
1526 |
- printf("n%p ",(void*)children[0]); |
|
1527 |
- printf("};\n"); |
|
1528 |
- for(i=0;i<node->alternatives;i++) |
|
1529 |
- dump_node(children[i]); |
|
1530 |
- if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1531 |
- for(p=children[0]->next;p!=node;p=p->next) |
|
1532 |
- { |
|
1533 |
- dump_node(p); |
|
1534 |
- if(p->op==OP_LEAF || p->listend) |
|
1535 |
- break; |
|
1536 |
- } |
|
1537 |
- if(!node->alternatives && children && children[0]) |
|
1538 |
- dump_node(children[0]); |
|
1539 |
-} |
|
870 |
+ n = parse_regex(pattern, &last); |
|
871 |
+ memset(&buf, 0, sizeof(buf)); |
|
872 |
+ memset(&root_node, 0, sizeof(buf)); |
|
873 |
+ n->parent = &root_node; |
|
1540 | 874 |
|
1541 |
-void dump_tree(struct tree_node* root) |
|
1542 |
-{ |
|
1543 |
- /*use dot/dotty from graphviz to view it*/ |
|
1544 |
- massert(root); |
|
1545 |
- printf("digraph tree {\n"); |
|
1546 |
- dump_node(root); |
|
1547 |
- printf("}\n"); |
|
875 |
+ rc = build_suffixtree_descend(matcher, regex, n, &buf); |
|
876 |
+ destroy_tree(n); |
|
877 |
+ return rc; |
|
1548 | 878 |
} |
1549 |
-#endif |
... | ... |
@@ -24,39 +24,37 @@ |
24 | 24 |
#ifndef _REGEX_LIST_H |
25 | 25 |
#define _REGEX_LIST_H |
26 | 26 |
|
27 |
-#ifdef NDEBUG |
|
28 |
-#define massert(x) (void)(0) |
|
29 |
-#else |
|
30 |
-/*debug version, massert enabled*/ |
|
31 |
- |
|
32 |
-#define __massert_fail(expr,file,line) (void)cli_errmsg("Assertion failed at %s:%d\n %s\n",file,line,expr) |
|
33 |
- |
|
34 |
-#define massert(expr) ((void) ((expr) ? (void)0 : (__massert_fail (#expr,__FILE__,__LINE__)))) |
|
35 |
-#endif |
|
36 |
- |
|
37 | 27 |
#include "phishcheck.h" |
38 | 28 |
#include "readdb.h" |
39 | 29 |
#include "matcher.h" |
40 | 30 |
#include <zlib.h> /* for gzFile */ |
41 |
-struct node_stack { |
|
42 |
- struct tree_node** data; |
|
43 |
- size_t capacity; |
|
44 |
- size_t cnt; |
|
31 |
+ |
|
32 |
+struct regex_list { |
|
33 |
+ const char *pattern; |
|
34 |
+ regex_t preg; |
|
35 |
+ struct regex_list *nxt; |
|
36 |
+}; |
|
37 |
+ |
|
38 |
+struct filter { |
|
39 |
+ uint32_t B[65536]; |
|
40 |
+ uint32_t end_fast[256]; |
|
41 |
+ uint32_t end[65536]; |
|
42 |
+ unsigned long m; |
|
45 | 43 |
}; |
46 | 44 |
|
47 | 45 |
struct regex_matcher { |
48 |
- struct cli_matcher* root_hosts; |
|
49 |
- struct tree_node* root_regex; |
|
50 |
- struct tree_node* root_regex_hostonly; |
|
51 |
- struct node_stack node_stack; |
|
52 |
- struct node_stack node_stack_alt; |
|
53 |
- size_t root_hosts_cnt; |
|
54 |
- int list_inited; |
|
55 |
- int list_loaded; |
|
56 |
- int list_built; |
|
46 |
+ struct hashtable suffix_hash; |
|
47 |
+ size_t suffix_cnt; |
|
48 |
+ struct regex_list **suffix_regexes; |
|
49 |
+ struct cli_matcher suffixes; |
|
50 |
+ struct filter filter; |
|
51 |
+ int list_inited:2; |
|
52 |
+ int list_loaded:2; |
|
53 |
+ int list_built:2; |
|
57 | 54 |
}; |
58 | 55 |
|
59 |
-int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char** info,int is_whitelist); |
|
56 |
+int cli_build_regex_list(struct regex_matcher* matcher); |
|
57 |
+int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char **info, int is_whitelist); |
|
60 | 58 |
int init_regex_list(struct regex_matcher* matcher); |
61 | 59 |
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio); |
62 | 60 |
void regex_list_cleanup(struct regex_matcher* matcher); |