Browse code

performance improvements for URL matching (bb #725, bb #650):
* use a suffix AC-trie and a shift-or FSM to filter
* rewrite the URL regex in C
* use a perfect hash to look up TLD and ccTLD, instead of a regex
* TODO: suffixes having a common prefix: loop over all of them
  cli_ac_free: multiple virname pointing to same location

git-svn: trunk@3978

Török Edvin authored on 2008/07/23 22:51:57
Showing 16 changed files
... ...
@@ -1,3 +1,12 @@
1
+Wed Jul 23 16:32:32 EEST 2008 (edwin)
2
+------------------------------------
3
+  * libclamav: performance improvements for URL matching (bb #725, bb #650):
4
+	* use a suffix AC-trie and a shift-or FSM to filter
5
+	* rewrite the URL regex in C
6
+	* use a perfect hash to lookup TLD and ccTLD, instead of a regex
7
+	* TODO: suffixes having a common prefix: loop over all of them
8
+		cli_ac_free: multiple virname pointing to same location
9
+
1 10
 Mon Jul 21 12:16:44 CEST 2008 (tk)
2 11
 ----------------------------------
3 12
   * sigtool/vba.c: fix crash on error in vba code (bb#1106)
... ...
@@ -1,7 +1,7 @@
1 1
 PERL=perl
2 2
 CC=cc
3 3
 
4
-all: entitylist.h encoding_aliases.h gentbl encname_chars.h
4
+all: entitylist.h encoding_aliases.h gentbl encname_chars.h generate_hash
5 5
 
6 6
 entities_parsed: entities entities/* entity_decl_parse.pl
7 7
 	$(PERL) entity_decl_parse.pl $</* | sort -u >$@
... ...
@@ -9,6 +9,9 @@ entities_parsed: entities entities/* entity_decl_parse.pl
9 9
 generate_entitylist: generate_entitylist.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c
10 10
 	$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@
11 11
 
12
+generate_hash: generate_hash.c ../../libclamav/hashtab.h ../../libclamav/hashtab.c ../../libclamav/others.c
13
+	$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@
14
+
12 15
 generate_encoding_aliases: generate_encoding_aliases.c ../../libclamav/hashtab.c ../../libclamav/others.c ../../libclamav/htmlnorm.h ../../libclamav/entconv.h ../../libclamav/cltypes.h ../../libclamav/hashtab.h ../../libclamav/hashtab.h
13 16
 	$(CC) -I. -DHAVE_CONFIG_H -DCLI_MEMFUNSONLY -DPROFILE_HASHTABLE $< ../../libclamav/hashtab.c ../../libclamav/others.c -o $@
14 17
 
... ...
@@ -26,30 +26,11 @@ OUTFILE=iana_tld.h
26 26
 echo "Downloading updated tld list from iana.org"
27 27
 wget $IANA_TLD -O $TMP || exit 2
28 28
 echo "Download complete, parsing data"
29
-# 174 is the code for |
30
-TLDLIST=$(egrep -v ^# $TMP | tr \\n \\174 | sed 's/[^a-zA-Z]$//')
31
-echo "Parse complete, removing tmpfile"
32
-rm $TMP
33
-echo "Generating tld list in $OUTFILE"
34
-cat >$OUTFILE <<EOF
35
-#ifndef IANA_TLD_H
36
-#define IANA_TLD_H
37
-EOF
38
-echo -n "#define iana_tld \"(" >>$OUTFILE
39
-echo -n $TLDLIST >>$OUTFILE
40
-echo ")\"" >>$OUTFILE
29
+grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_tld.h
41 30
 
42 31
 echo "Downloading updated country-code list from iana.org"
43 32
 wget $IANA_CCTLD -O $TMP || exit 2
44 33
 echo "Download complete, parsing data"
45
-CCTLDLIST=$(cat $TMP | egrep -oi "<a href=[^>]+>\\.([a-zA-Z]+).+</a>" | egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr \\n \\174 | sed 's/[^a-zA-Z]$//')
46
-echo "Parse complete, removing tmpfile"
47
-rm $TMP
48
-echo "Generating cctld list in $OUTFILE"
49
-echo -n "#define iana_cctld \"(" >>$OUTFILE
50
-echo -n $CCTLDLIST >>$OUTFILE
51
-echo ")\"" >>$OUTFILE
52
-
53
-
54
-echo "#endif" >>$OUTFILE
55
-echo "Finished succesfully"
34
+cat $TMP | grep country-code|egrep -oi "<a
35
+href=[^>]+>\\.([a-zA-Z]+).+</a>"|egrep -o ">.[a-zA-Z]+" | colrm 1 2 | tr [A-Z] [a-z]| gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set |grep -v '^#line'|sed -e 's/^const struct/static const struct/' -e 's/register //g' >iana_cctld.h
36
+echo "Done"
... ...
@@ -26,17 +26,4 @@ echo "Downloading updated tld list from iana.org"
26 26
 wget $IANA_TLD -O $TMP || exit 2
27 27
 echo "Download complete, parsing data"
28 28
 # 174 is the code for |
29
-TLDLIST=$(egrep -v ^# $TMP|tr \\n \\174 )
30
-echo "Parse complete, removing tmpfile"
31
-rm $TMP
32
-echo "Generating $OUTFILE"
33
-cat >$OUTFILE <<EOF
34
-#ifndef IANA_TLD_H
35
-#define IANA_TLD_H
36
-EOF
37
-echo -n "#define iana_tld \"(" >>$OUTFILE
38
-echo -n $TLDLIST >>$OUTFILE
39
-echo ")\"" >>$OUTFILE
40
-echo "#endif" >>$OUTFILE
41
-echo "Finished succesfully"
42
-
29
+grep -Ev ^# $TMP | tr [A-Z] [a-z] | gperf -C -H tld_hash -N in_tld_set -l|grep -v '^#line' | sed -e 's/^const struct/static const struct/' -e 's/register //g'
... ...
@@ -361,7 +361,7 @@ All 4 tests passed
361 361
 	 \item The exact output from \verb+make check+	 
362 362
 	 \item Output of \verb+uname -mrsp+ 
363 363
 	 \item your \verb+config.log+	 
364
-	 \item The following files from the \verb+unit-tests/+ directory:
364
+	 \item The following files from the \verb+unit_tests/+ directory:
365 365
 		\begin{itemize}
366 366
 			\item \verb+test.log+
367 367
 	 		\item \verb+clamscan.log+
... ...
@@ -367,10 +367,18 @@ void hashtab_clear(struct hashtable *s)
367 367
 		if(s->htable[i].key && s->htable[i].key != DELETED_KEY)
368 368
 			free((void *)s->htable[i].key);
369 369
 	}
370
-	memset(s->htable, 0, s->capacity);
370
+	if(s->htable)
371
+		memset(s->htable, 0, s->capacity);
371 372
 	s->used = 0;
372 373
 }
373 374
 
375
+void hashtab_free(struct hashtable *s)
376
+{
377
+	hashtab_clear(s);
378
+	free(s->htable);
379
+	s->htable = NULL;
380
+	s->capacity = 0;
381
+}
374 382
 
375 383
 int hashtab_store(const struct hashtable *s,FILE* out)
376 384
 {
... ...
@@ -82,7 +82,7 @@ int hashtab_init(struct hashtable *s,size_t capacity);
82 82
 const struct element* hashtab_insert(struct hashtable *s, const char* key, const size_t len, const element_data data);
83 83
 void hashtab_delete(struct hashtable *s,const char* key,const size_t len);
84 84
 void hashtab_clear(struct hashtable *s);
85
-
85
+void hashtab_free(struct hashtable *s);
86 86
 int hashtab_load(FILE* in, struct hashtable *s);
87 87
 int hashtab_store(const struct hashtable *s,FILE* out);
88 88
 
89 89
new file mode 100644
... ...
@@ -0,0 +1,505 @@
0
+/* ANSI-C code produced by gperf version 3.0.3 */
1
+/* Command-line: gperf -C -l -L ANSI-C -E -C -H cctld_hash -N in_cctld_set  */
2
+/* Computed positions: -k'1-2' */
3
+
4
+#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
5
+      && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
6
+      && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
7
+      && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
8
+      && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
9
+      && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
10
+      && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
11
+      && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
12
+      && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
13
+      && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
14
+      && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
15
+      && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
16
+      && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
17
+      && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
18
+      && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
19
+      && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
20
+      && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
21
+      && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
22
+      && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
23
+      && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
24
+      && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
25
+      && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
26
+      && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
27
+/* The character set is not based on ISO-646.  */
28
+#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
29
+#endif
30
+
31
+/* maximum key range = 472, duplicates = 0 */
32
+
/*
 * gperf-generated hash over the first two bytes of a ccTLD candidate.
 *
 * str: candidate keyword; str[0] and str[1] are read unconditionally, so
 *      the caller must supply at least two readable bytes.
 * len: length of the candidate, folded into the hash as-is.
 *
 * Returns len plus the table weight of the second byte plus the table
 * weight of the first byte shifted by 25 positions.
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
cctld_hash (const char *str, unsigned int len)
{
  /* 281 entries: large enough for the biggest index, 255 + 25. */
  static const unsigned short weights[] =
    {
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 119,  97,  33,
      103,   4,  59, 115, 210, 149, 169, 143, 175,  55,
      145,  89, 178,  37,  85,  18,  34, 239,   2,  73,
      112,   3,  25,  10,  15, 117, 209, 229, 150, 223,
      200,  78, 225,  54,   5, 215, 215, 190,  25,  23,
        0,  20, 233, 234,  14, 476,  33, 204, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476, 476, 476, 476, 476, 476, 476, 476, 476, 476,
      476
    };
  const unsigned int second_term = weights[(unsigned char)str[1]];
  const unsigned int first_term = weights[(unsigned char)str[0]+25];

  return len + second_term + first_term;
}
77
+
/*
 * gperf-generated perfect-hash membership test for the ccTLD keyword set.
 *
 * str: candidate keyword (compared byte-for-byte; the stored keywords are
 *      all lowercase two-letter strings).
 * len: number of bytes of str to consider.
 *
 * Returns a pointer to the matching entry of the internal static word
 * list when the candidate is one of the keywords, 0 otherwise.
 */
#ifdef __GNUC__
__inline
#ifdef __GNUC_STDC_INLINE__
__attribute__ ((__gnu_inline__))
#endif
#endif
const char *
in_cctld_set (const char *str, unsigned int len)
{
  enum
    {
      TOTAL_KEYWORDS = 252,
      MIN_WORD_LENGTH = 2,
      MAX_WORD_LENGTH = 2,
      MIN_HASH_VALUE = 4,
      MAX_HASH_VALUE = 475
    };

  /* Keyword length stored at each hash slot; 0 marks an unused slot. */
  static const unsigned char lengthtable[] =
    {
       0,  0,  0,  0,  2,  2,  2,  0,  0,  2,  2,  2,  0,  0,
       2,  2,  2,  0,  0,  2,  2,  0,  0,  0,  2,  2,  0,  2,
       0,  2,  2,  2,  2,  0,  2,  2,  2,  2,  0,  2,  2,  2,
       2,  2,  2,  2,  2,  2,  0,  0,  2,  0,  2,  0,  0,  2,
       2,  2,  2,  2,  2,  2,  2,  0,  2,  0,  2,  2,  0,  2,
       0,  2,  2,  0,  2,  2,  2,  2,  0,  0,  2,  2,  2,  0,
       2,  2,  2,  2,  0,  2,  2,  2,  2,  0,  0,  2,  2,  2,
       2,  2,  2,  2,  2,  0,  0,  2,  2,  2,  0,  2,  2,  2,
       2,  0,  2,  2,  2,  2,  0,  2,  2,  2,  2,  2,  0,  2,
       2,  2,  0,  2,  2,  2,  2,  0,  0,  2,  2,  2,  0,  2,
       0,  2,  2,  0,  2,  2,  2,  2,  0,  0,  2,  2,  2,  2,
       0,  2,  2,  2,  0,  0,  2,  2,  2,  0,  0,  2,  2,  2,
       0,  2,  2,  2,  2,  0,  2,  2,  2,  2,  0,  0,  0,  2,
       2,  0,  0,  2,  2,  2,  0,  2,  0,  2,  2,  0,  0,  2,
       2,  2,  0,  2,  2,  0,  2,  0,  0,  2,  2,  2,  2,  0,
       2,  2,  2,  0,  0,  2,  0,  2,  0,  0,  2,  2,  2,  0,
       0,  2,  2,  2,  0,  2,  2,  2,  2,  0,  0,  0,  2,  2,
       2,  2,  2,  2,  2,  0,  2,  2,  2,  2,  0,  2,  2,  2,
       2,  2,  0,  2,  2,  2,  2,  2,  2,  2,  2,  0,  2,  2,
       2,  2,  0,  2,  0,  2,  2,  0,  2,  0,  2,  2,  0,  2,
       2,  0,  2,  0,  0,  0,  2,  2,  2,  0,  2,  2,  0,  0,
       0,  2,  2,  2,  0,  0,  2,  2,  2,  0,  0,  2,  2,  2,
       0,  0,  2,  2,  2,  0,  0,  0,  2,  0,  0,  0,  2,  0,
       0,  0,  0,  2,  2,  2,  0,  0,  2,  0,  2,  0,  0,  2,
       2,  2,  0,  0,  0,  0,  2,  0,  0,  0,  0,  2,  0,  0,
       2,  2,  0,  0,  2,  2,  0,  0,  0,  0,  0,  0,  2,  0,
       0,  0,  2,  2,  2,  0,  2,  0,  2,  0,  2,  0,  2,  2,
       2,  0,  2,  2,  0,  0,  0,  2,  0,  0,  0,  0,  0,  2,
       2,  0,  0,  2,  0,  0,  0,  0,  2,  0,  2,  0,  0,  2,
       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       0,  0,  2,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,
       0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2
    };
  /* Keyword stored at each hash slot; "" marks an unused slot. */
  static const char * const wordlist[] =
    {
      "", "", "", "",
      "sv",
      "sy",
      "se",
      "", "",
      "mv",
      "my",
      "me",
      "", "",
      "bv",
      "by",
      "be",
      "", "",
      "cv",
      "cy",
      "", "", "",
      "tv",
      "ms",
      "",
      "sz",
      "",
      "re",
      "bs",
      "ae",
      "mz",
      "",
      "ws",
      "sc",
      "st",
      "bz",
      "",
      "ye",
      "mc",
      "mt",
      "cz",
      "rs",
      "mq",
      "as",
      "bt",
      "tz",
      "", "",
      "cc",
      "",
      "az",
      "", "",
      "tc",
      "tt",
      "sm",
      "lv",
      "ly",
      "ac",
      "at",
      "mm",
      "",
      "aq",
      "",
      "mf",
      "bm",
      "",
      "yt",
      "",
      "bf",
      "cm",
      "",
      "ls",
      "wf",
      "cf",
      "tm",
      "", "",
      "mw",
      "tf",
      "am",
      "",
      "je",
      "bw",
      "af",
      "sr",
      "",
      "lc",
      "lt",
      "so",
      "mr",
      "", "",
      "tw",
      "mo",
      "br",
      "rw",
      "sb",
      "aw",
      "bo",
      "cr",
      "", "",
      "sd",
      "co",
      "tr",
      "",
      "bb",
      "md",
      "to",
      "ar",
      "",
      "ro",
      "bd",
      "ao",
      "sg",
      "",
      "mx",
      "cd",
      "sa",
      "mg",
      "de",
      "",
      "td",
      "ma",
      "bg",
      "",
      "cx",
      "ad",
      "ba",
      "cg",
      "", "",
      "jm",
      "ca",
      "tg",
      "",
      "ax",
      "",
      "lr",
      "ag",
      "",
      "dz",
      "sk",
      "qa",
      "sn",
      "", "",
      "mk",
      "si",
      "mn",
      "lb",
      "",
      "gy",
      "ge",
      "bn",
      "", "",
      "ck",
      "bi",
      "cn",
      "", "",
      "tk",
      "ci",
      "tn",
      "",
      "jo",
      "gs",
      "sj",
      "an",
      "",
      "dm",
      "la",
      "ai",
      "sl",
      "", "", "",
      "bj",
      "ml",
      "", "",
      "mp",
      "gt",
      "bl",
      "",
      "gq",
      "",
      "tj",
      "cl",
      "", "",
      "py",
      "pe",
      "tl",
      "",
      "lk",
      "tp",
      "",
      "al",
      "", "",
      "li",
      "ie",
      "gm",
      "do",
      "",
      "ps",
      "gf",
      "sh",
      "", "",
      "ee",
      "",
      "mh",
      "", "",
      "is",
      "ne",
      "bh",
      "", "",
      "gw",
      "pt",
      "ch",
      "",
      "es",
      "ky",
      "ke",
      "th",
      "", "", "",
      "it",
      "gr",
      "uy",
      "iq",
      "ve",
      "su",
      "nz",
      "",
      "ec",
      "et",
      "mu",
      "pm",
      "",
      "gb",
      "nc",
      "pf",
      "kz",
      "us",
      "",
      "gd",
      "cu",
      "im",
      "jp",
      "ht",
      "uz",
      "zm",
      "dk",
      "",
      "ru",
      "pw",
      "au",
      "gg",
      "",
      "vc",
      "",
      "ga",
      "om",
      "",
      "yu",
      "",
      "nf",
      "pr",
      "",
      "zw",
      "hm",
      "",
      "km",
      "", "", "",
      "fm",
      "ir",
      "dj",
      "",
      "um",
      "io",
      "", "", "",
      "lu",
      "er",
      "gn",
      "", "",
      "kw",
      "gi",
      "nr",
      "", "",
      "id",
      "no",
      "pg",
      "", "",
      "hr",
      "pa",
      "kr",
      "", "", "",
      "fr",
      "", "", "",
      "fo",
      "", "", "", "",
      "za",
      "eg",
      "gl",
      "", "",
      "gp",
      "",
      "ng",
      "", "",
      "pk",
      "na",
      "pn",
      "", "", "", "",
      "kg",
      "", "", "", "",
      "in",
      "", "",
      "ug",
      "vg",
      "", "",
      "ua",
      "va",
      "", "", "", "", "", "",
      "gh",
      "", "", "",
      "ni",
      "pl",
      "hk",
      "",
      "hn",
      "",
      "kn",
      "",
      "fk",
      "",
      "ki",
      "il",
      "uk",
      "",
      "fi",
      "vn",
      "", "", "",
      "vi",
      "", "", "", "", "",
      "gu",
      "nl",
      "", "",
      "np",
      "", "", "", "",
      "fj",
      "",
      "ph",
      "", "",
      "kp",
      "", "", "", "", "", "", "", "", "",
      "", "", "", "", "", "",
      "eh",
      "", "", "", "", "", "", "", "", "",
      "", "", "", "", "", "",
      "kh",
      "", "", "", "", "", "", "", "", "",
      "", "", "",
      "eu",
      "", "", "", "", "",
      "nu",
      "", "", "", "", "", "", "",
      "hu",
      "", "", "", "", "", "", "", "", "",
      "",
      "vu"
    };
  int slot;
  const char *candidate;

  /* Only lengths inside the keyword length range can match. */
  if (len < MIN_WORD_LENGTH || len > MAX_WORD_LENGTH)
    return 0;

  slot = cctld_hash (str, len);
  if (slot < 0 || slot > MAX_HASH_VALUE)
    return 0;

  /* An unused slot stores length 0 and is rejected here, before the
     byte comparison below touches the slot's string. */
  if (lengthtable[slot] != len)
    return 0;

  candidate = wordlist[slot];
  if (*candidate != *str)
    return 0;
  if (memcmp (str + 1, candidate + 1, len - 1) != 0)
    return 0;

  return candidate;
}
... ...
@@ -1,28 +1,746 @@
1
-/*
2
- *  Phishing module: iana tld list.
3
- *
4
- *  Copyright (C) 2007-2008 Sourcefire, Inc.
5
- *
6
- *  Authors: Török Edvin
7
- *
8
- *  This program is free software; you can redistribute it and/or modify
9
- *  it under the terms of the GNU General Public License version 2 as
10
- *  published by the Free Software Foundation.
11
- *
12
- *  This program is distributed in the hope that it will be useful,
13
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- *  GNU General Public License for more details.
16
- *
17
- *  You should have received a copy of the GNU General Public License
18
- *  along with this program; if not, write to the Free Software
19
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20
- *  MA 02110-1301, USA.
21
- */
22
-
23
-#ifndef IANA_TLD_H
24
-#define IANA_TLD_H
25
-#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM|TRAVEL|XN--ZCKZAH|XN--0ZWM56D|XN--DEBA0AD|XN--G6W251D|XN--JXALPDLP|XN--KGBECHTV|XN--9T4B11YI5A|XN--80AKHBYKNJ4F|XN--11B5BS3A9AJ6G|XN--HGBK6AJ7F53BBA)"
26
-#define iana_cctld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJLMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGHRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNPRWYZ]|L[ABCIKRSTUVY]|M[ACDEFGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOSUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|IN[TT]|MIL|NET|ORG|PRO|TEL|AERO|ARP[AA]|ASIA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)"
1
+/* ANSI-C code produced by gperf version 3.0.3 */
2
+/* Command-line: gperf -C -l -L ANSI-C -E -C -H tld_hash -N in_tld_set  */
3
+/* Computed positions: -k'1-2,6' */
4
+
5
+#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
6
+      && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
7
+      && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
8
+      && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
9
+      && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
10
+      && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
11
+      && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
12
+      && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
13
+      && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
14
+      && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
15
+      && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
16
+      && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
17
+      && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
18
+      && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
19
+      && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
20
+      && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
21
+      && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
22
+      && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
23
+      && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
24
+      && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
25
+      && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
26
+      && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
27
+      && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
28
+/* The character set is not based on ISO-646.  */
29
+#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
30
+#endif
31
+
32
+/* maximum key range = 983, duplicates = 0 */
33
+
/*
 * gperf-generated hash for the TLD keyword set (key positions 1-2 and 6).
 *
 * str: candidate keyword.
 * len: length of the candidate; it seeds the hash and selects which bytes
 *      are mixed in:
 *        - len == 1        : first byte only
 *        - len in 2..5     : first and second byte
 *        - any other value : first, second and sixth byte (this includes
 *                            len == 0, so the caller is expected to
 *                            filter lengths before hashing)
 *
 * Returns the accumulated hash value.
 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
tld_hash (const char *str, unsigned int len)
{
  /* 281 entries: large enough for the biggest index, 255 + 25. */
  static const unsigned short weights[] =
    {
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988,   0,  15,
      988, 988, 988, 988,   0, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 170, 328,  88,
        3,  50, 293, 205, 123, 430, 500, 238, 115, 320,
      375,  30, 413, 348,  70,  43, 475,  18,   6, 283,
       95,  58,  10, 220,   5, 485, 480,   8, 190, 390,
      225, 113, 420,  95,   0,  15,  50, 295,  20, 128,
      130,  80, 405, 470, 340,   0, 305, 415, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
      988
    };
  const int length = len;
  int hval = length;

  /* Sixth byte: mixed in for every length outside 1..5. */
  if (length < 1 || length > 5)
    hval += weights[(unsigned char)str[5]];

  /* Second byte: mixed in for every length except exactly 1. */
  if (length != 1)
    hval += weights[(unsigned char)str[1]];

  /* First byte: always mixed in, looked up 25 slots further on. */
  hval += weights[(unsigned char)str[0]+25];

  return hval;
}
95
+
96
+#ifdef __GNUC__
97
+__inline
98
+#ifdef __GNUC_STDC_INLINE__
99
+__attribute__ ((__gnu_inline__))
100
+#endif
27 101
 #endif
102
+const char *
103
+in_tld_set (const char *str, unsigned int len)
104
+{
105
+  enum
106
+    {
107
+      TOTAL_KEYWORDS = 280,
108
+      MIN_WORD_LENGTH = 2,
109
+      MAX_WORD_LENGTH = 18,
110
+      MIN_HASH_VALUE = 5,
111
+      MAX_HASH_VALUE = 987
112
+    };
113
+
114
+  static const unsigned char lengthtable[] =
115
+    {
116
+       0,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  0,  2,  2,
117
+       0,  2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,
118
+       0,  0,  2,  0,  2,  0,  4,  2,  0,  2,  3,  4,  2,  0,
119
+       2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,
120
+       0,  4,  0,  0,  2,  0,  2,  0,  4,  2,  0,  2,  3,  0,
121
+       0,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  0,
122
+       4,  2,  0,  2,  2,  0,  2,  0,  2,  0,  0,  2,  0,  2,
123
+       0,  0,  2,  0,  2,  2,  0,  2,  0,  2,  0,  0,  0,  0,
124
+       2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  3,  0,  2,
125
+       0,  2,  0,  0,  2,  0,  2,  3,  0,  2,  0,  0,  2,  0,
126
+       2,  0,  2,  0,  0,  2,  0,  4,  2,  0,  2,  0,  2,  0,
127
+       0,  2,  0,  0,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,
128
+       0,  0,  2,  0,  2,  2,  0,  0,  0,  2,  3,  0,  2,  0,
129
+       2,  0,  0,  2,  0,  2,  0,  4,  2,  0,  2,  0,  0,  2,
130
+       0,  2,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  0,  0,
131
+       2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  3,
132
+       0,  2,  0,  0,  2,  0,  2,  0,  2,  0,  0,  2,  0,  0,
133
+       0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  2,  0,  2,  0,
134
+       2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,
135
+       0,  2,  0,  0,  2,  6,  2,  0,  0,  0,  0,  2,  0,  0,
136
+       2,  0,  0,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  0,
137
+       0,  2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,
138
+       0,  0,  2,  0,  0,  0,  0,  2,  0,  0,  0,  0,  2,  0,
139
+       2,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,
140
+       0,  2,  0,  0,  2,  0,  2,  0,  6,  2,  0,  2,  0,  0,
141
+       2,  0,  0,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  0,
142
+       0,  2,  0,  2,  3,  0,  2,  0,  2,  0,  0,  2,  0,  2,
143
+       0,  0,  0,  0,  2,  0,  0,  2, 11,  2,  0,  0,  0, 16,
144
+       2,  0,  0,  0, 11,  2,  0,  0,  0,  0,  2,  0,  0,  0,
145
+       0, 17,  0,  0,  2,  0,  2,  2,  0,  2,  0,  2,  0,  0,
146
+       2,  0,  0,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  3,
147
+       0,  2, 11,  2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,
148
+       0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,  0,
149
+       2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2, 10,  0,  2,
150
+       0,  2,  0,  0,  2,  0, 12,  0,  0,  2,  3,  2,  0,  0,
151
+       2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  0,
152
+       0,  2,  0,  2, 18,  0,  2,  0,  2,  0,  0,  2,  0,  2,
153
+       0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  2,  0,  0,  0,
154
+       2,  0,  0,  2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,
155
+       0,  2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,  0,  0,
156
+       2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  0,
157
+       0,  2,  0,  2,  0,  0,  2,  0,  2,  0,  0,  0,  0,  2,
158
+       0,  0,  2,  0, 12,  0,  0,  0,  0,  2, 18,  0,  0,  0,
159
+       2,  3,  4,  2,  0,  2,  0,  0,  0,  0,  2,  0,  0,  0,
160
+       0,  2,  0,  0,  0,  0,  2,  0,  0,  0,  0,  2,  0,  0,
161
+       2,  0,  2,  0,  0,  2,  0,  0,  0,  0,  0,  0,  2,  0,
162
+       0,  2,  0,  0,  0,  0,  0,  0,  2,  3,  0,  0,  0,  0,
163
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
164
+       2,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,
165
+       0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,
166
+       2,  0,  2,  0,  0,  2,  0,  0,  0,  0,  0,  0,  2,  0,
167
+       0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  2,  0,  0,
168
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
169
+       2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
170
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,
171
+       0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
172
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
173
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
174
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
175
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
176
+       0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  2,  0,
177
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14,  0,  0,  2,
178
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
179
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
180
+       0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,
181
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
182
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
183
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
184
+       0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,
185
+       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
186
+       0,  0,  0,  0,  0,  0,  0,  2
187
+    };
188
+  static const char * const wordlist[] =
189
+    {
190
+      "", "", "", "", "",
191
+      "md",
192
+      "", "",
193
+      "mv",
194
+      "",
195
+      "cd",
196
+      "",
197
+      "mz",
198
+      "cv",
199
+      "",
200
+      "ad",
201
+      "",
202
+      "cz",
203
+      "", "",
204
+      "mu",
205
+      "",
206
+      "az",
207
+      "", "",
208
+      "cu",
209
+      "",
210
+      "nz",
211
+      "", "",
212
+      "au",
213
+      "",
214
+      "mo",
215
+      "",
216
+      "mobi",
217
+      "nu",
218
+      "",
219
+      "co",
220
+      "com",
221
+      "coop",
222
+      "fo",
223
+      "",
224
+      "ao",
225
+      "", "",
226
+      "ms",
227
+      "",
228
+      "no",
229
+      "", "", "", "",
230
+      "me",
231
+      "", "",
232
+      "as",
233
+      "",
234
+      "asia",
235
+      "", "",
236
+      "my",
237
+      "",
238
+      "ae",
239
+      "",
240
+      "aero",
241
+      "cy",
242
+      "",
243
+      "ne",
244
+      "net",
245
+      "", "", "",
246
+      "mr",
247
+      "", "", "", "",
248
+      "cr",
249
+      "", "",
250
+      "fr",
251
+      "",
252
+      "ar",
253
+      "",
254
+      "arpa",
255
+      "td",
256
+      "",
257
+      "nr",
258
+      "tv",
259
+      "",
260
+      "mc",
261
+      "",
262
+      "tz",
263
+      "", "",
264
+      "cc",
265
+      "",
266
+      "mx",
267
+      "", "",
268
+      "ac",
269
+      "",
270
+      "cx",
271
+      "lv",
272
+      "",
273
+      "nc",
274
+      "",
275
+      "ax",
276
+      "", "", "", "",
277
+      "to",
278
+      "", "",
279
+      "lu",
280
+      "",
281
+      "ml",
282
+      "", "", "", "",
283
+      "cl",
284
+      "org",
285
+      "",
286
+      "mh",
287
+      "",
288
+      "al",
289
+      "", "",
290
+      "ch",
291
+      "",
292
+      "nl",
293
+      "tel",
294
+      "",
295
+      "sd",
296
+      "", "",
297
+      "sv",
298
+      "",
299
+      "ls",
300
+      "",
301
+      "sz",
302
+      "", "",
303
+      "jo",
304
+      "",
305
+      "jobs",
306
+      "ru",
307
+      "",
308
+      "su",
309
+      "",
310
+      "tr",
311
+      "", "",
312
+      "ly",
313
+      "", "", "", "",
314
+      "ro",
315
+      "",
316
+      "so",
317
+      "", "",
318
+      "je",
319
+      "",
320
+      "lr",
321
+      "", "",
322
+      "tc",
323
+      "",
324
+      "ma",
325
+      "rs",
326
+      "", "", "",
327
+      "ca",
328
+      "cat",
329
+      "",
330
+      "re",
331
+      "",
332
+      "se",
333
+      "", "",
334
+      "lc",
335
+      "",
336
+      "na",
337
+      "",
338
+      "name",
339
+      "sy",
340
+      "",
341
+      "qa",
342
+      "", "",
343
+      "gd",
344
+      "",
345
+      "tl",
346
+      "", "", "", "",
347
+      "sr",
348
+      "", "",
349
+      "th",
350
+      "",
351
+      "mg",
352
+      "", "",
353
+      "gu",
354
+      "",
355
+      "cg",
356
+      "", "", "", "",
357
+      "ag",
358
+      "", "",
359
+      "sc",
360
+      "",
361
+      "ng",
362
+      "gov",
363
+      "",
364
+      "bd",
365
+      "", "",
366
+      "bv",
367
+      "",
368
+      "id",
369
+      "",
370
+      "bz",
371
+      "", "",
372
+      "gs",
373
+      "", "", "", "",
374
+      "mk",
375
+      "",
376
+      "ge",
377
+      "", "",
378
+      "ck",
379
+      "",
380
+      "sl",
381
+      "fk",
382
+      "",
383
+      "gy",
384
+      "",
385
+      "bo",
386
+      "", "",
387
+      "sh",
388
+      "",
389
+      "io",
390
+      "", "", "", "",
391
+      "gr",
392
+      "", "",
393
+      "bs",
394
+      "",
395
+      "la",
396
+      "", "",
397
+      "is",
398
+      "travel",
399
+      "be",
400
+      "", "", "", "",
401
+      "ie",
402
+      "", "",
403
+      "by",
404
+      "", "", "", "",
405
+      "mw",
406
+      "",
407
+      "tg",
408
+      "", "", "", "",
409
+      "br",
410
+      "", "",
411
+      "aw",
412
+      "",
413
+      "ir",
414
+      "", "",
415
+      "cf",
416
+      "",
417
+      "sa",
418
+      "", "",
419
+      "af",
420
+      "",
421
+      "gl",
422
+      "", "",
423
+      "nf",
424
+      "", "", "", "",
425
+      "gh",
426
+      "", "", "", "",
427
+      "tk",
428
+      "",
429
+      "mm",
430
+      "", "",
431
+      "yu",
432
+      "",
433
+      "cm",
434
+      "", "",
435
+      "fm",
436
+      "",
437
+      "am",
438
+      "", "",
439
+      "lk",
440
+      "",
441
+      "sg",
442
+      "", "",
443
+      "ps",
444
+      "",
445
+      "il",
446
+      "",
447
+      "museum",
448
+      "bh",
449
+      "",
450
+      "pe",
451
+      "", "",
452
+      "mq",
453
+      "", "", "", "",
454
+      "py",
455
+      "",
456
+      "ye",
457
+      "", "",
458
+      "aq",
459
+      "",
460
+      "ga",
461
+      "", "",
462
+      "tw",
463
+      "",
464
+      "pr",
465
+      "pro",
466
+      "",
467
+      "sk",
468
+      "",
469
+      "om",
470
+      "", "",
471
+      "tf",
472
+      "",
473
+      "mn",
474
+      "", "", "", "",
475
+      "cn",
476
+      "", "",
477
+      "ws",
478
+      "xn--g6w251d",
479
+      "an",
480
+      "", "", "",
481
+      "xn--80akhbyknj4f",
482
+      "ba",
483
+      "", "", "",
484
+      "xn--0zwm56d",
485
+      "gg",
486
+      "", "", "", "",
487
+      "tm",
488
+      "", "", "", "",
489
+      "xn--11b5bs3a9aj6g",
490
+      "", "",
491
+      "hu",
492
+      "",
493
+      "pl",
494
+      "rw",
495
+      "",
496
+      "mp",
497
+      "",
498
+      "uz",
499
+      "", "",
500
+      "ph",
501
+      "", "", "", "",
502
+      "lb",
503
+      "",
504
+      "bg",
505
+      "", "",
506
+      "np",
507
+      "",
508
+      "kz",
509
+      "mil",
510
+      "",
511
+      "jm",
512
+      "xn--deba0ad",
513
+      "ci",
514
+      "", "",
515
+      "fi",
516
+      "",
517
+      "ai",
518
+      "", "", "", "",
519
+      "ni",
520
+      "", "",
521
+      "us",
522
+      "",
523
+      "sm",
524
+      "", "", "", "",
525
+      "tn",
526
+      "", "",
527
+      "sb",
528
+      "",
529
+      "hr",
530
+      "", "",
531
+      "uy",
532
+      "",
533
+      "pa",
534
+      "", "", "", "",
535
+      "ke",
536
+      "xn--zckzah",
537
+      "",
538
+      "gw",
539
+      "",
540
+      "mt",
541
+      "", "",
542
+      "ky",
543
+      "",
544
+      "xn--jxalpdlp",
545
+      "", "",
546
+      "gf",
547
+      "edu",
548
+      "at",
549
+      "", "",
550
+      "vu",
551
+      "",
552
+      "kr",
553
+      "", "",
554
+      "tp",
555
+      "",
556
+      "dz",
557
+      "", "",
558
+      "eu",
559
+      "",
560
+      "pg",
561
+      "", "",
562
+      "bw",
563
+      "",
564
+      "sn",
565
+      "xn--hlcj6aya9esc7a",
566
+      "",
567
+      "fj",
568
+      "",
569
+      "gm",
570
+      "", "",
571
+      "bf",
572
+      "",
573
+      "do",
574
+      "", "",
575
+      "gb",
576
+      "",
577
+      "ve",
578
+      "", "",
579
+      "es",
580
+      "",
581
+      "li",
582
+      "jp",
583
+      "", "", "",
584
+      "ee",
585
+      "", "",
586
+      "pk",
587
+      "",
588
+      "de",
589
+      "", "",
590
+      "gq",
591
+      "",
592
+      "bm",
593
+      "", "",
594
+      "kh",
595
+      "",
596
+      "im",
597
+      "", "",
598
+      "bb",
599
+      "",
600
+      "er",
601
+      "", "", "", "",
602
+      "tt",
603
+      "", "",
604
+      "vc",
605
+      "",
606
+      "si",
607
+      "", "", "", "",
608
+      "gn",
609
+      "", "",
610
+      "ec",
611
+      "",
612
+      "lt",
613
+      "", "",
614
+      "iq",
615
+      "",
616
+      "ua",
617
+      "", "",
618
+      "pw",
619
+      "",
620
+      "tj",
621
+      "", "", "", "",
622
+      "za",
623
+      "", "",
624
+      "pf",
625
+      "",
626
+      "xn--kgbechtv",
627
+      "", "", "", "",
628
+      "bn",
629
+      "xn--hgbk6aj7f53bba",
630
+      "", "", "",
631
+      "in",
632
+      "int",
633
+      "info",
634
+      "gp",
635
+      "",
636
+      "st",
637
+      "", "", "", "",
638
+      "ug",
639
+      "", "", "", "",
640
+      "pm",
641
+      "", "", "", "",
642
+      "gi",
643
+      "", "", "", "",
644
+      "kg",
645
+      "", "",
646
+      "hk",
647
+      "",
648
+      "sj",
649
+      "", "",
650
+      "wf",
651
+      "", "", "", "", "", "",
652
+      "va",
653
+      "", "",
654
+      "uk",
655
+      "", "", "", "", "", "",
656
+      "bi",
657
+      "biz",
658
+      "", "", "", "", "", "", "", "", "",
659
+      "", "", "", "",
660
+      "gt",
661
+      "", "", "", "",
662
+      "pn",
663
+      "", "", "", "",
664
+      "vg",
665
+      "", "", "", "", "", "", "", "", "",
666
+      "eg",
667
+      "", "", "", "", "", "", "", "", "",
668
+      "bt",
669
+      "", "",
670
+      "zw",
671
+      "",
672
+      "it",
673
+      "", "",
674
+      "kw",
675
+      "", "", "", "", "", "",
676
+      "hm",
677
+      "", "", "", "", "", "", "", "", "",
678
+      "bj",
679
+      "", "",
680
+      "dk",
681
+      "", "", "", "", "", "", "", "", "",
682
+      "", "",
683
+      "zm",
684
+      "", "", "", "",
685
+      "km",
686
+      "", "", "", "", "", "", "", "", "",
687
+      "", "", "", "", "", "", "", "", "",
688
+      "", "", "", "", "", "",
689
+      "hn",
690
+      "", "", "", "",
691
+      "pt",
692
+      "", "", "", "", "", "", "", "", "",
693
+      "yt",
694
+      "", "", "", "", "", "", "", "", "",
695
+      "", "", "", "", "",
696
+      "kn",
697
+      "", "", "", "", "", "", "", "", "",
698
+      "dm",
699
+      "", "", "", "", "", "", "", "", "",
700
+      "", "", "", "", "", "", "", "", "",
701
+      "", "", "", "", "", "", "", "", "",
702
+      "kp",
703
+      "", "", "", "", "", "", "", "", "",
704
+      "", "",
705
+      "vn",
706
+      "", "", "", "",
707
+      "ki",
708
+      "", "", "", "", "", "", "", "", "",
709
+      "", "",
710
+      "xn--9t4b11yi5a",
711
+      "", "",
712
+      "ht",
713
+      "", "", "", "", "", "", "", "", "",
714
+      "", "", "", "", "", "", "", "", "",
715
+      "", "", "", "", "", "", "", "", "",
716
+      "", "", "", "", "", "", "",
717
+      "vi",
718
+      "", "", "", "", "", "", "", "", "",
719
+      "", "", "", "", "", "", "", "", "",
720
+      "", "", "", "", "", "", "", "", "",
721
+      "", "", "", "", "", "", "", "", "",
722
+      "", "", "", "", "", "", "", "", "",
723
+      "", "", "", "", "", "", "", "", "",
724
+      "et",
725
+      "", "", "", "", "", "", "", "", "",
726
+      "", "", "", "", "", "", "", "", "",
727
+      "", "", "", "", "", "", "", "", "",
728
+      "", "",
729
+      "dj"
730
+    };
731
+
732
+  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
733
+    {
734
+      int key = tld_hash (str, len);
735
+
736
+      if (key <= MAX_HASH_VALUE && key >= 0)
737
+        if (len == lengthtable[key])
738
+          {
739
+            const char *s = wordlist[key];
28 740
 
741
+            if (*str == *s && !memcmp (str + 1, s + 1, len - 1))
742
+              return s;
743
+          }
744
+    }
745
+  return 0;
746
+}
... ...
@@ -49,16 +49,6 @@ int domainlist_match(const struct cl_engine* engine,char* real_url,const char* d
49 49
 {
50 50
 	const char* info;
51 51
 	int rc = engine->domainlist_matcher ? regex_list_match(engine->domainlist_matcher,real_url,display_url,hostOnly ? pre_fixup : NULL,hostOnly,&info,0) : 0;
52
-	if(rc && info && info[0] && info[0] != ':') {/*match successful, and has custom flags*/
53
-		if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) {
54
-			unsigned short notwantedflags=0;
55
-			sscanf(info,"%hx",&notwantedflags);
56
-		        *flags &= ~notwantedflags;/* filter unwanted phishcheck flags */	
57
-		}
58
-		else {
59
-			cli_warnmsg("Phishcheck:Unknown flag format in domain-list, 3 hex digits expected");
60
-		}
61
-	}
62 52
 	return rc;
63 53
 }
64 54
 
... ...
@@ -79,13 +69,6 @@ int is_domainlist_ok(const struct cl_engine* engine)
79 79
 	return (engine && engine->domainlist_matcher) ? is_regex_ok(engine->domainlist_matcher) : 1;
80 80
 }
81 81
 
82
-void domainlist_cleanup(const struct cl_engine* engine)
83
-{
84
-	if(engine && engine->domainlist_matcher) {
85
-		regex_list_cleanup(engine->domainlist_matcher);
86
-	}
87
-}
88
-
89 82
 void domainlist_done(struct cl_engine* engine)
90 83
 {
91 84
 	if(engine && engine->domainlist_matcher) {
... ...
@@ -69,13 +69,6 @@ int is_whitelist_ok(const struct cl_engine* engine)
69 69
 	return (engine && engine->whitelist_matcher) ? is_regex_ok(engine->whitelist_matcher) : 1;
70 70
 }
71 71
 
72
-void whitelist_cleanup(const struct cl_engine* engine)
73
-{
74
-	if(engine && engine->whitelist_matcher) {
75
-		regex_list_cleanup(engine->whitelist_matcher);
76
-	}
77
-}
78
-
79 72
 void whitelist_done(struct cl_engine* engine)
80 73
 {
81 74
 	if(engine && engine->whitelist_matcher) {
... ...
@@ -39,6 +39,7 @@
39 39
 #include <ctype.h>
40 40
 
41 41
 #include "clamav.h"
42
+#include "cltypes.h"
42 43
 #include "others.h"
43 44
 #include "mbox.h"
44 45
 #include "message.h"
... ...
@@ -47,6 +48,7 @@
47 47
 #include "phish_domaincheck_db.h"
48 48
 #include "phish_whitelist.h"
49 49
 #include "iana_tld.h"
50
+#include "iana_cctld.h"
50 51
 
51 52
 
52 53
 #define DOMAIN_REAL 1
... ...
@@ -140,8 +142,6 @@ static char empty_string[]="";
140 140
 #define CLOAKED_URL "^"ANY_CLOAK"(\\."ANY_CLOAK"){0,3}$"
141 141
 
142 142
 static const char cloaked_host_regex[] = CLOAKED_URL;
143
-static const char tld_regex[] = "^"iana_tld"$";
144
-static const char cctld_regex[] = "^"iana_cctld"$";
145 143
 static const char dotnet[] = ".net";
146 144
 static const char adonet[] = "ado.net";
147 145
 static const char aspnet[] = "asp.net";
... ...
@@ -151,7 +151,10 @@ static const char gt[]="&gt";
151 151
 static const char src_text[] = "src";
152 152
 static const char href_text[] = "href";
153 153
 static const char mailto[] = "mailto:";
154
+static const char mailto_proto[] = "mailto://";
154 155
 static const char https[]="https://";
156
+static const char http[]="http://";
157
+static const char ftp[] = "ftp://";
155 158
 
156 159
 static const size_t href_text_len = sizeof(href_text);
157 160
 static const size_t src_text_len = sizeof(src_text);
... ...
@@ -161,7 +164,10 @@ static const size_t aspnet_len = sizeof(aspnet)-1;
161 161
 static const size_t lt_len = sizeof(lt)-1;
162 162
 static const size_t gt_len = sizeof(gt)-1;
163 163
 static const size_t mailto_len = sizeof(mailto)-1;
164
+static const size_t mailto_proto_len = sizeof(mailto_proto)-1;
164 165
 static const size_t https_len  = sizeof(https)-1;
166
+static const size_t http_len  = sizeof(http)-1;
167
+static const size_t ftp_len  = sizeof(ftp)-1;
165 168
 
166 169
 /* for urls, including mailto: urls, and (broken) http:www... style urls*/
167 170
 /* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html
... ...
@@ -169,41 +175,13 @@ static const size_t https_len  = sizeof(https)-1;
169 169
  * So the 'safe' char class has been split up
170 170
  * */
171 171
 /* character classes */
172
-#define URI_alpha	"a-zA-Z"
173 172
 #define URI_digit	"0-9"
174
-#define URI_safe_nodot  "-$_@&"
175
-#define URI_safe	"-$_@.&"
176
-#define URI_extra	"!*\"'(),"
177
-
178
-#define URI_hex		 "[0-9a-fA-f]"
179
-#define URI_escape      "%"URI_hex"{2}"
180
-#define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
181
-#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
182
-
183
-#define URI_xalphas_nodot URI_xalpha_nodot"*"
184
-
185
-#define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot""
186
-#define URI_xpalpha URI_xalpha"|\\+"
187
-#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
188
-#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
189
-
190
-#define URI_scheme URI_ialpha
191
-#define URI_tld iana_tld
192
-#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
193
-
194 173
 #define URI_IP_digits "["URI_digit"]{1,3}"
195 174
 #define URI_path_start "[/?:]?"
196 175
 #define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
197
-#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
176
+#define URI_numeric_URI "((http|https|ftp):(//)?)?"URI_numeric_path /* optional scheme prefix: the alternatives are grouped so that ':' and the optional "//" apply to every scheme, not only to ftp */
198 177
 #define URI_numeric_fragmentaddress URI_numeric_URI
199 178
 
200
-#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
201
-#define URI_URI2 URI_tld
202
-
203
-#define URI_fragmentaddress1 URI_URI1
204
-#define URI_fragmentaddress2 URI_URI2""URI_path_start
205
-
206
-#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
207 179
 
208 180
 /*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please.
209 181
  * there is fragmentaddress1, and 2  to work around the ISO limitation of 509 bytes max length for string constants*/
... ...
@@ -235,7 +213,6 @@ static int string_assign_concatenated(struct string* dest, const char* prefix, c
235 235
 static void string_assign_null(struct string* dest);
236 236
 static char *rfind(char *start, char c, size_t len);
237 237
 static char hex2int(const unsigned char* src);
238
-static int isTLD(const struct phishcheck* pchk,const char* str,int len);
239 238
 static enum phish_status phishingCheck(const struct cl_engine* engine,struct url_check* urls);
240 239
 static const char* phishing_ret_toString(enum phish_status rc);
241 240
 
... ...
@@ -416,7 +393,7 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p
416 416
 			}
417 417
 
418 418
 			tld = strrchr(realhost,'.');
419
-			rc = tld ? isTLD(s,tld,tld-realhost-1) : 0;
419
+			rc = tld ? !!in_tld_set(tld,tld-realhost-1) : 0;
420 420
 			if(rc < 0)
421 421
 				return rc;
422 422
 			if(rc)
... ...
@@ -438,28 +415,6 @@ static int get_host(const struct phishcheck* s,const char* URL,int isReal,int* p
438 438
 	return 0;
439 439
 }
440 440
 
441
-static int isCountryCode(const struct phishcheck* s,const char* str)
442
-{
443
-	return str ? !cli_regexec(&s->preg_cctld,str,0,NULL,0) : 0;
444
-}
445
-
446
-static int isTLD(const struct phishcheck* pchk,const char* str,int len)
447
-{
448
-	if (!str)
449
-		return 0;
450
-	else {
451
-		char*	s  = cli_malloc(len+1);
452
-		int rc;
453
-
454
-		if(!s)
455
-			return CL_EMEM;
456
-		strncpy(s,str,len);
457
-		s[len]='\0';
458
-		rc = !cli_regexec(&pchk->preg_tld,s,0,NULL,0);
459
-		free(s);
460
-		return rc ? 1 : 0;
461
-	}
462
-}
463 441
 
464 442
 /*
465 443
  * memrchr isn't standard, so I use this
... ...
@@ -486,7 +441,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct
486 486
 		string_assign(dest,host);
487 487
 		return;
488 488
 	}
489
-	if(isCountryCode(pchk,tld+1)) {
489
+	if(in_cctld_set(tld+1, strlen(tld+1))) {
490 490
 		const char* countrycode = tld+1;
491 491
 		tld = rfind(host->data,'.',tld-host->data-1);
492 492
 		if(!tld) {
... ...
@@ -495,7 +450,7 @@ static void get_domain(const struct phishcheck* pchk,struct string* dest,struct
495 495
 			string_assign(dest,host);
496 496
 			return;
497 497
 		}
498
-		if(!isTLD(pchk,tld+1,countrycode-tld-2)) {
498
+		if(!in_tld_set(tld+1, countrycode-tld-2)) {
499 499
 			string_assign_ref(dest,host,tld+1);
500 500
 			return;/*it was a name like: subdomain.domain.uk, return domain.uk*/
501 501
 		}
... ...
@@ -737,11 +692,7 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
737 737
 			/* @end points to last character we want to be part of the URL */
738 738
 			end = host_begin + host_len - 1;
739 739
 		}
740
-		/* terminate URL with a slash, except when we're at end of string */
741
-		if(host_begin[host_len]) {
742
-			host_begin[host_len] = '/';
743
-			end++;
744
-		}
740
+		host_begin[host_len] = '\0';
745 741
 		/* convert hostname to lowercase, but only hostname! */
746 742
 		str_make_lowercase(host_begin, host_len);
747 743
 		/* some broken MUAs put > in the href, and then
... ...
@@ -797,6 +748,40 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
797 797
 
798 798
 	if(!ctx->found_possibly_unwanted)
799 799
 		*ctx->virname=NULL;
800
+#if 0
801
+	FILE *f = fopen("/home/edwin/quarantine/urls","r");
802
+	if(!f)
803
+		abort();
804
+	while(!feof(f)) {
805
+		struct url_check urls;
806
+		char line1[4096];
807
+		char line2[4096];
808
+		char line3[4096];
809
+
810
+		fgets(line1, sizeof(line1), f);
811
+		fgets(line2, sizeof(line2), f);
812
+		fgets(line3, sizeof(line3), f);
813
+		if(strcmp(line3, "\n") != 0) {
814
+			strcpy(line1, line2);
815
+			strcpy(line2, line3);
816
+			fgets(line3, sizeof(line3), f);
817
+			while(strcmp(line3, "\n") != 0) {
818
+				fgets(line3, sizeof(line3),f);
819
+			}
820
+		}
821
+		urls.flags = CL_PHISH_ALL_CHECKS;
822
+		urls.link_type = 0;
823
+		string_init_c(&urls.realLink, line1);
824
+		string_init_c(&urls.displayLink, line2);
825
+		string_init_c(&urls.pre_fixup.pre_displayLink, NULL);
826
+		urls.realLink.refcount=-1;
827
+		urls.displayLink.refcount=-1;
828
+		int rc = phishingCheck(ctx->engine, &urls);
829
+		//printf("%d\n",rc);
830
+	}
831
+	fclose(f);
832
+	return 0;
833
+#endif
800 834
 	for(i=0;i<hrefs->count;i++)
801 835
 		if(hrefs->contents[i]) {
802 836
 			struct url_check urls;
... ...
@@ -928,44 +913,7 @@ int phishing_init(struct cl_engine* engine)
928 928
 		return CL_EFORMAT;
929 929
 	}
930 930
 
931
-	if(build_regex(&pchk->preg_cctld,cctld_regex,1)) {
932
-		free(pchk);
933
-		engine->phishcheck = NULL;
934
-		return CL_EFORMAT;
935
-	}
936
-	if(build_regex(&pchk->preg_tld,tld_regex,1)) {
937
-		free_regex(&pchk->preg_cctld);
938
-		free(pchk);
939
-		engine->phishcheck = NULL;
940
-		return CL_EFORMAT;
941
-	}
942
-	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
943
-	if(!url_regex || build_regex(&pchk->preg,url_regex,1)) {
944
-		free_regex(&pchk->preg_cctld);
945
-		free_regex(&pchk->preg_tld);
946
-		free(url_regex);
947
-		free(pchk);
948
-		engine->phishcheck = NULL;
949
-		return CL_EFORMAT;
950
-	}
951
-	free(url_regex);
952
-	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
953
-	if(!realurl_regex || build_regex(&pchk->preg_realurl, realurl_regex,1)) {
954
-		free_regex(&pchk->preg_cctld);
955
-		free_regex(&pchk->preg_tld);
956
-		free_regex(&pchk->preg);
957
-		free(url_regex);
958
-		free(realurl_regex);
959
-		free(pchk);
960
-		engine->phishcheck = NULL;
961
-		return CL_EFORMAT;
962
-	}
963
-	free(realurl_regex);
964 931
 	if(build_regex(&pchk->preg_numeric,numeric_url_regex,1)) {
965
-		free_regex(&pchk->preg_cctld);
966
-		free_regex(&pchk->preg_tld);
967
-		free_regex(&pchk->preg);
968
-		free_regex(&pchk->preg_realurl);
969 932
 		free(pchk);
970 933
 		engine->phishcheck = NULL;
971 934
 		return CL_EFORMAT;
... ...
@@ -980,12 +928,8 @@ void phishing_done(struct cl_engine* engine)
980 980
 	struct phishcheck* pchk = engine->phishcheck;
981 981
 	cli_dbgmsg("Cleaning up phishcheck\n");
982 982
 	if(pchk && !pchk->is_disabled) {
983
-		free_regex(&pchk->preg);
984 983
 		free_regex(&pchk->preg_hexurl);
985
-		free_regex(&pchk->preg_cctld);
986
-		free_regex(&pchk->preg_tld);
987 984
 		free_regex(&pchk->preg_numeric);
988
-		free_regex(&pchk->preg_realurl);
989 985
 		pchk->is_disabled = 1;
990 986
 	}
991 987
 	whitelist_done(engine);
... ...
@@ -998,22 +942,165 @@ void phishing_done(struct cl_engine* engine)
998 998
 	cli_dbgmsg("Phishcheck cleaned up\n");
999 999
 }
1000 1000
 
1001
+
1002
+/*ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*/
1003
+static const uint8_t URI_alpha[256] = {
1004
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1005
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1006
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1007
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1008
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1009
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
1010
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1011
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
1012
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1013
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1014
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1015
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1016
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1017
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1018
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1019
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1020
+};
1021
+
1022
+/*!"$%&'()*,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
1023
+static const uint8_t URI_xalpha_nodot[256] = {
1024
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1025
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1026
+        0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
1027
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1028
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1029
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
1030
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1031
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
1032
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1033
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1034
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1035
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1036
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1037
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1038
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1039
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1040
+};
1041
+
1042
+/*!"$%&'()*+,-0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz*/
1043
+static const uint8_t URI_xpalpha_nodot[256] = {
1044
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1045
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1046
+        0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1047
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1048
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1049
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
1050
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1051
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
1052
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1053
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1054
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1055
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1056
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1057
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1058
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1059
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1060
+};
1061
+
1062
+static inline int validate_uri_xalphas_nodot(const char *start, const char *end)
1063
+{
1064
+	const unsigned char *p = start;
1065
+	for(p=start;p < (const unsigned char*)end; p++) {
1066
+		if(!URI_xalpha_nodot[*p])
1067
+			return 0;
1068
+	}
1069
+	return 1;
1070
+}
1071
+
1072
+static inline int validate_uri_xpalphas_nodot(const char *start, const char *end)
1073
+{
1074
+	const unsigned char *p = start;
1075
+	for(p=start;p < (const unsigned char*)end; p++) {
1076
+		if(!URI_xpalpha_nodot[*p])
1077
+			return 0;
1078
+	}
1079
+	/* must have at least one char */
1080
+	return p > (const unsigned char*)start;
1081
+}
1082
+
1083
+
1084
+static inline int validate_uri_ialpha(const char *start, const char *end)
1085
+{
1086
+	const unsigned char *p = start;
1087
+	if(start >= end || !URI_alpha[*p])
1088
+		return 0;
1089
+	return validate_uri_xalphas_nodot(start + 1, end);
1090
+}
1091
+
1001 1092
 /*
1002 1093
  * Only those URLs are identified as URLs for which phishing detection can be performed.
1003 1094
  */
1004
-static int isURL(const struct phishcheck* pchk,const char* URL)
1095
+static int isURL(const struct phishcheck* pchk,const char* URL, int accept_anyproto)
1005 1096
 {
1006
-	return URL ? !cli_regexec(&pchk->preg,URL,0,NULL,0) : 0;
1097
+	const char *start = NULL, *p, *q;
1098
+	if(!URL)
1099
+		return 0;
1100
+
1101
+	switch (URL[0]) {
1102
+		case 'h':
1103
+			if (strncmp(URL, https, https_len) == 0)
1104
+				start = URL + https_len;
1105
+			else if (strncmp(URL, http, http_len) == 0)
1106
+				start = URL + http_len;
1107
+			break;
1108
+		case 'f':
1109
+		       if (strncmp(URL, ftp, ftp_len) == 0)
1110
+			       start = URL + ftp_len;
1111
+		       break;
1112
+		case 'm':
1113
+		       if (strncmp(URL, mailto_proto, mailto_proto_len) == 0)
1114
+			       start = URL + mailto_proto_len;
1115
+		       break;
1116
+	}
1117
+	if(start) {
1118
+		if(start[0] == '\0')
1119
+			return 0;/* empty URL */
1120
+		/* has a valid protocol, it is a URL */
1121
+		return 1;
1122
+	}
1123
+	start = accept_anyproto ?  strchr(URL, ':') : NULL;
1124
+	if(start) {
1125
+		/* validate URI scheme */
1126
+		if(validate_uri_ialpha(URL, start)) {
1127
+			if(start[1] == '/' && start[2] == '/')
1128
+				start += 3; /* skip :// */
1129
+			else
1130
+				start++;
1131
+		}
1132
+		else
1133
+			start = URL; /* scheme invalid */
1134
+	} else
1135
+		start = URL;
1136
+	p = start;
1137
+	do {
1138
+		q = strchr(p, '.');
1139
+		if(q) {
1140
+			if(!validate_uri_xpalphas_nodot(p, q))
1141
+				return 0;
1142
+			p = q+1;
1143
+		}
1144
+	} while(q);
1145
+	if (p == start) /* must have at least one dot in the URL */
1146
+		return 0;
1147
+	return !!in_tld_set(p, strlen(p));
1007 1148
 }
1008 1149
 
1009 1150
 /*
1010 1151
  * Check if this is a real URL, which basically means to check if it has a known URL scheme (http,https,ftp).
1011 1152
  * This prevents false positives with outbind:// and blocked:: links.
1012 1153
  */
1154
+#if 0
1013 1155
 static int isRealURL(const struct phishcheck* pchk,const char* URL)
1014 1156
 {
1015 1157
 	return URL ? !cli_regexec(&pchk->preg_realurl,URL,0,NULL,0) : 0;
1016 1158
 }
1159
+#endif
1017 1160
 
1018 1161
 static int isNumericURL(const struct phishcheck* pchk,const char* URL)
1019 1162
 {
... ...
@@ -1139,7 +1226,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1139 1139
 	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
1140 1140
 		urls->displayLink.data);
1141 1141
 
1142
-	if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) ) &&
1142
+	if((!isURL(pchk, urls->displayLink.data, 1) || !isURL(pchk, urls->realLink.data, 0) ) &&
1143 1143
 			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1144 1144
 			  !(phishy&PHISHY_NUMERIC_IP))) {
1145 1145
 		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
... ...
@@ -44,10 +44,6 @@ struct string {
44 44
 };
45 45
 
46 46
 struct phishcheck {
47
-	regex_t preg;
48
-	regex_t preg_realurl;
49
-	regex_t preg_tld;
50
-	regex_t preg_cctld;
51 47
 	regex_t preg_numeric;
52 48
 	regex_t preg_hexurl;
53 49
 	int      is_disabled;
... ...
@@ -1839,6 +1839,12 @@ int cl_build(struct cl_engine *engine)
1839 1839
 	}
1840 1840
     }
1841 1841
 
1842
+    if((ret = cli_build_regex_list(engine->whitelist_matcher))) {
1843
+	    return ret;
1844
+    }
1845
+    if((ret = cli_build_regex_list(engine->domainlist_matcher))) {
1846
+	    return ret;
1847
+    }
1842 1848
     cli_md5db_build(engine->md5_mdb);
1843 1849
     cli_freeign(engine);
1844 1850
     cli_dconf_print(engine->dconf);
... ...
@@ -42,6 +42,8 @@
42 42
 
43 43
 #include <limits.h>
44 44
 #include <sys/types.h>
45
+#include <assert.h>
46
+
45 47
 
46 48
 #include "regex/regex.h"
47 49
 
... ...
@@ -53,152 +55,471 @@
53 53
 #include "matcher.h"
54 54
 #include "str.h"
55 55
 #include "readdb.h"
56
+#include "jsparse/textbuf.h"
57
+
/* ------- parse a regular expression, and extract a static suffix ------*/

/* Kinds of node that make up the parsed-regex tree. */
enum node_type {
	root = 0,
	concat,		/* two sub-expressions in sequence */
	alternate,	/* | */
	optional,	/* ?, * */
	leaf,		/* a character */
	leaf_class	/* character class */
	/* (x)+ is transformed into (x)*(x) */
};
56 68
 
57
-/*Tree*/
58
-enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
59
-typedef unsigned char* char_bitmap_p;
60
-/*
61
- *
62
- * OP_CHAR: 1 character, c = character
63
- * complex stuff:
64
- * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
65
- * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
66
- * OP_DOT: single . matching any character except \n
67
- * OP_LEAF: this is a leaf node, reinterpret structure
68
- */
69
-struct tree_node {
70
-	struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
69
+struct node {
70
+	enum node_type type;
71
+	struct node *parent;
71 72
 	union {
72
-		struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
73
-		char_bitmap_p* bitmap;
74
-		struct leaf_info*  leaf;
73
+		struct {
74
+			struct node* left;
75
+			struct node* right;
76
+		} children;
77
+		uint8_t*    leaf_class_bitmap;
78
+		uint8_t     leaf_char;
75 79
 	} u;
76
-	enum token_op_t op;
77
-	unsigned char c;
78
-	char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
79
-	char listend;/* no more siblings, next pointer is pointer to parent*/
80 80
 };
81 81
 
82
-struct leaf_info {
83
-	char* info;/* what does it mean that we reached the leaf...*/
84
-	regex_t* preg;/* this is NULL if leaf node, and non-regex*/
85
-};
/* Prototypes */
/* Forward declarations for static functions that are referenced before their
 * definitions (build_suffixtree_descend is called from build_suffixtree_ascend). */
static size_t reverse_string(char *pattern);
static int add_pattern(struct regex_matcher *matcher,
		       char *pattern);
static int add_pattern_suffix(struct regex_matcher *matcher,
			      char *suffix, size_t suffix_len,
			      struct regex_list *regex);
static int add_static_pattern(struct regex_matcher *matcher,
			      char *pattern);
static int build_suffixtree_descend(struct regex_matcher *matcher,
				    struct regex_list *regex,
				    struct node *n,
				    struct text_buffer *buf);
/* ---------- */
86 89
 
87
-/* Character classes */
88
-static const char* std_class[] = {
89
-	"[:alnum:]",
90
-	"[:digit:]",
91
-	"[:punct:]",
92
-	"[:alpha:]",
93
-	"[:graph:]",
94
-	"[:space:]",
95
-	"[:blank:]",
96
-	"[:lower:]", 
97
-	"[:upper:]",
98
-	"[:cntrl:]",
99
-	"[:print:]",
100
-	"[:xdigit:]"
101
-	/* don't change the order of these strings, unless you change them in generate_tables.c too, and regenerate the tables*/
102
-};
/* Shared character-class bitmap used for '.' and for bracket expressions the
 * parser does not interpret; destroy_tree() never frees it.
 * NOTE(review): it is zero-initialized here - where it gets filled in is not
 * visible in this part of the file, confirm before relying on its contents. */
static uint8_t dot_bitmap[32];
103 91
 
92
+static struct node* make_node(enum node_type type, struct node *left, struct node *right)
93
+{
94
+	struct node *n;
95
+	if(type == concat) {
96
+		if(left == NULL)
97
+			return right;
98
+		if(right == NULL)
99
+			return left;
100
+	}
101
+	n = cli_malloc(sizeof(*n));
102
+	if(!n)
103
+		return NULL;
104
+	n->type = type;
105
+	n->parent = NULL;
106
+	n->u.children.left = left;
107
+	n->u.children.right = right;
108
+	if(left)
109
+		left->parent = n;
110
+	if(right)
111
+		right->parent = n;
112
+	return n;
113
+}
104 114
 
105
-#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0])
106
-
107
-/* generated by contrib/phishing/generate_tables.c */
108
-static const unsigned char char_class_bitmap[STD_CLASS_CNT][32] = {
109
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
110
-         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
111
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
112
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
113
-
114
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
115
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
116
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
117
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
118
-
119
-        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 
120
-         0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x78, 
121
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
122
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
123
-
124
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
125
-         0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07, 
126
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
127
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
128
-
129
-        {0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 
130
-         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
131
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
132
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
133
-
134
-        {0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
135
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
136
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
137
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
138
-
139
-        {0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 
140
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
141
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
142
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
143
-
144
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
145
-         0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 
146
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
147
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
148
-
149
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
150
-         0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 
151
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
152
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
153
-
154
-        {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
155
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 
156
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
157
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
158
-
159
-        {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 
160
-         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 
161
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
162
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
163
-
164
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 
165
-         0x7e, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 
166
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
167
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
168
-};
115
+static struct node *dup_node(struct node *p)
116
+{
117
+	struct node *node_left, *node_right;
118
+	struct node *d;
169 119
 
170
-static const unsigned short int char_class[256] = {
171
-        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x260, 0x220, 0x220, 0x220, 0x220, 0x200, 0x200, 
172
-        0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 0x200, 
173
-        0x460, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
174
-        0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0xc13, 0x414, 0x414, 0x414, 0x414, 0x414, 0x414, 
175
-        0x414, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0xd19, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 
176
-        0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x519, 0x414, 0x414, 0x414, 0x414, 0x414, 
177
-        0x414, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0xc99, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 
178
-        0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x499, 0x414, 0x414, 0x414, 0x414, 0x200, 
179
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
180
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
181
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
182
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
183
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
184
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
185
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 
186
-        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000
187
-};
120
+	if(!p)
121
+		return NULL;
122
+	d = cli_malloc(sizeof(*d));
123
+	if(!d)
124
+		return NULL;
125
+	d->type = p->type;
126
+	d->parent = NULL;
127
+	switch(p->type) {
128
+		case leaf:
129
+			d->u.leaf_char = p->u.leaf_char;
130
+			break;
131
+		case leaf_class:
132
+			d->u.leaf_class_bitmap = cli_malloc(32);
133
+			if(!d->u.leaf_class_bitmap)
134
+				return NULL;
135
+			memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
136
+			break;
137
+		default:
138
+			node_left = dup_node(p->u.children.left);
139
+			node_right = dup_node(p->u.children.right);
140
+			d->u.children.left = node_left;
141
+			d->u.children.right = node_right;
142
+			if(node_left)
143
+				node_left->parent = d;
144
+			if(node_right)
145
+				node_right->parent = d;
146
+			break;
147
+	}
148
+	return d;
149
+}
188 150
 
189
-static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
151
+static struct node *make_charclass(uint8_t *bitmap)
152
+{
153
+	struct node *v = cli_malloc(sizeof(*v));
154
+	if(!v)
155
+		return NULL;
156
+	v->type = leaf_class;
157
+	v->parent = NULL;
158
+	v->u.leaf_class_bitmap = bitmap;
159
+	return v;
160
+}
161
+
162
+static struct node *make_leaf(char c)
163
+{
164
+	struct node *v = cli_malloc(sizeof(*v));
165
+	if(!v)
166
+		return NULL;
167
+	v->type = leaf;
168
+	v->parent = NULL;
169
+	v->u.leaf_char = c;
170
+	return v;
171
+}
172
+
173
+static void destroy_tree(struct node *n)
174
+{
175
+	if(!n)
176
+		return;
177
+	switch(n->type) {
178
+		case concat:
179
+		case alternate:
180
+		case optional:
181
+			destroy_tree(n->u.children.left);
182
+			destroy_tree(n->u.children.right);
183
+			break;
184
+		case leaf_class:
185
+			if(n->u.leaf_class_bitmap != dot_bitmap)
186
+			  free(n->u.leaf_class_bitmap);
187
+			break;
188
+		case root:
189
+		case leaf:
190
+			break;
191
+	}
192
+	free(n);
193
+}
194
+
195
+static uint8_t* parse_char_class(const char *pat, size_t *pos)
196
+{
197
+	unsigned char range_start=0;
198
+	int hasprev = 0;
199
+	uint8_t* bitmap = cli_malloc(32);
200
+	if(!bitmap)
201
+		return NULL;
202
+	if (pat[*pos]=='^') {
203
+		memset(bitmap,0xFF,32);/*match chars not in brackets*/
204
+		++*pos;
205
+	}
206
+	else
207
+		memset(bitmap,0x00,32);
208
+	do {
209
+		/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
210
+		if (pat[*pos]=='-' && hasprev) {
211
+			/* it is a range*/
212
+			unsigned char range_end;
213
+			unsigned int c;
214
+			assert(range_start);
215
+			++*pos;
216
+			if (pat[*pos]=='[')
217
+				if (pat[*pos+1]=='.') {
218
+					/* collating sequence not handled */
219
+					free(bitmap);
220
+					/* we are parsing the regex for a
221
+					 * filter, be conservative and
222
+					 * tell the filter that anything could
223
+					 * match here */
224
+					while(pat[*pos] != ']') ++*pos;
225
+					++*pos;
226
+					while(pat[*pos] != ']') ++*pos;
227
+					return dot_bitmap;
228
+				}
229
+				else
230
+					range_end = pat[*pos];
231
+			else
232
+				range_end = pat[*pos];
233
+			for(c=range_start+1;c<=range_end;c++)
234
+				bitmap[c>>3] ^= 1<<(c&0x7);
235
+			hasprev = 0;
236
+		}
237
+		else if (pat[*pos]=='[' && pat[*pos]==':') {
238
+			/* char class */
239
+			free(bitmap);
240
+			while(pat[*pos] != ']') ++*pos;
241
+			++*pos;
242
+			while(pat[*pos] != ']') ++*pos;
243
+			return dot_bitmap;
244
+		} else {
245
+			bitmap[pat[*pos]>>3] ^= 1<<(pat[*pos]&0x7);
246
+			++*pos;
247
+			range_start = pat[*pos];
248
+			hasprev = 1;
249
+		}
250
+	} while(pat[*pos]!=']');
251
+	return bitmap;
252
+}
253
+
254
+static struct node* parse_regex(const char *p, size_t *last)
255
+{
256
+	struct node *v = NULL;
257
+	struct node *right;
258
+	struct node *tmp;
259
+
260
+	while(p[*last] != '$' && p[*last] != '\0') {
261
+		switch(p[*last]) {
262
+			case '|':
263
+				++*last;
264
+				right = parse_regex(p, last);
265
+				v = make_node(alternate, v, right);
266
+				if(!v)
267
+					return NULL;
268
+				break;
269
+			case '*':
270
+			case '?':
271
+				v = make_node(optional, v, NULL);
272
+				if(!v)
273
+					return NULL;
274
+				++*last;
275
+				break;
276
+			case '+':
277
+				/* (x)* */
278
+				tmp = make_node(optional, v, NULL);
279
+				if(!tmp)
280
+					return NULL;
281
+				/* (x) */
282
+				right = dup_node(v);
283
+				if(!right)
284
+					return NULL;
285
+				/* (x)*(x) => (x)+ */
286
+				v = make_node(concat, tmp, right);
287
+				if(!v)
288
+					return NULL;
289
+				++*last;
290
+				break;
291
+			case '(':
292
+				++*last;
293
+				right = parse_regex(p, last);
294
+				if(!right)
295
+					return NULL;
296
+				++*last;
297
+				v = make_node(concat, v, right);
298
+				break;
299
+			case ')':
300
+				return v;
301
+			case '.':
302
+				right = make_charclass(dot_bitmap);
303
+				if(!right)
304
+					return NULL;
305
+				v = make_node(concat, v, right);
306
+				if(!v)
307
+					return NULL;
308
+				++*last;
309
+				break;
310
+			case '[':
311
+				right = make_charclass( parse_char_class(p, last) );
312
+				if(!right)
313
+					return NULL;
314
+				v = make_node(concat, v, right);
315
+				if(!v)
316
+					return NULL;
317
+			case '\\':
318
+				/* next char is escaped, advance pointer
319
+				 * and let fall-through handle it */
320
+				++*last;
321
+			default:
322
+				right = make_leaf(p[*last]);
323
+				v = make_node(concat, v, right);
324
+				if(!v)
325
+					return NULL;
326
+				++*last;
327
+				break;
328
+		}
329
+	}
330
+	return v;
331
+}
332
+
333
+#define BITMAP_HASSET(b, i) (b[i>>3] & (1<<(i&7)))
334
+
335
+static int build_suffixtree_ascend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf, struct node *prev)
336
+{
337
+	size_t i;
338
+	while(n) {
339
+		struct node *q = n;
340
+		switch(n->type) {
341
+			case root:
342
+				textbuffer_putc(buf, '\0');
343
+				if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
344
+					return CL_EMEM;
345
+				return 0;
346
+			case leaf:
347
+				textbuffer_putc(buf, n->u.leaf_char);
348
+				n = n->parent;
349
+				break;
350
+			case leaf_class:
351
+				if(memcmp(n->u.leaf_class_bitmap, dot_bitmap, sizeof(dot_bitmap)) == 0) {
352
+					textbuffer_putc(buf, '\0');
353
+					if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
354
+						return CL_EMEM;
355
+					return 0;
356
+				}
357
+				for(i=0;i<255;i++) {
358
+					if(BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
359
+						size_t pos;
360
+						pos = buf->pos;
361
+						textbuffer_putc(buf, i);
362
+						if(build_suffixtree_ascend(matcher, regex, n->parent, buf, n) < 0)
363
+							return CL_EMEM;
364
+						buf->pos = pos;
365
+					}
366
+				}
367
+				return 0;
368
+			case concat:
369
+				if(prev != n->u.children.left) {
370
+					if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0)
371
+						return CL_EMEM;
372
+					/* we're done here, descend will call
373
+					 * ascend if needed */
374
+					return 0;
375
+				} else {
376
+					n = n->parent;
377
+				}
378
+				break;
379
+			case alternate:
380
+				n = n->parent;
381
+				break;
382
+			case optional:
383
+				textbuffer_putc(buf, '\0');
384
+				if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
385
+					return CL_EMEM;
386
+				return 0;
387
+		}
388
+		prev = q;
389
+	}
390
+	return 0;
391
+}
392
+
393
+static int build_suffixtree_descend(struct regex_matcher *matcher, struct regex_list *regex, struct node *n, struct text_buffer *buf)
394
+{
395
+	size_t pos;
396
+	while(n && n->type == concat) {
397
+		n = n->u.children.right;
398
+	}
399
+	if(!n)
400
+		return 0;
401
+	/* find out end of the regular expression,
402
+	 * if it ends with a static pattern */
403
+	switch(n->type) {
404
+		case alternate:
405
+			/* save pos as restart point */
406
+			pos = buf->pos;
407
+			if(build_suffixtree_descend(matcher, regex, n->u.children.left, buf) < 0)
408
+				return CL_EMEM;
409
+			buf->pos = pos;
410
+			if(build_suffixtree_descend(matcher, regex, n->u.children.right, buf) < 0)
411
+				return CL_EMEM;
412
+			buf->pos = pos;
413
+			break;
414
+		case optional:
415
+			textbuffer_putc(buf, '\0');
416
+			if(add_pattern_suffix(matcher, buf->data, buf->pos, regex) < 0)
417
+				return CL_EMEM;
418
+			return 0;
419
+		case leaf:
420
+		case leaf_class:
421
+			if(build_suffixtree_ascend(matcher, regex, n, buf, NULL) < 0)
422
+			        return CL_EMEM;
423
+			return 0;
424
+		default:
425
+			break;
426
+	}
427
+	return 0;
428
+}
429
+
430
+
/* ----- shift-or filtering -------------- */

/* Bit set stored in an array of 32-bit words: bit `val` lives in word val/32.
 * The mask is built from an unsigned 1 so that bit 31 is well defined. */
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1u << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1u << ((val) & 0x1f)))
435
+
436
+static void SO_init(struct filter *m)
437
+{
438
+	memset(m->B, ~0, sizeof(m->B));
439
+	memset(m->end, ~0, sizeof(m->end));
440
+	memset(m->end_fast, ~0, sizeof(m->end_fast));
441
+}
442
+
443
+/* because we use uint32_t */
444
+#define MAXSOPATLEN 32
445
+
446
+/* merge another pattern into the filter
447
+ * add('abc'); add('bcd'); will match [ab][bc][cd] */
448
+static int SO_preprocess_add(struct filter *m, const unsigned char *pattern, size_t len)
449
+{
450
+	uint16_t q;
451
+	uint8_t j;
452
+
453
+	/* cut length, and make it modulo 2 */
454
+	if(len > MAXSOPATLEN) {
455
+		len = MAXSOPATLEN;
456
+	} else {
457
+		/* we use 2-grams, must be multiple of 2 */
458
+		len = len & ~1;
459
+	}
460
+	if(!len)
461
+		return 0;
462
+
463
+	/* Shift-Or like preprocessing */
464
+	for(j=0;j < len-1;j++) {
465
+		/* use overlapping 2-grams. We need them overlapping because matching can start at any position */
466
+		q = cli_readint16( &pattern[j] );
467
+		m->B[q] &= ~(1 << j);
468
+	}
469
+	/* we use variable length patterns, use last character to mark pattern end,
470
+	 * can lead to false positives.*/
471
+	/* mark that at state j, the q-gram q can end the pattern */
472
+	if(j) {
473
+		j--;
474
+		m->end[q] &= ~(1 << j);
475
+		m->end_fast[pattern[j]] &= (1<<j);
476
+	}
477
+	return 0;
478
+}
479
+
480
+/* this is like a FSM, with multiple active states at the same time.
481
+ * each bit in "state" means an active state, when a char is encountered
482
+ * we determine what states can remain active.
483
+ * The FSM transition rules are expressed as bit-masks */
484
+static long SO_search(const struct filter *m, const unsigned char *data, unsigned long len)
485
+{
486
+	size_t j;
487
+	uint32_t state = ~0;
488
+	const uint32_t *B = m->B;
489
+	const uint32_t *End = m->end;
490
+	const uint32_t *EndFast = m->end_fast;
491
+
492
+	if(!len) return -1;
493
+	/* Shift-Or like search algorithm */
494
+	for(j=0;j < len-1; j++) {
495
+		const uint16_t q0 = cli_readint16( &data[j] );
496
+		uint32_t match_end;
497
+		state = (state << 1) | B[q0];
498
+		/* state marks with a 0 bit all active states
499
+		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
500
+		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
501
+		match_end = state | EndFast[data[j+1]];
502
+		if((match_end != 0xffffffff) && (state | End[q0]) !=  0xffffffff) {
503
+			/* note: we rely on short-circuit eval here, we only evaluate and fetch End[q0], if
504
+			 * end_fast has matched. This reduces cache pressure on End[], and allows us to keep the working
505
+			 * set inside L2 */
506
+
507
+			/* if state is reachable, and this character can finish a pattern, assume match */
508
+			/* to reduce false positives check if qgram can finish the pattern */
509
+			/* return position of probable match */
510
+			/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
511
+			 * longest pattern that could match */
512
+			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
513
+		}
514
+	}
515
+	/* no match */
516
+	return -1;
517
+}
518
+
519
+/* ----------------------------------------------------------- */
190 520
 
191
-/* Prototypes */
192
-static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info,int hostOnly);
193
-static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
194
-static void destroy_tree(struct regex_matcher* matcher);
195
-static struct tree_node* tree_root_alloc(void);
196
-static int build_regex_list(struct regex_matcher* matcher);
197
-static void stack_destroy(struct node_stack* stack);
198
-
199
-#ifndef NDEBUG
200
-void dump_tree(struct tree_node* root);
201
-#endif
202 521
 
203 522
 #define MATCH_SUCCESS 0 
204 523
 #define MATCH_FAILED  -1
... ...
@@ -233,6 +554,43 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info
233 233
 	return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
234 234
 }
235 235
 
236
+static int validate_subdomain(const struct regex_list *regex, const struct pre_fixup_info *pre_fixup, const char *buffer, size_t buffer_len, char *real_url, size_t real_len, char *orig_real_url)
237
+{
238
+	char c;
239
+	const char *matched;
240
+	size_t match_len;
241
+
242
+	if(!regex || !regex->pattern)
243
+		return 0;
244
+	match_len = strlen(regex->pattern);
245
+	if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
246
+			(match_len == buffer_len || /* full match */
247
+			 (match_len < buffer_len &&
248
+			  ((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
249
+			 /* subdomain matched*/)) {
250
+		cli_dbgmsg("Got a match: %s with %s\n", buffer, regex->pattern);
251
+		cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
252
+		if(real_len >= match_len + 1) {
253
+			const size_t pos = real_len - match_len - 1;
254
+			if(real_url[pos] != '.') {
255
+				/* we need to shift left, and insert a '.'
256
+				 * we have an extra '.' at the beginning inserted by get_host to have room,
257
+				 * orig_real_url has to be used here, 
258
+				 * because we want to overwrite that extra '.' */
259
+				size_t orig_real_len = strlen(orig_real_url);
260
+				cli_dbgmsg("No dot here:%s\n",real_url+pos);
261
+				real_url = orig_real_url;
262
+				memmove(real_url, real_url+1, orig_real_len-match_len-1);
263
+				real_url[orig_real_len-match_len-1]='.';
264
+				cli_dbgmsg("After inserting .: %s\n", real_url);
265
+			}
266
+		}
267
+		return 1;
268
+	}
269
+	cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, regex->pattern, c);
270
+	return 0;
271
+}
272
+
236 273
 /*
237 274
  * @matcher - matcher structure to use
238 275
  * @real_url - href target
... ...
@@ -246,24 +604,28 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info
246 246
  * Do not send NULL pointers to this function!!
247 247
  *
248 248
  */
249
-int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
249
+int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char **info, int is_whitelist)
250 250
 {
251 251
 	char* orig_real_url = real_url;
252
-	massert(matcher);
253
-	massert(real_url);
254
-	massert(display_url);
255
-	massert(info);
252
+	const char *vinfo;
253
+	struct regex_list *regex;
254
+
255
+	assert(matcher);
256
+	assert(real_url);
257
+	assert(display_url);
258
+	*info = NULL;
256 259
 	if(!matcher->list_inited)
257 260
 		return 0;
258
-	massert(matcher->list_built);
261
+	assert(matcher->list_built);
259 262
 	/* skip initial '.' inserted by get_host */
260 263
 	if(real_url[0] == '.') real_url++;
261 264
 	if(display_url[0] == '.') display_url++;
262 265
 	{
263 266
 		size_t real_len    = strlen(real_url);
264 267
 		size_t display_len = strlen(display_url);
265
-		size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len : real_len + display_len + 1 + (is_whitelist ? 1 : 0);
266
-		char*  buffer = cli_malloc(buffer_len+1);
268
+		size_t buffer_len  = (hostOnly && !is_whitelist) ? real_len + 1 : real_len + display_len + 1 + 1;
269
+		char *buffer = cli_malloc(buffer_len+1);
270
+		char *bufrev;
267 271
 		size_t i;
268 272
 		int rc = 0;
269 273
 		struct cli_ac_data mdata;
... ...
@@ -272,61 +634,48 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
272 272
 			return CL_EMEM;
273 273
 
274 274
 		strncpy(buffer,real_url,real_len);
275
-		buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
275
+		buffer[real_len]= (!is_whitelist && hostOnly) ? '/' : ':';
276 276
 		if(!hostOnly || is_whitelist) {
277 277
 			strncpy(buffer+real_len+1,display_url,display_len);
278
-			if(is_whitelist)
279
-				buffer[buffer_len - 1] = '/';
280
-			buffer[buffer_len]=0;
281 278
 		}
279
+		buffer[buffer_len - 1] = '/';
280
+		buffer[buffer_len]=0;
282 281
 		cli_dbgmsg("Looking up in regex_list: %s\n", buffer);
283 282
 
284
-		if(hostOnly) {
285
-			if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
286
-				return rc;
287
-			rc = 0;
288
-
289
-			for(i = 0; i < matcher->root_hosts_cnt; i++) {
290
-				/* doesn't need to match terminating \0*/
291
-				rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
292
-				cli_ac_freedata(&mdata);
293
-				if(rc) {
294
-					char c;
295
-					const char* matched = strchr(*info,':');
296
-					const size_t match_len = matched ? strlen(matched+1) : 0;
297
-					if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
298
-						(match_len == buffer_len || /* full match */
299
-					        (match_len < buffer_len &&
300
-						((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
301
-						/* subdomain matched*/)) {
302
-
303
-						cli_dbgmsg("Got a match: %s with %s\n", buffer, *info);
304
-						cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
305
-						if(real_len >= match_len + 1) {
306
-							const size_t pos = real_len - match_len - 1;
307
-							if(real_url[pos] != '.') {
308
-								/* we need to shift left, and insert a '.'
309
-								 * we have an extra '.' at the beginning inserted by get_host to have room,
310
-								 * orig_real_url has to be used here, 
311
-								 * because we want to overwrite that extra '.' */
312
-								size_t orig_real_len = strlen(orig_real_url);
313
-								cli_dbgmsg("No dot here:%s\n",real_url+pos);
314
-								real_url = orig_real_url;
315
-								memmove(real_url, real_url+1, orig_real_len-match_len-1);
316
-								real_url[orig_real_len-match_len-1]='.';
317
-								cli_dbgmsg("After inserting .: %s\n", real_url);
318
-							}
319
-						}
320
-						break;
321
-					}
322
-					cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c);
323
-					rc=0;
283
+		if((rc = cli_ac_initdata(&mdata, 0, AC_DEFAULT_TRACKLEN)))
284
+			return rc;
285
+
286
+		bufrev = cli_strdup(buffer);
287
+		if(!bufrev)
288
+			return CL_EMEM;
289
+		reverse_string(bufrev);
290
+		rc = SO_search(&matcher->filter, (const unsigned char*)bufrev, buffer_len) != -1;
291
+		if(!rc) {
292
+			/* filter says this suffix doesn't match.
293
+			 * The filter has false positives, but no false
294
+			 * negatives */
295
+			return 0;
296
+		}
297
+
298
+		rc = cli_ac_scanbuff((unsigned char*)bufrev,buffer_len, &vinfo, &matcher->suffixes,&mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL);
299
+		cli_ac_freedata(&mdata);
300
+
301
+		if(rc) {
302
+			/* TODO loop over multiple virusnames here */
303
+			regex = (struct regex_list*)vinfo;
304
+			do {
305
+				/* loop over multiple regexes corresponding to
306
+				 * this suffix */
307
+				if (!regex->preg.re_magic) {
308
+					/* we matched a static pattern */
309
+					rc = validate_subdomain(regex, pre_fixup, buffer, buffer_len, real_url, real_len, orig_real_url);
310
+				} else {
311
+					rc = !cli_regexec(&regex->preg, buffer, 0, NULL, 0);
324 312
 				}
325
-			}
326
-		} else
327
-			rc = 0;
328
-		if(!rc)
329
-			rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
313
+				if(rc) *info = regex->pattern;
314
+				regex = regex->nxt;
315
+			 } while(!rc && regex);
316
+		}
330 317
 		free(buffer);
331 318
 		if(!rc)
332 319
 			cli_dbgmsg("Lookup result: not in regex list\n");
... ...
@@ -336,56 +685,6 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
336 336
 	}
337 337
 }
338 338
 
339
-/* node stack */
340
-#define NODE_STACK_INITIAL 1024
341
-#define NODE_STACK_GROW    4096
342
-/* Initialize @stack */
343
-static int stack_init(struct node_stack* stack)
344
-{
345
-	massert(stack);
346
-
347
-	stack->cnt = 0;
348
-	stack->capacity = NODE_STACK_INITIAL;
349
-	stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
350
-	if(!stack->data)
351
-		return CL_EMEM;
352
-	else
353
-		return CL_SUCCESS;
354
-}
355
-
356
-/* Reset @stack pointer, but don't realloc */
357
-static void stack_reset(struct node_stack* stack)
358
-{
359
-	massert(stack);
360
-
361
-	stack->cnt = 0;
362
-}
363
-
364
-/* Push @node on @stack, growing it if necessarry */
365
-static int stack_push(struct node_stack* stack,struct tree_node* node)
366
-{
367
-	massert(stack);
368
-	massert(stack->data);
369
-
370
-	if(stack->cnt == stack->capacity) {
371
-		stack->capacity += NODE_STACK_GROW;
372
-		stack->data = cli_realloc2(stack->data,stack->capacity*sizeof(*stack->data));
373
-		if(!stack->data)
374
-			return CL_EMEM;
375
-	}
376
-	stack->data[stack->cnt++] = node;
377
-	return CL_SUCCESS;
378
-}
379
-
380
-/* Pops node from @stack, doesn't realloc */
381
-static struct tree_node* stack_pop(struct node_stack* stack)
382
-{
383
-	massert(stack);
384
-	massert(stack->data);
385
-	massert(stack->cnt);/*don't pop from empty stack */
386
-
387
-	return stack->cnt ? stack->data[--stack->cnt] : NULL;
388
-}
389 339
 
390 340
 /* Initialization & loading */
391 341
 /* Initializes @matcher, allocating necessary substructures */
... ...
@@ -393,90 +692,21 @@ int init_regex_list(struct regex_matcher* matcher)
393 393
 {
394 394
 	int rc;
395 395
 
396
-	massert(matcher);
397
-	matcher->list_inited = 0;
398
- 	matcher->root_hosts_cnt = 0;
399
- 	matcher->root_hosts = NULL;
400
- 	matcher->root_hosts_cnt = 0;
401
-
402
-	matcher->root_regex = tree_root_alloc();
403
-	if(!matcher->root_regex) {
404
-		return CL_EMEM;
405
-	}
406
-
407
-	matcher->root_regex_hostonly = tree_root_alloc();
408
-	if(!matcher->root_regex_hostonly) {
409
-		free(matcher->root_regex);
410
-		return CL_EMEM;
411
-	}
412
-
413
-	if(( rc = stack_init(&matcher->node_stack) )) {
414
-		free(matcher->root_regex_hostonly);
415
-		free(matcher->root_regex);
416
-		return rc;
417
-	}
418
-	if(( rc = stack_init(&matcher->node_stack_alt) )) {
419
-		free(matcher->root_regex_hostonly);
420
-		free(matcher->root_regex);
421
-		stack_destroy(&matcher->node_stack);
422
-		return rc;
423
-	}
396
+	assert(matcher);
397
+	memset(matcher, 0, sizeof(*matcher));
424 398
 
425 399
 	matcher->list_inited=1;
426
-	matcher->list_built=1;/* its empty, but pretend its built, so that load_ will realloc root_hosts */
400
+	matcher->list_built=0;
427 401
 	matcher->list_loaded=0;
428 402
 
403
+	hashtab_init(&matcher->suffix_hash, 10);
404
+	if((rc = cli_ac_init(&matcher->suffixes, 2, 32))) {
405
+		return rc;
406
+	}
407
+	SO_init(&matcher->filter);
429 408
 	return CL_SUCCESS;
430 409
 }
431 410
 
432
-/* inserts @pattern into @root, using ac-matcher 
433
- * although the name might be confusing, @pattern is not a regex!*/
434
-static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
435
-{
436
-       int ret;
437
-       struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
438
-       size_t len,i;
439
-
440
-       if(!new)
441
-	       return CL_EMEM;
442
-       massert(root);
443
-       massert(pattern);
444
-
445
-       len = strlen(pattern);
446
-       /* need not to match \0 too */
447
-       new->rtype = 0;
448
-       new->type = 0;
449
-       new->sigid = 0;
450
-       new->parts = 0;
451
-       new->partno = 0;
452
-       new->mindist = 0;
453
-       new->maxdist = 0;
454
-       new->offset = 0;
455
-       new->target = 0;
456
-       new->length = len;
457
-       new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
458
-       if(new->length > root->maxpatlen)
459
-               root->maxpatlen = new->length;
460
-
461
-       new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
462
-       if(!new->pattern) {
463
-	       free(new);
464
-	       return CL_EMEM;
465
-       }
466
-       for(i=0;i<len;i++)
467
-	       new->pattern[i]=pattern[i];/*new->pattern is short int* */
468
-
469
-	
470
-       new->virname = cli_strdup(info);
471
-       if((ret = cli_ac_addpatt(root,new))) {
472
-	       free(new->virname);
473
-               free(new->pattern);
474
-               free(new);
475
-               return ret;
476
-       }
477
-       return CL_SUCCESS;
478
-}
479
-
480 411
 static int functionality_level_check(char* line)
481 412
 {
482 413
 	char* ptmin;
... ...
@@ -527,14 +757,10 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio
527 527
 	int rc,line=0;
528 528
 	char buffer[FILEBUFF];
529 529
 
530
-	massert(matcher);
530
+	assert(matcher);
531 531
 
532 532
 	if(matcher->list_inited==-1)
533 533
 		return CL_EMALFDB; /* already failed to load */
534
-/*	if(matcher->list_loaded) {
535
-		cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
536
-		return CL_SUCCESS;
537
-	}*/
538 534
 	if(!fd && !dbio) {
539 535
 		cli_errmsg("Unable to load regex list (null file)\n");
540 536
 		return CL_EIO;
... ...
@@ -548,7 +774,6 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio
548 548
 			fatal_error(matcher);
549 549
 			return rc;
550 550
 		}
551
-		/*atexit(regex_list_done); TODO: destroy this in manager.c */
552 551
 	}
553 552
 	/*
554 553
 	 * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
... ...
@@ -573,11 +798,13 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio
573 573
 	while(cli_dbgets(buffer, FILEBUFF, fd, dbio)) {
574 574
 		char* pattern;
575 575
 		char* flags;
576
+		size_t pattern_len;
577
+
576 578
 		cli_chomp(buffer);
577 579
 		if(!*buffer)
578 580
 			continue;/* skip empty lines */
579 581
 
580
-		if(functionality_level_check(buffer)) 
582
+		if(functionality_level_check(buffer))
581 583
 			continue;
582 584
 
583 585
 		line++;
... ...
@@ -591,83 +818,39 @@ int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int optio
591 591
 		flags = buffer+1;
592 592
 		pattern++;
593 593
 
594
-		if(is_whitelist) {
595
-			const size_t pattern_len = strlen(pattern);
596
-			if(pattern_len < FILEBUFF) {
597
-				pattern[pattern_len] = '/';
598
-				pattern[pattern_len+1] = '\0';
599
-			}
600
-			else {
601
-				cli_errmsg("Overlong regex line %d\n",line);
602
-				fatal_error(matcher);
603
-				return CL_EMALFDB;
604
-			}
594
+		pattern_len = strlen(pattern);
595
+		if(pattern_len < FILEBUFF) {
596
+			pattern[pattern_len] = '/';
597
+			pattern[pattern_len+1] = '\0';
598
+		}
599
+		else {
600
+			cli_errmsg("Overlong regex line %d\n",line);
601
+			fatal_error(matcher);
602
+			return CL_EMALFDB;
605 603
 		}
606 604
 
607
-		if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {/*regex*/
608
-			if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags, buffer[0] == 'Y') ))
605
+		if((buffer[0] == 'R' && !is_whitelist) || ((buffer[0] == 'X' || buffer[0] == 'Y') && is_whitelist)) {
606
+			/* regex for hostname*/
607
+			if (( rc = add_pattern(matcher, pattern) ))
609 608
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
610 609
 		}
611
-		else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {/*matches displayed host*/
612
-			struct cli_matcher* root;
613
- 			if(matcher->list_built) {
614
- 				struct cli_matcher* old_hosts = matcher->root_hosts;
615
- 				matcher->root_hosts_cnt++;
616
- 
617
- 				matcher->root_hosts = cli_realloc(matcher->root_hosts, matcher->root_hosts_cnt * sizeof(*matcher->root_hosts));
618
- 				if(!matcher->root_hosts) {
619
- 					matcher->root_hosts = old_hosts;/* according to manpage this must still be valid*/
620
- 					return CL_EMEM;
621
-				} 
622
-
623
-				root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
624
- 				memset(root, 0, sizeof(struct cli_matcher));
625
-
626
-				cli_dbgmsg("regex_list: Initialising AC pattern matcher\n");
627
-				if((rc = cli_ac_init(root, cli_ac_mindepth, cli_ac_maxdepth))) {
628
-					/* no need to free previously allocated memory here */
629
-					cli_errmsg("regex_list: Can't initialise AC pattern matcher\n");
630
-					return rc;
631
-				}
632
- 				matcher->list_built = 0;
633
- 			}
634
-			else {
635
-				root = &matcher->root_hosts[matcher->root_hosts_cnt-1];
636
-			}
637
- 			if(( rc = add_regex_list_element(root,pattern,flags) ))
610
+		else if( ( buffer[0] == 'H' && !is_whitelist) || (buffer[0] == 'M' && is_whitelist)) {
611
+			/*matches displayed host*/
612
+			if (( rc = add_static_pattern(matcher, pattern) ))
638 613
 				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
639 614
 		}
640 615
 		else {
641 616
 			return CL_EMALFDB;
642
-			/* this is useless, we have host, and regex matches
643
-			if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
644
-				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;*/
645 617
 		}
646 618
 	}
647 619
 	matcher->list_loaded = 1;
648
-	if(( rc = build_regex_list(matcher) ))
649
-		return rc;
650 620
 
651
-#ifndef NDEBUG
652
-/*			dump_tree(matcher->root_regex);*/
653
-#endif
654
-	if(!matcher->list_built) {
655
-		cli_errmsg("Regex list not loaded: build failed!\n");
656
-		fatal_error(matcher);
657
-		return CL_EMALFDB;
658
-	}
659
-	regex_list_cleanup(matcher);
660 621
 	return CL_SUCCESS;
661 622
 }
662 623
 
663 624
 
664
-static struct tree_node ** tree_node_get_children(const struct tree_node* node)
665
-{
666
-	return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
667
-}
668
-
669 625
 /* Build the matcher list */
670
-static int build_regex_list(struct regex_matcher* matcher)
626
+int cli_build_regex_list(struct regex_matcher* matcher)
671 627
 {
672 628
 	int rc;
673 629
 	if(!matcher->list_inited || !matcher->list_loaded) {
... ...
@@ -675,9 +858,9 @@ static int build_regex_list(struct regex_matcher* matcher)
675 675
 		return -1;/*TODO: better error code */
676 676
 	}
677 677
 	cli_dbgmsg("Building regex list\n");
678
-	if(matcher->root_hosts)
679
-		if(( rc = cli_ac_buildtrie(&matcher->root_hosts[matcher->root_hosts_cnt-1]) ))
680
- 			return rc;
678
+	hashtab_free(&matcher->suffix_hash);
679
+	if(( rc = cli_ac_buildtrie(&matcher->suffixes) ))
680
+		return rc;
681 681
 	matcher->list_built=1;
682 682
 
683 683
 	return CL_SUCCESS;
... ...
@@ -686,864 +869,193 @@ static int build_regex_list(struct regex_matcher* matcher)
686 686
 /* Done with this matcher, free resources */
687 687
 void regex_list_done(struct regex_matcher* matcher)
688 688
 {
689
-	massert(matcher);
689
+	assert(matcher);
690 690
 
691
-	regex_list_cleanup(matcher);
692 691
 	if(matcher->list_loaded) {
693
-		if(matcher->root_hosts) {
694
-			size_t i;
695
-			for(i=0;i<matcher->root_hosts_cnt;i++) 
696
-				cli_ac_free(&matcher->root_hosts[i]);
697
-			free(matcher->root_hosts);
698
-			matcher->root_hosts=NULL;
692
+		size_t i;
693
+		/* TODO: call it, but be sure it won't free virname */
694
+		//cli_ac_free(&matcher->suffixes);
695
+		if(matcher->suffix_regexes) {
696
+			for(i=0;i<matcher->suffix_cnt;i++) {
697
+				struct regex_list *r = matcher->suffix_regexes[i];
698
+				while(r) {
699
+					cli_regfree(&r->preg);
700
+					r = r->nxt;
701
+				}
702
+			}
703
+			free(matcher->suffix_regexes);
704
+			matcher->suffix_regexes = NULL;
699 705
 		}
700
-
701
-		matcher->root_hosts_cnt=0;
706
+		hashtab_free(&matcher->suffix_hash);
702 707
 		matcher->list_built=0;
703
-		destroy_tree(matcher);
704 708
 		matcher->list_loaded=0;
705 709
 	}
706 710
 	if(matcher->list_inited) {
707 711
 		matcher->list_inited=0;
708 712
 	}
709
-	stack_destroy(&matcher->node_stack);
710
-	stack_destroy(&matcher->node_stack_alt);
711 713
 }
712 714
 
713
-/* Tree matcher algorithm */
714
-struct token_t
715
-{
716
-	union {
717
-		const unsigned char* start;
718
-		char_bitmap_p  bitmap;
719
-		unsigned char  c;
720
-	} u;
721
-	size_t len;
722
-	char   type;
723
-};
724
-
725
-enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
726
-
727
-static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
728
-{
729
-	massert(pat);
730
-	massert(token);
731
-
732
-	switch(*pat) {
733
-		case '\\':
734
-			token->type=TOKEN_CHAR;
735
-			token->u.c = *(++pat);
736
-			if(islower(token->u.c)) {
737
-				/* handle \n, \t, etc. */
738
-				char fmt[3] = {'\\', '\0', '\0'};
739
-				char c;
740
-
741
-				fmt[1] = token->u.c;
742
-				if(snprintf(&c,1,fmt)!=1) {
743
-					token->type=TOKEN_REGEX;
744
-					token->u.start = pat;
745
-				}
746
-				else
747
-					token->u.c=c;
748
-			}
749
-			token->len = 1;
750
-			break;
751
-		case '|':
752
-			token->type=TOKEN_ALT;
753
-			break;
754
-		case '*':
755
-		case '+':
756
-		case '?':
757
-		case '{':
758
-		case '}':
759
-			token->type=TOKEN_REGEX;
760
-			break;
761
-		case '[':
762
-			{
763
-			/*TODO: implement*/
764
-			/*see if it is something simple like a list of characters, a range, or negated ...*/
765
-			const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
766
-			unsigned char range_start=0;
767
-			int hasprev = 0;
768
-			char_bitmap_p bitmap = cli_malloc(32);
769
-			if(!bitmap)
770
-				return NULL;
771
-			if (*pat=='^') {
772
-				memset(bitmap,0xFF,32);/*match chars not in brackets*/
773
-				pat++;
774
-			}
775
-			else
776
-				memset(bitmap,0x00,32);
777
-			do {
778
-				/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
779
-				if (*pat=='-' && hasprev) {
780
-					/* it is a range*/
781
-					unsigned char range_end;
782
-					unsigned int c;
783
-					massert(range_start);
784
-					pat++;
785
-					if (pat[0]=='[')
786
-						if (pat[1]=='.') {
787
-							if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
788
-								range_end = '-';
789
-							else {
790
-								/* this is getting complicated, bail out */
791
-								cli_warnmsg("confused about collating sequences in regex,bailing out");
792
-								pat=old;
793
-								token->type=TOKEN_REGEX;
794
-								break;
795
-							}
796
-						}
797
-						else 
798
-							range_end = *pat;
799
-					else
800
-						range_end = *pat;
801
-					for(c=range_start+1;c<=range_end;c++)
802
-						bitmap[c>>3] ^= 1<<(c&0x7);
803
-					hasprev = 0;
804
-				}
805
-				else if (pat[0]=='[' && pat[1]==':') {
806
-							const unsigned char* end;
807
-							int len,found=-1;
808
-							size_t i;
809
-
810
-							pat+=2;
811
-							end=(unsigned char*)strstr((const char*)pat,":]");
812
-							if(!end) {
813
-								cli_warnmsg("confused about std char class syntax regex,bailing out");
814
-								pat=old;
815
-								token->type=TOKEN_REGEX;
816
-								break;
817
-							}
818
-
819
-							len = end-pat;
820
-							for(i=0;i<std_class_cnt;i++)
821
-								if(!strncmp((const char*)pat,std_class[i],len)) {
822
-									found=i;
823
-									break;
824
-								}
825
-							if(found!=-1) {
826
-								for(i=0;i<256;i++)
827
-									if(char_class[i]&(1<<found))
828
-										bitmap[i>>3] ^= 1<<(i&0x7);
829
-							}
830
-							else {
831
-								/*unknown class*/
832
-								cli_warnmsg("confused about regex bracket expression, bailing out");
833
-								pat=old;
834
-								token->type=TOKEN_REGEX;
835
-								break;
836
-							}
837
-						}
838
-				else {
839
-					bitmap[*pat>>3] ^= 1<<(*pat&0x7);
840
-					pat++;
841
-					range_start = *pat;
842
-					hasprev = 1;
843
-				}
844
-			} while(*pat!=']');
845
-			/*TODO: see if this bitmap already exists, then reuse*/			
846
-			token->type = TOKEN_BRACKET;
847
-			token->u.bitmap = bitmap;
848
-			break;
849
-			}
850
-		case ']':
851
-			massert(0 && "Encountered ] without matching [");
852
-			/* bad state */
853
-			break;
854
-		case '.':
855
-			token->type=TOKEN_DOT;
856
-			break;
857
-		case '(':
858
-			token->type=TOKEN_PAR_OPEN;
859
-			break;
860
-		case ')':
861
-			token->type=TOKEN_PAR_CLOSE;
862
-			break;
863
-		default:
864
-			token->type=TOKEN_CHAR;
865
-			token->u.c = *pat;
866
-			token->len=1;
867
-			break;
868
-	}
869
-	return ++pat;
870
-}
871
-
872
-#define INITIAL_ALT_STACK 10
873
-#define ALT_STACK_GROW 20
874
-
875
-static const unsigned char* find_regex_start(const unsigned char* pat)
715
+int is_regex_ok(struct regex_matcher* matcher)
876 716
 {
877
-	struct token_t token;
878
-	/*TODO: find where the regex part begins, for ex:
879
-	 * abcd+, regex begins at 'd'
880
-	 * */
881
-	const unsigned char* last=NULL;
882
-	const unsigned char* tmp=NULL;
883
-	const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
884
-	size_t altpositions_capacity = INITIAL_ALT_STACK;
885
-	size_t altpositions_cnt = 0;
886
-	char lasttype = -1;
887
-	if(!altpositions)
888
-		return NULL;
889
-	massert(pat);
890
-
891
-	/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
892
-	 * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
893
-	 * back up to the last known good position
894
-	 * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
895
-	 * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
896
-	 * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
897
-	 * */
898
-	do {	
899
-		tmp = pat;
900
-		pat = getNextToken(pat,&token);
901
-		if(token.type!=TOKEN_REGEX) {
902
-			last = tmp;
903
-			lasttype = token.type;
904
-			if(token.type==TOKEN_BRACKET && token.u.bitmap)
905
-				free(token.u.bitmap);
906
-			if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
907
-				/* save this position on stack, succesfully parsed till here*/
908
-				if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
909
-					/* encountered another alternate (|) operator, override previous | position stored */
910
-					altpositions[altpositions_cnt-1]=last;
911
-				else {
912
-					altpositions[altpositions_cnt++] = last;
913
-					if(altpositions_cnt == altpositions_capacity) {
914
-						altpositions_capacity += ALT_STACK_GROW;
915
-						altpositions = cli_realloc2(altpositions,altpositions_capacity*sizeof(*altpositions));
916
-						if(!altpositions)
917
-							return NULL;
918
-					}
919
-				}
920
-			} else if (lasttype==TOKEN_PAR_CLOSE) {
921
-				/* remove last stored position from stack, succesfully this last group */
922
-				altpositions_cnt--;
923
-				massert(altpositions_cnt>0);
924
-			}
925
-		}
926
-		else {
927
-			if(altpositions_cnt)
928
-				last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
929
-			/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
930
-		}
931
-	} while(*pat && token.type!=TOKEN_REGEX);
932
-	free(altpositions);
933
-	return *pat ? last : last+1;
717
+	assert(matcher);
718
+	return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
934 719
 }
935 720
 
936
-static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
721
+static int add_newsuffix(struct regex_matcher *matcher, struct regex_list *info, char *suffix, size_t len)
937 722
 {
938
-	struct tree_node* node = cli_malloc(sizeof(*node));
939
-	if(node) {
940
-		node->alternatives=0;
941
-		node->next=next;
942
-		node->listend=listend;
943
-		node->u.children=NULL;
944
-	}
945
-	return node;
946
-}
723
+	struct cli_matcher *root = &matcher->suffixes;
724
+	struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
725
+	size_t i;
726
+	int ret;
947 727
 
948
-static struct tree_node* tree_root_alloc(void)
949
-{
950
-	struct tree_node* root=tree_node_alloc(NULL,1);
951
-	if(root) {
952
-		root->op=OP_ROOT;
953
-		root->c=0;
954
-		root->next=NULL;
955
-		root->listend=1;
728
+	if(!new)
729
+		return CL_EMEM;
730
+	assert(root && suffix);
731
+
732
+	new->rtype = 0;
733
+	new->type = 0;
734
+	new->sigid = 0;
735
+	new->parts = 0;
736
+	new->partno = 0;
737
+	new->mindist = 0;
738
+	new->maxdist = 0;
739
+	new->offset = 0;
740
+	new->target = 0;
741
+	new->length = len;
742
+
743
+	new->ch[0] = new->ch[1] |= CLI_MATCH_IGNORE;
744
+	if(new->length > root->maxpatlen)
745
+		root->maxpatlen = new->length;
746
+
747
+	new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
748
+	if(!new->pattern) {
749
+		free(new);
750
+		return CL_EMEM;
956 751
 	}
957
-	return root;
958
-}
959
-
960
-static struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
961
-{
962
-	int right;
963
-	struct tree_node **children;
964
-	massert(node);
965
-	massert(left);
966
-
967
-	children = tree_node_get_children(node);
968
-	right = node->alternatives-1;
969
-	*left = 0;
970
-	if(!node->alternatives)
971
-		return NULL;
972
-	massert(children);
973
-	while(*left<=right) {
974
-		int mid  = *left+(right-*left)/2;
975
-		if(children[mid]->c == csearch)
976
-			return children[mid]; 
977
-		else if(children[mid]->c < csearch)
978
-			*left=mid+1;
979
-		else
980
-			right=mid-1;
752
+	for(i=0;i<len;i++)
753
+		new->pattern[i] = suffix[i];/*new->pattern is short int* */
754
+
755
+	new->virname = (char*)info;
756
+	if((ret = cli_ac_addpatt(root,new))) {
757
+		free(new->pattern);
758
+		free(new);
759
+		return ret;
981 760
 	}
982
-	return NULL;
983
-}
984
-
985
-static struct tree_node* tree_get_next(struct tree_node* node)
986
-{
987
-	struct tree_node** children;
988
-	massert(node);
989
-	children = tree_node_get_children(node);
990
-
991
-	if(!node->alternatives && children && children[0])
992
-		return children[0];
993
-	else if(node->alternatives<=1)
994
-		return node;
995
-	else
996
-		return children[0]->next;
761
+	SO_preprocess_add(&matcher->filter, suffix, len);
762
+	return CL_SUCCESS;
997 763
 }
998 764
 
999
-static size_t tree_node_get_array_size(const struct tree_node* node)
1000
-{
1001
-	massert(node);
1002
-	/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
1003
-	return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
1004
-}
765
+#define MODULE "regex_list: "
766
+/* ------ load a regex, determine suffix, determine suffix2regexlist map ---- */
1005 767
 
1006
-static struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
768
+/* returns 0 on success, clamav error code otherwise */
769
+static int add_pattern_suffix(struct regex_matcher *matcher, char *suffix, size_t suffix_len, struct regex_list *regex)
1007 770
 {
1008
-	struct tree_node* new, *alt = tree_get_next(node);
1009
-	struct tree_node **children;
1010
-	node->alternatives++;
1011
-	node->u.children = cli_realloc2(node->u.children,tree_node_get_array_size(node));
1012
-	if(!node->u.children)
1013
-		return NULL;
1014
-
1015
-	children = node->op==OP_CUSTOMCLASS ? node->u.children+1 : node->u.children;
1016
-
1017
-	new = tree_node_alloc(alt , node == alt );
1018
-	if(new) {
1019
-		new->op=OP_CHAR;
1020
-		new->c=c;
1021
-	}
1022
-
1023
-	if(node->alternatives-left-1>0)
1024
-			memmove(&children[left+1],&children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
1025
-	children[left] = new;	
1026
-
1027
-	return new;
1028
-}
1029
-
1030
-static void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
1031
-{
1032
-	struct tree_node **children;
1033
-	massert(node);
1034
-	massert(new);
1035
-
1036
-	children = tree_node_get_children(node);
1037
-	if(node->alternatives) {
1038
-		massert(children);
1039
-	       	if(children[0]->next == node) {
1040
-			int i;
1041
-			new->listend = 1;
1042
-			for(i=0;i<node->alternatives;i++) {
1043
-				children[i]->next = new;
1044
-				children[i]->listend = 0;
1045
-			}
1046
-		}
1047
-		else {
1048
-			struct tree_node* p;
1049
-			for(p = children[0]->next ; p->next != node ; p = p->next)
1050
-				massert(!p->listend);
1051
-			new->listend = 1;
1052
-			p->listend = 0;
1053
-			p->next = new;
1054
-		}
1055
-	}
1056
-	else {
1057
-		int idx = node->op==OP_CUSTOMCLASS ? 1 : 0;
1058
-		if(node->u.children)
1059
-			if(node->u.children[idx]) {
1060
-				node = node->u.children[idx];
1061
-				while(node->next && !node->listend)
1062
-					node = node->next;
1063
-				node->listend = 0;
1064
-				new->next = node->next;
1065
-				node->next = new;
1066
-				new->listend=1;
1067
-				return;
1068
-			}
1069
-		node->u.children = cli_realloc2(node->u.children,sizeof(node->u.children[0])*(2));
1070
-		if(node->u.children) {
1071
-			node->u.children[idx] = new;
1072
-		}
771
+	const struct element *el;
772
+
773
+	assert(matcher);
774
+	el = hashtab_find(&matcher->suffix_hash, suffix, suffix_len);
775
+	/* TODO: what if suffixes are prefixes of eachother and only one will
776
+	 * match? */
777
+	if(el) {
778
+		/* existing suffix */
779
+		assert(el->data < matcher->suffix_cnt);
780
+		regex->nxt = matcher->suffix_regexes[el->data];
781
+		matcher->suffix_regexes[el->data] = regex;
782
+		cli_dbgmsg(MODULE "added new regex to existing suffix %s: %s\n", suffix, regex->pattern);
783
+	} else {
784
+		/* new suffix */
785
+		size_t n = matcher->suffix_cnt++;
786
+		el = hashtab_insert(&matcher->suffix_hash, suffix, suffix_len, n);
787
+		matcher->suffix_regexes = cli_realloc(matcher->suffix_regexes, (n+1)*sizeof(*matcher->suffix_regexes));
788
+		if(!matcher->suffix_regexes)
789
+			return CL_EMEM;
790
+		matcher->suffix_regexes[n] = regex;
791
+		add_newsuffix(matcher, regex, suffix, suffix_len);
792
+		cli_dbgmsg(MODULE "added new suffix %s, for regex: %s\n", suffix, regex->pattern);
1073 793
 	}
794
+	return 0;
1074 795
 }
1075 796
 
1076
-static unsigned char char_getclass(const unsigned char* bitmap)
797
+static size_t reverse_string(char *pattern)
1077 798
 {
799
+	size_t len = strlen(pattern);
1078 800
 	size_t i;
1079
-	massert(bitmap);
1080
-
1081
-	for(i=0;i<std_class_cnt;i++)
1082
-		if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
1083
-			return i;
1084
-	return std_class_cnt;
1085
-}
1086
-
1087
-static void stack_destroy(struct node_stack* stack)
1088
-{
1089
-	massert(stack);
1090
-	if(stack->data)
1091
-		free(stack->data);
1092
-	stack->data = NULL;
1093
-	stack->capacity = 0;
1094
-}
1095
-
1096
-/* call this after whitelist load is complete, and the tree is no longer going to be modified */
1097
-void regex_list_cleanup(struct regex_matcher* matcher)
1098
-{
1099
-	massert(matcher);
1100
-
1101
-	stack_destroy(&matcher->node_stack);
1102
-	stack_destroy(&matcher->node_stack_alt);
1103
-	stack_init(&matcher->node_stack);
1104
-	stack_init(&matcher->node_stack_alt);
1105
-}
1106
-
1107
-int is_regex_ok(struct regex_matcher* matcher)
1108
-{
1109
-	massert(matcher);
1110
-	return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
801
+	for(i=0; i < (len/2); i++) {
802
+		char aux = pattern[i];
803
+		pattern[i] = pattern[len-i-1];
804
+		pattern[len-i-1] = aux;
805
+	}
806
+	return len;
1111 807
 }
1112 808
 
1113
-/* returns 0 on success, regexec error code otherwise */						
1114
-static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info, int hostonly)
809
+static int add_static_pattern(struct regex_matcher *matcher, char* pattern)
1115 810
 {
1116
-	int bol=1;
1117
-	const unsigned char* pat_end = find_regex_start(pat);
1118
-	struct token_t token;
1119
-	struct tree_node* node;
1120
-	
1121
-	massert(matcher);
1122
-
1123
-	node = hostonly ? matcher->root_regex_hostonly : matcher->root_regex;
1124
-
1125
-	stack_reset(&matcher->node_stack);
1126
-	stack_reset(&matcher->node_stack_alt);
1127
-	stack_push(&matcher->node_stack,node);
1128
-
1129
-	for(;node->op!=OP_LEAF;){
1130
-		if(pat<pat_end)
1131
-			pat  = getNextToken(pat,&token);
1132
-		else if(*pat) {
1133
-			token.type = TOKEN_REGEX;
1134
-			token.u.start=pat;
1135
-		}
1136
-		else
1137
-			token.type = TOKEN_DONE;
1138
-
1139
-		switch(token.type) {
1140
-			case TOKEN_CHAR: 
1141
-				{
1142
-					/* search for char in tree */
1143
-					int left;
1144
-					struct tree_node* newnode = tree_node_char_binsearch(node,token.u.c,&left);
1145
-					if(newnode)
1146
-						node = newnode;
1147
-					else {
1148
-						/* not found, insert it */
1149
-						node = tree_node_char_insert(node,token.u.c,left);
1150
-					}
1151
-					break;
1152
-				}
1153
-
1154
-			case TOKEN_PAR_OPEN:
1155
-				stack_push(&matcher->node_stack_alt,NULL);/* marker */
1156
-				stack_push(&matcher->node_stack,node);
1157
-				break;
1158
-
1159
-			case TOKEN_PAR_CLOSE: {
1160
-						      /*TODO: test this!!!*/
1161
-						      struct tree_node* node_alt = node;
1162
-						      node = tree_node_alloc(NULL,1);
1163
-						      node->op=OP_PARCLOSE;
1164
-						      node->c=0;
1165
-						      node->listend=1;
1166
-						      tree_node_insert_nonbin(node_alt,node);
1167
-						      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
1168
-							      tree_node_insert_nonbin(node_alt,node);
1169
-						      }
1170
-				      		      stack_pop(&matcher->node_stack);					      
1171
-		      				      break;
1172
-					      }
1173
-
1174
-			case TOKEN_ALT:
1175
-				stack_push(&matcher->node_stack_alt,node);
1176
-				node = stack_pop(&matcher->node_stack);
1177
-				stack_push(&matcher->node_stack,node);
1178
-				break;
1179
-
1180
-			case TOKEN_BRACKET:
1181
-				{
1182
-					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1183
-					unsigned char charclass = char_getclass(token.u.bitmap);
1184
-					if(charclass == std_class_cnt) {/*not a std char class*/
1185
-						new->op = OP_CUSTOMCLASS;
1186
-						new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
1187
-						if(!new->u.children)
1188
-							return CL_EMEM;
1189
-						new->u.bitmap[0] = token.u.bitmap;
1190
-						new->u.bitmap[1] = NULL;
1191
-						tree_node_insert_nonbin(node,new);
1192
-						node = new;
1193
-					}
1194
-					else {
1195
-						new->op = OP_STDCLASS;
1196
-						new->c = charclass;
1197
-						tree_node_insert_nonbin(node,new);
1198
-						node=new;
1199
-					}
1200
-					break;
1201
-				}
1202
-
1203
-			case TOKEN_DOT:
1204
-				{
1205
-					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1206
-					new->op = OP_DOT;
1207
-					tree_node_insert_nonbin(node,new);
1208
-					node=new;
1209
-					break;
1210
-				}
1211
-
1212
-			case TOKEN_REGEX:
1213
-			case TOKEN_DONE: {
1214
-						 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
1215
-						 if(!leaf)
1216
-							 return CL_EMEM;
1217
-						 leaf->info = cli_strdup(info);
1218
-						 if(token.type==TOKEN_REGEX) {
1219
-							 int rc;
1220
-							 struct tree_node* new;
1221
-							 regex_t* preg;
1222
-							 preg=cli_malloc(sizeof(*preg));
1223
-							 if(!preg)
1224
-								 return CL_EMEM;
1225
-							 rc = cli_regcomp(preg,(const char*)token.u.start,REG_EXTENDED|(bol?0:REG_NOTBOL));
1226
-							 leaf->preg=preg;
1227
-							 if(rc)
1228
-								 return rc;
1229
-							 new=cli_malloc(sizeof(*new));
1230
-							 if(!new)
1231
-								 return CL_EMEM;
1232
-							 new->op=OP_LEAF;
1233
-							 new->next=node;
1234
-							 new->alternatives=0;
1235
-							 new->u.leaf=leaf;
1236
-							 new->listend=1;
1237
-							 tree_node_insert_nonbin(node,new);
1238
-						 }
1239
-						 else {
1240
-							 leaf->preg=NULL;
1241
-							 node->alternatives=0;
1242
-							 node->u.leaf=leaf;
1243
-							 node->op=OP_LEAF;
1244
-						 }
1245
-						 return 0;
1246
-					 }
1247
-		}
1248
-
1249
-		bol=0;
1250
-	}
1251
-	return 0;
811
+	size_t len;
812
+	struct regex_list *regex = cli_malloc(sizeof(*regex));
813
+	if(!regex)
814
+		return CL_EMEM;
815
+	len = reverse_string(pattern);
816
+	regex->nxt = NULL;
817
+	regex->pattern = cli_strdup(pattern);
818
+	regex->preg.re_magic = 0;
819
+	return add_pattern_suffix(matcher, pattern, len, regex);
1252 820
 }
1253 821
 
1254
-/* c has to be unsigned char here!! */
1255
-static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
822
+static int add_pattern(struct regex_matcher *matcher, char *pattern)
1256 823
 {
1257
-	struct tree_node** children;
824
+	struct text_buffer buf;
825
+	struct node *n;
826
+	size_t last=0;
1258 827
 	int rc;
828
+	struct regex_list *regex = cli_malloc(sizeof(*regex));
829
+	struct node root_node;
830
+	size_t len;
831
+	/* we only match the host, so remove useless stuff */
832
+	const char remove_end[] = "([/?].*)?/";
833
+	const char remove_end2[] = "([/?].*)/";
1259 834
 
1260
-	massert(node);
1261
-	massert(c);
1262
-	massert(info);
1263
-
1264
-	if(!node->u.children)
1265
-		return MATCH_FAILED;/* tree empty */
1266
-	*info = NULL;
1267
-	len++;
1268
-	c--;
1269
-	for(;;) {
1270
-		massert(node);
1271
-		children = node->u.children;
1272
-		switch(node->op) {
1273
-			case OP_ROOT:
1274
-				rc=1;
1275
-				break;
1276
-			case OP_PARCLOSE:
1277
-				/*this isn't a real character, so don't move*/
1278
-				c--;
1279
-				len++;
1280
-				rc=1;
1281
-				break;
1282
-			case OP_CHAR:
1283
-				massert(*c==node->c && "We know this has to match");
1284
-				rc = 1;/* *c==node->c;- we know it has matched */
1285
-				break;
1286
-			case OP_DOT:	
1287
-				rc = *c!='\n';
1288
-				break;
1289
-			case OP_STDCLASS:
1290
-				rc = char_class[*c]&(node->c);
1291
-				break;
1292
-			case OP_CUSTOMCLASS:
1293
-			{
1294
-				char_bitmap_p bitmap;
1295
-				massert(children);
1296
-				bitmap = (char_bitmap_p)node->u.bitmap[0];
1297
-				children++;
1298
-				rc = bitmap[*c>>3]&(1<<(*c&0x7));
1299
-				break;
1300
-			}
1301
-			case OP_LEAF:
1302
-			{
1303
-				const struct leaf_info* leaf = node->u.leaf;
1304
-				/*isleaf = 1;*/
1305
-				if(leaf->preg) {
1306
-					rc = !cli_regexec(leaf->preg,(const char*)c,0,NULL,0);
1307
-				}
1308
-				else  {
1309
-					massert(*c==node->c && "We know this has to match[2]");
1310
-					rc = 1;
1311
-				}
1312
-				if(rc) {
1313
-					*info = leaf->info;
1314
-					return MATCH_SUCCESS;
1315
-				}
1316
-				break;
1317
-			}
1318
-			default:
1319
-				/* impossible */
1320
-				cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
1321
-				exit(1);
1322
-		}
1323
-		len--;
1324
-		if(!len) rc=0;
1325
-		c++;
1326
-		if(rc) {
1327
-			const char csearch = *c;
1328
-			int left = 0,right = node->alternatives-1;
1329
-			int mid;
1330
-			/*matched so far, go deeper*/
1331
-			/*do a binary search between children */
1332
-			massert(children);
1333
-			while(left<=right) {
1334
-				mid  = left+(right-left)/2;
1335
-				if (children[mid]->c == csearch)
1336
-					break;
1337
-				else if(children[mid]->c < csearch)
1338
-					left=mid+1;
1339
-				else
1340
-					right=mid-1;
1341
-			}
1342
-			if(left<=right) {
1343
-				node = children[mid];
1344
-				massert(node);
1345
-			}
1346
-			else {
1347
-				if(node->alternatives) {
1348
-					if(!children[0]->listend) {
1349
-						node = children[0];
1350
-						c++;
1351
-						len--;
1352
-					}
1353
-					while(node && node->listend) {
1354
-						node = node->next;/* climb up */
1355
-						c--;
1356
-						len++;
1357
-					}
1358
-					if(!node || !node->next) 
1359
-						return MATCH_FAILED;/* reached root node */
1360
-					node=node->next;
1361
-					c--;
1362
-					len++;
1363
-				}
1364
-				else if(node->u.children) {
1365
-					struct tree_node* rewrite_next = NULL;
1366
-					if(node->op==OP_PARCLOSE) 
1367
-						rewrite_next = node;
1368
-					node = children[0];
1369
-					massert(node);
1370
-					massert(node->op!=OP_CHAR);
1371
-					if(rewrite_next)
1372
-						node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
1373
-									     we need to know 
1374
-									     from which one we came, so we can find out way back
1375
-									     should we fail to match somewhere deeper*/
1376
-				}
1377
-			}
1378
-		}
1379
-		else {
1380
-			/* this node didn't match, try sibling, or parent (if no more siblings) */
1381
-			while(node && node->listend) {
1382
-				node = node->next;/* sibling of parent */
1383
-				c--;
1384
-				len++;
1385
-			}
1386
-			if(!node || !node->next) /* reached root node, it has no next */
1387
-				return MATCH_FAILED;
1388
-			else {
1389
-				c--;
1390
-				len++;
1391
-				node=node->next;
1392
-			}
1393
-		}
1394
-	}
1395
-	return MATCH_FAILED;
1396
-}
1397
-
1398
-/* push node on stack, only if it isn't there already */
1399
-static void stack_push_once(struct node_stack* stack,struct tree_node* node)
1400
-{
1401
-	size_t i;
1402
-	massert(stack);
1403
-	massert(node);
1404 835
 
1405
-	for(i=0;i < stack->cnt;i++)
1406
-		if(stack->data[i]==node)
1407
-			return;
1408
-	stack_push(stack,node);
1409
-}
836
+	if(!regex)
837
+		return CL_EMEM;
1410 838
 
1411
-static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
1412
-{
1413
-	struct tree_node **children;
1414
-	massert(matcher);
1415
-	massert(node);
1416
-
1417
-	children = tree_node_get_children(node);
1418
-	if(node->op==OP_LEAF) {
1419
-		struct leaf_info* leaf = node->u.leaf;
1420
-		if(node->next && !node->listend)
1421
-			destroy_tree_internal(matcher,node->next);
1422
-		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
1423
-		stack_push_once(&matcher->node_stack,node);
1424
-		if(leaf->preg) {
1425
-			cli_regfree(leaf->preg);
1426
-			free(leaf->preg);
1427
-			leaf->preg=NULL;
839
+	len = strlen(pattern);
840
+	if(len > sizeof(remove_end)) {
841
+		if(strncmp(&pattern[len - sizeof(remove_end)+1], remove_end, sizeof(remove_end)-1) == 0) {
842
+			len -= sizeof(remove_end) - 1;
1428 843
 		}
1429
-		if(leaf->info) {
1430
-			free(leaf->info);
1431
-			leaf->info=NULL;
844
+		if(strncmp(&pattern[len - sizeof(remove_end2)+1], remove_end2, sizeof(remove_end2)-1) == 0) {
845
+			len -= sizeof(remove_end2) - 1;
1432 846
 		}
1433
-	/*	return;*/
1434 847
 	}
1435
-	if(node->alternatives) {
1436
-		int i;
1437
-		struct tree_node* p;
1438
-		massert(children);
1439
-		p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
1440
-		for(i=0;i<node->alternatives;i++)
1441
-			destroy_tree_internal(matcher,children[i]);
1442
-		if(p && p!=node)
1443
-			destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
1444
-	}
1445
-	else {
1446
-		if(children) {
1447
-			if(children[0])
1448
-				destroy_tree_internal(matcher,children[0]);		
848
+	pattern[len] = '\0';
849
+
850
+
851
+	rc = cli_regcomp(&regex->preg, pattern, REG_EXTENDED);
852
+	if(rc) {
853
+		size_t buflen = cli_regerror(rc, &regex->preg, NULL, 0);
854
+		char *errbuf = cli_malloc(buflen);
855
+		if(errbuf) {
856
+			cli_regerror(rc, &regex->preg, errbuf, buflen);
857
+			cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
858
+			free(errbuf);
859
+		} else {
860
+			cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
1449 861
 		}
862
+		return rc;
863
+		cli_regfree(&regex->preg);
864
+		free(regex);
865
+		return CL_EMALFDB;
1450 866
 	}
1451
-	if(node->op!=OP_LEAF && node->next && !node->listend)
1452
-		destroy_tree_internal(matcher,node->next);
1453
-	if(node->u.children)
1454
-		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
1455
-	if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
1456
-		free(node->u.children[0]);
1457
-		node->u.children[0]=NULL;
1458
-	}
1459
-	stack_push_once(&matcher->node_stack,node);
1460
-}
867
+	regex->pattern = cli_strdup(pattern);
868
+	regex->nxt = NULL;
1461 869
 
1462
-static void destroy_tree(struct regex_matcher* matcher)
1463
-{
1464
-	/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
1465
-	 * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
1466
-	 * (and push to stack only if it doesn't contain it already*/
1467
-	massert(matcher);
1468
-
1469
-	stack_reset(&matcher->node_stack);
1470
-	destroy_tree_internal(matcher,matcher->root_regex);
1471
-	destroy_tree_internal(matcher,matcher->root_regex_hostonly);
1472
-	while (matcher->node_stack.cnt) {
1473
-		struct tree_node* node = stack_pop(&matcher->node_stack);
1474
-		if(node)
1475
-			free(node);
1476
-	}
1477
-}
1478
-#ifndef NDEBUG
1479
-static void dump_node(struct tree_node* node)
1480
-{
1481
-	int i;
1482
-	struct tree_node* p,**children;
1483
-	massert(node);
1484
-	if(node->op==OP_LEAF) {
1485
-		if(node->u.leaf->preg)
1486
-			printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
1487
-		else
1488
-			printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
1489
-		if(node->next && !node->listend) {
1490
-			printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1491
-			dump_node(node->next);
1492
-		}
1493
-		return;
1494
-	}
1495
-	printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
1496
-	if(node->next)
1497
-		printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1498
-	printf("n%p -> {",(void*)node);/*using address of node as id*/
1499
-	children = tree_node_get_children(node);
1500
-	if(node->alternatives)
1501
-		massert(children);
1502
-	for(i=0;i<node->alternatives;i++)
1503
-		printf("n%p ",(void*)children[i]);
1504
-	if(node->alternatives && children[0]->op!=OP_LEAF)
1505
-		for(p=children[0]->next;p!=node;p=p->next)
1506
-		{
1507
-			massert(p);
1508
-			printf("n%p ",(void*)p);
1509
-			if(p->op==OP_LEAF || p->listend)
1510
-				break;
1511
-		}
1512
-	if(!node->alternatives && children && children[0])
1513
-		printf("n%p ",(void*)children[0]);
1514
-	printf("};\n");
1515
-	printf("{rank=same;");
1516
-	for(i=0;i<node->alternatives;i++)
1517
-		printf("n%p ",(void*)node->u.children[i]);
1518
-	if(node->alternatives && children[0]->op!=OP_LEAF)
1519
-		for(p=children[0]->next;p!=node;p=p->next) 
1520
-		{
1521
-			printf("n%p ",(void*)p);	
1522
-			if(p->op==OP_LEAF || p->listend)
1523
-				break;
1524
-		}
1525
-	if(!node->alternatives && children && children[0])
1526
-		printf("n%p ",(void*)children[0]);
1527
-	printf("};\n");
1528
-	for(i=0;i<node->alternatives;i++)
1529
-		dump_node(children[i]);
1530
-	if(node->alternatives && children[0]->op!=OP_LEAF)
1531
-		for(p=children[0]->next;p!=node;p=p->next)
1532
-		{
1533
-			dump_node(p);
1534
-			if(p->op==OP_LEAF || p->listend)
1535
-				break;
1536
-		}
1537
-	if(!node->alternatives && children && children[0])
1538
-		dump_node(children[0]);
1539
-}
870
+	n = parse_regex(pattern, &last);
871
+	memset(&buf, 0, sizeof(buf));
872
+	memset(&root_node, 0, sizeof(buf));
873
+	n->parent = &root_node;
1540 874
 
1541
-void dump_tree(struct tree_node* root)
1542
-{
1543
-	/*use dot/dotty from graphviz to view it*/
1544
-	massert(root);
1545
-	printf("digraph tree {\n");
1546
-	dump_node(root);
1547
-	printf("}\n");
875
+	rc = build_suffixtree_descend(matcher, regex, n, &buf);
876
+	destroy_tree(n);
877
+	return rc;
1548 878
 }
1549
-#endif
... ...
@@ -24,39 +24,37 @@
24 24
 #ifndef _REGEX_LIST_H
25 25
 #define _REGEX_LIST_H
26 26
 
27
-#ifdef NDEBUG
28
-#define massert(x) (void)(0)
29
-#else
30
-/*debug version, massert enabled*/
31
-
32
-#define __massert_fail(expr,file,line) (void)cli_errmsg("Assertion failed at %s:%d\n %s\n",file,line,expr)
33
-
34
-#define massert(expr) ((void) ((expr) ? (void)0 : (__massert_fail (#expr,__FILE__,__LINE__))))
35
-#endif
36
-
37 27
 #include "phishcheck.h"
38 28
 #include "readdb.h"
39 29
 #include "matcher.h"
40 30
 #include <zlib.h> /* for gzFile */
41
-struct node_stack {
42
-	struct tree_node** data;
43
-	size_t capacity;
44
-	size_t cnt;
31
+
32
+struct regex_list {
33
+	const char *pattern;
34
+	regex_t preg;
35
+	struct regex_list *nxt;
36
+};
37
+
38
+struct filter {
39
+	uint32_t B[65536];
40
+	uint32_t end_fast[256];
41
+	uint32_t end[65536];
42
+	unsigned long m;
45 43
 };
46 44
 
47 45
 struct regex_matcher {
48
-	struct cli_matcher* root_hosts;
49
-	struct tree_node* root_regex;
50
-	struct tree_node* root_regex_hostonly; 
51
-	struct node_stack node_stack;
52
-	struct node_stack node_stack_alt;
53
-	size_t root_hosts_cnt;
54
-	int list_inited;
55
-	int list_loaded;
56
-	int list_built;
46
+	struct hashtable suffix_hash;
47
+	size_t suffix_cnt;
48
+	struct regex_list **suffix_regexes;
49
+	struct cli_matcher suffixes;
50
+	struct filter filter;
51
+	int list_inited:2;
52
+	int list_loaded:2;
53
+	int list_built:2;
57 54
 };
58 55
 
59
-int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char** info,int is_whitelist);
56
+int cli_build_regex_list(struct regex_matcher* matcher);
57
+int regex_list_match(struct regex_matcher* matcher, char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup, int hostOnly,const char **info, int is_whitelist);
60 58
 int init_regex_list(struct regex_matcher* matcher);
61 59
 int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options,int is_whitelist,struct cli_dbio *dbio);
62 60
 void regex_list_cleanup(struct regex_matcher* matcher);