Browse code

bb12389 - fast AC sig load - courtesy of Alberto Wu

This commit addresses the signature load time issue in the following steps:
1. Loaded list items are allocated but left unattached; only a node reference is set on them for further processing. This is done with no increase in memory usage, because the node reference shares storage with the `next` pointer through a union. See the changes in insert_list and matcher-ac.h.
2. Before the tries are built, the whole list of entries is sorted by node, then by pattern, then by partno. This requires O(N log(N)) time.
3. The list is processed linearly, one node at a time, and the `next_same` chains are built. The head of each `next_same` chain is also extracted. This requires O(N) time.
4. The list of heads is sorted by partno. This requires O(M log(M)) time on average, where M is the number of chain heads (M <= N).
5. The list of heads is processed linearly and the `next` chain is built. This has O(M) complexity.

Micah Snyder authored on 2019/10/17 03:40:16
Showing 3 changed files
... ...
@@ -7,6 +7,13 @@ Note: This file refers to the source tarball. Things described here may differ
7 7
 
8 8
 ClamAV 0.101.5 is a security patch release that addresses the following issues.
9 9
 
10
+- Signature load time is significantly reduced by changing to a more efficient
11
+  algorithm for loading signature patterns and allocating the AC trie.
12
+  Patch courtesy of Alberto Wu.
13
+
14
+Special thanks to the following for code contributions and bug reports:
15
+
16
+- Alberto Wu
10 17
 -
11 18
 
12 19
 ## 0.101.4
... ...
@@ -101,6 +101,7 @@ static inline int insert_list(struct cli_matcher *root, struct cli_ac_patt *patt
101 101
         return CL_EMEM;
102 102
     }
103 103
     new->me = pattern;
104
+    new->node = pt;
104 105
 
105 106
     root->ac_lists++;
106 107
     newtable = mpool_realloc(root->mempool, root->ac_listtable, root->ac_lists * sizeof(struct cli_ac_list *));
... ...
@@ -113,112 +114,146 @@ static inline int insert_list(struct cli_matcher *root, struct cli_ac_patt *patt
113 113
 
114 114
     root->ac_listtable = newtable;
115 115
     root->ac_listtable[root->ac_lists - 1] = new;
116
+    return CL_SUCCESS;
117
+}
116 118
 
117
-    ph = pt->list;
118
-    ph_add_after = ph_prev = NULL;
119
-    while(ph) {
120
-        php = ph->me;
121
-        if(!ph_add_after && php->partno <= pattern->partno && (!ph->next || ph->next->me->partno > pattern->partno))
122
-            ph_add_after = ph;
123
-        if((php->length[0] == pattern->length[0]) && (php->prefix_length[0] == pattern->prefix_length[0]) && (php->ch[0] == pattern->ch[0]) && (php->ch[1] == pattern->ch[1]) && (php->boundary == pattern->boundary)) {
124
-            if(!memcmp(php->pattern, pattern->pattern, php->length[0] * sizeof(uint16_t)) && !memcmp(php->prefix, pattern->prefix, php->prefix_length[0] * sizeof(uint16_t))) {
125
-                if(!php->special && !pattern->special) {
126
-                    match = 1;
127
-                } else if(php->special == pattern->special) {
128
-                    match = 1;
129
-                    for(i = 0; i < php->special; i++) {
130
-                        a1 = php->special_table[i];
131
-                        a2 = pattern->special_table[i];
132
-
133
-                        if(a1->num != a2->num) {
134
-                            match = 0;
135
-                            break;
136
-                        }
119
+#define RETURN_RES_IF_NE(uia, uib) do {		\
120
+	if(uia < uib) return -1;		\
121
+	if(uia > uib) return +1;		\
122
+    } while(0)
137 123
 
138
-                        if(a1->negative != a2->negative) {
139
-                            match = 0;
140
-                            break;
141
-                        }
124
+static int patt_cmp_fn(const struct cli_ac_patt *a, const struct cli_ac_patt *b) {
125
+    unsigned int i;
126
+    int res;
127
+    RETURN_RES_IF_NE(a->length[0], b->length[0]);
128
+    RETURN_RES_IF_NE(a->prefix_length[0], b->prefix_length[0]);
129
+    RETURN_RES_IF_NE(a->ch[0], b->ch[0]);
130
+    RETURN_RES_IF_NE(a->ch[1], b->ch[1]);
131
+    RETURN_RES_IF_NE(a->boundary, b->boundary);
132
+
133
+    res = memcmp(a->pattern, b->pattern, a->length[0] * sizeof(uint16_t));
134
+    if(res) return res;
135
+    res = memcmp(a->prefix, b->prefix, a->prefix_length[0] * sizeof(uint16_t));
136
+    if(res) return res;
137
+
138
+    RETURN_RES_IF_NE(a->special, b->special);
139
+    if(!a->special && !b->special)
140
+	return 0;
141
+
142
+    for(i = 0; i < a->special; i++) {
143
+	struct cli_ac_special *spcl_a = a->special_table[i], *spcl_b = b->special_table[i];
144
+
145
+	RETURN_RES_IF_NE(spcl_a->num, spcl_b->num);
146
+	RETURN_RES_IF_NE(spcl_a->negative, spcl_b->negative);
147
+	RETURN_RES_IF_NE(spcl_a->type, spcl_b->type);
148
+
149
+	if(spcl_a->type == AC_SPECIAL_ALT_CHAR) {
150
+	    res = memcmp((spcl_a->alt).byte, (spcl_b->alt).byte, spcl_a->num);
151
+	    if(res) return res;
152
+	} else if(spcl_a->type == AC_SPECIAL_ALT_STR_FIXED) {
153
+	    unsigned int j;
154
+	    RETURN_RES_IF_NE(spcl_a->len[0], spcl_b->len[0]);
155
+	    for(j = 0; j < spcl_a->num; j++) {
156
+		res = memcmp((spcl_a->alt).f_str[j], (spcl_b->alt).f_str[j], spcl_a->len[0]);
157
+		if(res) return res;
158
+	    }
159
+	} else if(spcl_a->type == AC_SPECIAL_ALT_STR) {
160
+	    struct cli_alt_node *alt_a = (spcl_a->alt).v_str, *alt_b = (spcl_b->alt).v_str;
161
+	    while(alt_a && alt_b) {
162
+		RETURN_RES_IF_NE(alt_a->len, alt_b->len);
163
+		res = memcmp(alt_a->str, alt_b->str, alt_a->len);
164
+		if(res) return res;
165
+		alt_a = alt_a->next;
166
+		alt_b = alt_b->next;
167
+	    }
168
+	    RETURN_RES_IF_NE(alt_a, alt_b);
169
+	}
170
+    }
171
+    return 0;
172
+}
142 173
 
143
-                        if(a1->type != a2->type) {
144
-                            match = 0;
145
-                            break;
146
-                        } else if(a1->type == AC_SPECIAL_ALT_CHAR) {
147
-                            if(memcmp((a1->alt).byte, (a2->alt).byte, a1->num)) {
148
-                                match = 0;
149
-                                break;
150
-                            }
151
-                        } else if(a1->type == AC_SPECIAL_ALT_STR_FIXED) {
152
-                            if(a1->len != a2->len) {
153
-                                match = 0;
154
-                                break;
155
-                            }
156 174
 
157
-                            for(j = 0; j < a1->num; j++) {
158
-                                if(memcmp((a1->alt).f_str[j], (a2->alt).f_str[j], a1->len[0]))
159
-                                    break;
160
-                            }
175
+static int sort_list_fn(const void *a, const void *b) {
176
+    const struct cli_ac_node *node_a = (*(const struct cli_ac_list **)a)->node;
177
+    const struct cli_ac_node *node_b = (*(const struct cli_ac_list **)b)->node;
178
+    const struct cli_ac_patt *patt_a = (*(const struct cli_ac_list **)a)->me;
179
+    const struct cli_ac_patt *patt_b = (*(const struct cli_ac_list **)b)->me;
180
+    int res;
161 181
 
162
-                            if(j < a1->num) {
163
-                                match = 0;
164
-                                break;
165
-                            }
166
-                        } else if(a1->type == AC_SPECIAL_ALT_STR) {
167
-                            b1 = (a1->alt).v_str;
168
-                            b2 = (a2->alt).v_str;
169
-                            while(b1 && b2) {
170
-                                if((b1->len != b2->len) || memcmp(b1->str, b2->str, b1->len))
171
-                                    break;
172
-                                b1 = b1->next;
173
-                                b2 = b2->next;
174
-                            }
182
+    /* 1. Group by owning node */
183
+    RETURN_RES_IF_NE(node_a, node_b);
175 184
 
176
-                            if(b1 || b2) {
177
-                                match = 0;
178
-                                break;
179
-                            }
180
-                        }
181
-                    }
182
-                } else {
183
-                    match = 0;
184
-                }
185
+    /* 2. Group together equal pattern in a node */
186
+    res = patt_cmp_fn(patt_a, patt_b);
187
+    if(res)
188
+	return res;
185 189
 
186
-                if(match) {
187
-                    if(pattern->partno < php->partno) {
188
-                        new->next_same = ph;
189
-                        if(ph_prev)
190
-                            ph_prev->next = ph->next;
191
-                        else
192
-                            pt->list = ph->next;
190
+    /* 3. Sort equal patterns in a node by partno in ascending order */
191
+    RETURN_RES_IF_NE(patt_a->partno, patt_b->partno);
193 192
 
194
-                        ph->next = NULL;
195
-                        break;
196
-                    } else {
197
-                        while(ph->next_same && ph->next_same->me->partno < pattern->partno)
198
-                            ph = ph->next_same;
193
+    return 0;
194
+}
199 195
 
200
-                        new->next_same = ph->next_same;
201
-                        ph->next_same = new;
202
-                        return CL_SUCCESS;
203
-                    }
204
-                }
205
-            }
206
-        }
196
+static int sort_heads_by_partno_fn(const void *a, const void *b) {
197
+    const struct cli_ac_patt *patt_a = (*(const struct cli_ac_list **)a)->me;
198
+    const struct cli_ac_patt *patt_b = (*(const struct cli_ac_list **)b)->me;
199
+    RETURN_RES_IF_NE(patt_a->partno, patt_b->partno);
200
+    return 0;
201
+}
207 202
 
208
-        ph_prev = ph;
209
-        ph = ph->next;
203
+static inline void link_node_lists(struct cli_ac_list **listtable, unsigned int nentries) {
204
+    struct cli_ac_list *prev = listtable[0];
205
+    struct cli_ac_node *node = prev->node;
206
+    unsigned int i, nheads = 1;
207
+
208
+    /* Link equal patterns in the next_same list (entries are already sorted by partno asc) */
209
+    for(i = 1; i < nentries; i++) {
210
+	int ret = patt_cmp_fn(prev->me, listtable[i]->me);
211
+	if(ret) {
212
+	    /* This is a new head of a next_same chain */
213
+	    if(i != nheads) {
214
+		/* Move heads towards the beginning of the table */
215
+		prev = listtable[i];
216
+		listtable[i] = listtable[nheads];
217
+		listtable[nheads] = prev;
218
+	    }
219
+	    nheads++;
220
+	} else {
221
+	    prev->next_same = listtable[i];
222
+	    prev->next = NULL;
223
+	    prev = listtable[i];
224
+	}
225
+    }
226
+
227
+    cli_qsort(listtable, nheads, sizeof(listtable[0]), sort_heads_by_partno_fn);
228
+
229
+    /* Link heads in the next list */
230
+    node->list = listtable[0];
231
+    for(i=1; i<nheads; i++)
232
+	listtable[i-1]->next = listtable[i];
233
+    listtable[nheads-1]->next = NULL;
234
+}
210 235
 
211
-    }
236
+static void link_lists(struct cli_matcher *root) {
237
+   struct cli_ac_node *curnode;
238
+   unsigned int i, grouplen;
212 239
 
213
-    if(ph_add_after) {
214
-        new->next = ph_add_after->next;
215
-        ph_add_after->next = new;
216
-    } else {
217
-        new->next = pt->list;
218
-        pt->list = new;
219
-    }
240
+    if(!root->ac_lists)
241
+	return;
220 242
 
221
-    return CL_SUCCESS;
243
+    /* Group the list by owning node, pattern equality and sort by partno */
244
+    cli_qsort(root->ac_listtable, root->ac_lists, sizeof(root->ac_listtable[0]), sort_list_fn);
245
+
246
+    curnode = root->ac_listtable[0]->node;
247
+    for(i=1, grouplen = 1; i <= root->ac_lists; i++, grouplen++) {
248
+	if(i == root->ac_lists || root->ac_listtable[i]->node != curnode) {
249
+	    link_node_lists(&root->ac_listtable[i-grouplen], grouplen);
250
+	    if(i < root->ac_lists) {
251
+		grouplen = 0;
252
+		curnode = root->ac_listtable[i]->node;
253
+	    }
254
+	}
255
+    }
222 256
 }
223 257
 
224 258
 static inline struct cli_ac_node *add_new_node(struct cli_matcher *root, uint16_t i, uint16_t len)
... ...
@@ -492,6 +527,8 @@ int cli_ac_buildtrie(struct cli_matcher *root)
492 492
     if (root->filter)
493 493
         cli_dbgmsg("Using filter for trie %d\n", root->type);
494 494
 
495
+    link_lists(root);
496
+
495 497
     return ac_maketrans(root);
496 498
 }
497 499
 
... ...
@@ -108,7 +108,11 @@ struct cli_ac_patt {
108 108
 
109 109
 struct cli_ac_list {
110 110
     struct cli_ac_patt *me;
111
-    struct cli_ac_list *next, *next_same;
111
+    union {
112
+	struct cli_ac_node *node;
113
+	struct cli_ac_list *next;
114
+    };
115
+    struct cli_ac_list *next_same;
112 116
 };
113 117
 
114 118
 struct cli_ac_node {