Browse code

uniq update

git-svn: trunk@4069

aCaB authored on 2008/08/04 06:39:29
Showing 5 changed files
... ...
@@ -1,3 +1,8 @@
1
+Sun Aug  3 23:09:44 CEST 2008 (acab)
2
+------------------------------------
3
+  * libclamav/uniq: faster md5 lookup
4
+  * sigtool: sync
5
+
1 6
 Sun Aug  3 16:12:17 CEST 2008 (acab)
2 7
 ------------------------------------
3 8
   * libclamav: use md5 based lookup for ole2/vba instead of hashtab (bb#1071)
... ...
@@ -24,118 +24,21 @@
24 24
 #include "clamav-config.h"
25 25
 #endif
26 26
 
27
+#include <stdlib.h>
28
+#if HAVE_STRING_H
29
+#include <string.h>
30
+#endif
31
+
27 32
 #include "uniq.h"
33
+#include "others.h"
28 34
 #include "md5.h"
29 35
 
30
-#if 0
31 36
 struct uniq *uniq_init(uint32_t count) {
32 37
   struct uniq *U;
33
-  uint32_t i;
34 38
 
35 39
   if(!count) return NULL;
36 40
   U = cli_calloc(1, sizeof(*U));
37 41
   if(!U) return NULL;
38
-  if(cli_ac_init(&U->matcher, 16, 16)) {
39
-    uniq_free(U);
40
-    return NULL;
41
-  }
42
-  U->custs = cli_calloc(count, sizeof(U->custs));
43
-  if(!U->custs) {
44
-    uniq_free(U);
45
-    return NULL;
46
-  }
47
-  U->patts = cli_calloc(count, sizeof(U->patts));
48
-  if(!U->patts) {
49
-    uniq_free(U);
50
-    return NULL;
51
-  }
52
-  U->md5s = cli_malloc(count*sizeof(U->md5s));
53
-  if(!U->md5s) {
54
-    uniq_free(U);
55
-    return NULL;
56
-  }
57
-
58
-  U->entries = count;
59
-
60
-  for(i=0; i<count; i++) {
61
-    U->patts[i].pattern = U->md5s[i].md5;
62
-    U->patts[i].length = 16;
63
-    U->patts[i].ch[0] = U->patts[i].ch[1] |= CLI_MATCH_IGNORE;
64
-    U->patts[i].customdata = &U->custs[i];
65
-  }
66
-
67
-  return U;
68
-}
69
-
70
-void uniq_free(struct uniq *U) {
71
-  uint32_t i;
72
-  U->matcher.ac_patterns = 0; /* don't free my arrays! */
73
-  cli_ac_free(&U->matcher);
74
-  if(U->custs) free(U->custs);
75
-  if(U->patts) free(U->patts);
76
-  if(U->md5s) free(U->md5s);
77
-  free(U);
78
-}
79
-
80
-
81
-uint32_t uniq_add(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
82
-  uint8_t digest[16];
83
-  struct UNIQCUST *cust;
84
-  struct cli_ac_data mdata;
85
-
86
-  cli_md5_ctx md5;
87
-  cli_md5_init(&md5);
88
-  cli_md5_update(&md5, key, key_len);
89
-  cli_md5_final(digest, &md5);
90
-
91
-  cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN); /* This can't fail as we don't have parts or lsigs */
92
-  if (cli_ac_scanbuff(digest,16, NULL, (void *)&cust, NULL, &U->matcher, &mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL)!=CL_VIRUS) {
93
-    int i;
94
-    char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
95
-    struct cli_ac_patt *patt = &U->patts[U->matcher.ac_patterns];
96
-
97
-    cust = patt->customdata;
98
-    for(i = 0; i < 16; i++) {
99
-      cust->name[i*2] = HEX[digest[i]>>4 & 0xf];
100
-      cust->name[i*2+1] = HEX[digest[i] & 0xf];
101
-      patt->pattern[i] = digest[i];
102
-    }
103
-    cli_ac_addpatt(&U->matcher,patt); /* FIXME this can fail */
104
-    cli_ac_buildtrie(&U->matcher);
105
-  }
106
-
107
-  cust->count++;
108
-  if(rhash) *rhash = cust->name;
109
-  return cust->count;
110
-}
111
-
112
-uint32_t uniq_get(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
113
-  uint8_t digest[16];
114
-  struct UNIQCUST *cust;
115
-  struct cli_ac_data mdata;
116
-
117
-  cli_md5_ctx md5;
118
-  cli_md5_init(&md5);
119
-  cli_md5_update(&md5, key, key_len);
120
-  cli_md5_final(digest, &md5);
121
-
122
-  cli_ac_initdata(&mdata, 0, 0, AC_DEFAULT_TRACKLEN); /* This can't fail as we don't have parts or lsigs */
123
-  if (cli_ac_scanbuff(digest,16, NULL, (void *)&cust, NULL, &U->matcher, &mdata,0,0,-1,NULL,AC_SCAN_VIR,NULL)!=CL_VIRUS)
124
-    return 0;
125
-
126
-  if(rhash) *rhash = cust->name;
127
-  return cust->count;
128
-}
129
-
130
-#else
131
-#include <string.h>
132
-
133
-struct uniq *uniq_init(uint32_t count) {
134
-  struct uniq *U;
135
-  
136
-  if(!count) return NULL;
137
-  U = cli_malloc(sizeof(*U));
138
-  if(!U) return NULL;
139 42
 
140 43
   U->md5s = cli_malloc(count * sizeof(*U->md5s));
141 44
   if(!U->md5s) {
... ...
@@ -143,7 +46,6 @@ struct uniq *uniq_init(uint32_t count) {
143 143
     return NULL;
144 144
   }
145 145
 
146
-  U->items = 0;
147 146
   return U;
148 147
 }
149 148
 
... ...
@@ -156,50 +58,59 @@ uint32_t uniq_add(struct uniq *U, const char *key, uint32_t key_len, char **rhas
156 156
   unsigned int i;
157 157
   uint8_t digest[16];
158 158
   cli_md5_ctx md5;
159
-  struct UNIQMD5 *m;
159
+  struct UNIQMD5 *m = NULL;
160 160
 
161 161
   cli_md5_init(&md5);
162 162
   cli_md5_update(&md5, key, key_len);
163 163
   cli_md5_final(digest, &md5);
164 164
 
165
-  for(i=0; i<U->items; i++) {
166
-    if(memcmp(digest, U->md5s[i].md5, 16)) continue;
167
-    m = &U->md5s[i];
168
-    break;
169
-  }
165
+  if(U->items && U->md5s[U->idx[*digest]].md5[0]==*digest)
166
+    for(m=&U->md5s[U->idx[*digest]]; m; m=m->next)
167
+      if(!memcmp(&digest[1], &m->md5[1], 15)) break;
170 168
   
171
-  if(i==U->items) {
172
-    char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
173
-    m = &U->md5s[i];
169
+  if(!m) {
170
+    const char HEX[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
171
+
172
+    m = &U->md5s[U->items];
174 173
     m->count = 0;
174
+
175
+    if(U->items && U->md5s[U->idx[*digest]].md5[0]==*digest)
176
+      m->next = &U->md5s[U->idx[*digest]];
177
+    else
178
+      m->next = NULL;
179
+
180
+    U->idx[*digest]=U->items;
181
+
175 182
     for(i = 0; i < 16; i++) {
176 183
       m->name[i*2] = HEX[digest[i]>>4 & 0xf];
177 184
       m->name[i*2+1] = HEX[digest[i] & 0xf];
178
-      m->md5[i] = digest[i]; 
185
+      m->md5[i] = digest[i];
179 186
     }
180 187
     m->name[32] = '\0';
181 188
   }
182
-  
189
+
183 190
   U->items++;
184 191
   if(rhash) *rhash = m->name;
185 192
   return m->count++;
186 193
 }
187 194
 
188 195
 uint32_t uniq_get(struct uniq *U, const char *key, uint32_t key_len, char **rhash) {
189
-  unsigned int i;
190 196
   uint8_t digest[16];
191 197
   cli_md5_ctx md5;
198
+  struct UNIQMD5 *m = NULL;
192 199
 
193 200
   cli_md5_init(&md5);
194 201
   cli_md5_update(&md5, key, key_len);
195 202
   cli_md5_final(digest, &md5);
196 203
 
197
-  for(i=0; i<U->items; i++) {
198
-    if(memcmp(digest, U->md5s[i].md5, 16)) continue;
199
-    if(rhash) *rhash = U->md5s[i].name;
200
-    return U->md5s[i].count;
204
+  if(!U->items || U->md5s[U->idx[*digest]].md5[0]!=*digest)
205
+    return 0;
206
+
207
+  for(m=&U->md5s[U->idx[*digest]]; m; m=m->next) {
208
+    if(memcmp(&digest[1], &m->md5[1], 15)) continue;
209
+    if(rhash) *rhash = m->name;
210
+    return m->count;
201 211
   }
202 212
 
203 213
   return 0;
204 214
 }
205
-#endif
... ...
@@ -23,46 +23,25 @@
23 23
 #ifndef _UNIQ_H
24 24
 #define _UNIQ_H
25 25
 
26
-#include "matcher.h"
27 26
 #include "cltypes.h"
28 27
 
29
-#if 0
30
-struct UNIQCUST {
31
-  char name[33];
32
-  uint32_t count;
33
-};
34
-
35
-struct UNIQMD5 {
36
-  uint16_t md5[16];
37
-};
38
-
39
-/* A basic storage for unique IDs */
40
-struct uniq {
41
-  struct cli_matcher matcher;
42
-  struct cli_ac_patt *patts;
43
-  struct UNIQMD5 *md5s;
44
-  struct UNIQCUST *custs;
45
-  uint32_t entries;
46
-};
47
-
48
-#else
49
-
50 28
 struct UNIQMD5 {
29
+  struct UNIQMD5 *next;
51 30
   uint32_t count;
52 31
   uint8_t md5[16];
53 32
   char name[33];
54 33
 };
55 34
 
56 35
 struct uniq {
57
-  uint32_t items;
58 36
   struct UNIQMD5 *md5s;
37
+  uint32_t items;
38
+  uint32_t idx[256];
59 39
 };
60 40
 
61
-#endif
62
-
63 41
 struct uniq *uniq_init(uint32_t);
64 42
 void uniq_free(struct uniq *);
65 43
 uint32_t uniq_add(struct uniq *, const char *, uint32_t, char **);
66 44
 uint32_t uniq_get(struct uniq *, const char *, uint32_t, char **);
67 45
 
46
+
68 47
 #endif
... ...
@@ -1041,9 +1041,9 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
1041 1041
     DIR *dd;
1042 1042
     struct dirent *dent;
1043 1043
     struct stat statbuf;
1044
-    char *fullname, vbaname[1024];
1044
+    char *fullname, vbaname[1024], *hash;
1045 1045
     unsigned char *data;
1046
-    uint32_t hashcnt, hash;
1046
+    uint32_t hashcnt;
1047 1047
 
1048 1048
     hashcnt = uniq_get(U, "_vba_project", 12, NULL);
1049 1049
     while(hashcnt--) {
... ...
@@ -1051,7 +1051,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
1051 1051
 
1052 1052
 	for(i = 0; i < vba_project->count; i++) {
1053 1053
 	    for(j = 0; j < vba_project->colls[i]; j++) {
1054
-		snprintf(vbaname, 1024, "%s/%u_%u", vba_project->dir, vba_project->name[i], j);
1054
+		snprintf(vbaname, 1024, "%s/%s_%u", vba_project->dir, vba_project->name[i], j);
1055 1055
 		vbaname[sizeof(vbaname)-1] = '\0';
1056 1056
 		fd = open(vbaname, O_RDONLY|O_BINARY);
1057 1057
 		if(fd == -1) continue;
... ...
@@ -1077,7 +1077,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
1077 1077
 
1078 1078
     if((hashcnt = uniq_get(U, "powerpoint document", 19, &hash))) {
1079 1079
 	while(hashcnt--) {
1080
-	    snprintf(vbaname, 1024, "%s/%u_%u", dirname, hash, hashcnt);
1080
+	    snprintf(vbaname, 1024, "%s/%s_%u", dirname, hash, hashcnt);
1081 1081
 	    vbaname[sizeof(vbaname)-1] = '\0';
1082 1082
 	    fd = open(vbaname, O_RDONLY|O_BINARY);
1083 1083
 	    if (fd == -1) continue;
... ...
@@ -1093,7 +1093,7 @@ int sigtool_vba_scandir (const char *dirname, int hex_output, struct uniq *U)
1093 1093
 
1094 1094
     if ((hashcnt = uniq_get(U, "worddocument", 12, &hash))) {
1095 1095
 	while(hashcnt--) {
1096
-	    snprintf(vbaname, sizeof(vbaname), "%s/%u_%u", dirname, hash, hashcnt);
1096
+	    snprintf(vbaname, sizeof(vbaname), "%s/%s_%u", dirname, hash, hashcnt);
1097 1097
 	    vbaname[sizeof(vbaname)-1] = '\0';
1098 1098
 	    fd = open(vbaname, O_RDONLY|O_BINARY);
1099 1099
 	    if (fd == -1) continue;
... ...
@@ -20,7 +20,7 @@
20 20
 #ifndef __VBA_H
21 21
 #define __VBA_H
22 22
 
23
-#include "libclamav/hashtab.h"
23
+#include "libclamav/uniq.h"
24 24
 int sigtool_vba_scandir(const char *dirname, int hex_output, struct uniq *U);
25 25
 
26 26
 #endif