git-svn: trunk@3257
Török Edvin authored on 2007/10/01 06:00:05... | ... |
@@ -1,3 +1,7 @@ |
1 |
+Sun Sep 30 23:18:11 EEST 2007 (edwin) |
|
2 |
+------------------------------------- |
|
3 |
+ * libclamav/phishcheck.c, regex_list.c: more improvements to the algorithm. |
|
4 |
+ |
|
1 | 5 |
Fri Sep 28 21:02:43 EEST 2007 (edwin) |
2 | 6 |
------------------------------------- |
3 | 7 |
* libclamav/regex_list.c: fix off by one substring logic. |
... | ... |
@@ -151,8 +151,9 @@ static const char cctld_regex[] = "^"iana_cctld"$"; |
151 | 151 |
static const char dotnet[] = ".net"; |
152 | 152 |
static const char adonet[] = "ado.net"; |
153 | 153 |
static const char aspnet[] = "asp.net"; |
154 |
-static const char lt[]="&lt;"; |
|
155 |
-static const char gt[]="&gt;"; |
|
154 |
+/* ; is replaced by ' ' so omit it here*/ |
|
155 |
+static const char lt[]="&lt"; |
|
156 |
+static const char gt[]="&gt"; |
|
156 | 157 |
static const char cid[] = "cid:"; |
157 | 158 |
static const char src_text[] = "src"; |
158 | 159 |
static const char href_text[] = "href"; |
... | ... |
@@ -723,9 +724,10 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) |
723 | 723 |
str_replace(begin,end,'\\','/'); |
724 | 724 |
/* some broken MUAs put > in the href, and then |
725 | 725 |
* we get a false positive, so remove them */ |
726 |
- str_replace(begin,end,'<','/'); |
|
727 |
- str_replace(begin,end,'>','/'); |
|
728 |
- str_strip(&begin,&end,"\"",1); |
|
726 |
+ str_replace(begin,end,'<',' '); |
|
727 |
+ str_replace(begin,end,'>',' '); |
|
728 |
+ str_replace(begin,end,'\"',' '); |
|
729 |
+ str_replace(begin,end,';',' '); |
|
729 | 730 |
str_strip(&begin,&end,lt,lt_len); |
730 | 731 |
str_strip(&begin,&end,gt,gt_len); |
731 | 732 |
/* convert hostname to lowercase, but only hostname! */ |
... | ... |
@@ -222,20 +222,22 @@ static void fatal_error(struct regex_matcher* matcher) |
222 | 222 |
|
223 | 223 |
static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info, const char* buffer, size_t pos) |
224 | 224 |
{ |
225 |
+ const char* str; |
|
225 | 226 |
size_t realpos = 0; |
226 | 227 |
if(!info) { |
227 |
- return buffer[pos]; |
|
228 |
+ return (pos <= strlen(buffer)) ? buffer[pos>0 ? pos-1:0] : '\0'; |
|
228 | 229 |
} |
229 |
- cli_dbgmsg("calc_pos_with_skip: skip:%u, %u - %u \"%s\",\"%s\"\n",pos,info->host_start,info->host_end,info->pre_displayLink.data,buffer); |
|
230 |
+ str = info->pre_displayLink.data; |
|
231 |
+ cli_dbgmsg("calc_pos_with_skip: skip:%u, %u - %u \"%s\",\"%s\"\n", pos, info->host_start, info->host_end, str, buffer); |
|
230 | 232 |
pos += info->host_start; |
231 |
- while(!isalnum(info->pre_displayLink.data[realpos])) realpos++; |
|
232 |
- for(; pos>0; pos--) { |
|
233 |
- while(info->pre_displayLink.data[realpos]==' ') realpos++; |
|
233 |
+ while(str[realpos] && !isalnum(str[realpos])) realpos++; |
|
234 |
+ for(; str[realpos] && (pos>0); pos--) { |
|
235 |
+ while(str[realpos]==' ') realpos++; |
|
234 | 236 |
realpos++; |
235 | 237 |
} |
236 |
- while(info->pre_displayLink.data[realpos]==' ') realpos++; |
|
237 |
- cli_dbgmsg("calc_pos_with_skip:%s\n",info->pre_displayLink.data+realpos); |
|
238 |
- return info->pre_displayLink.data[realpos>0?realpos-1:0]; |
|
238 |
+ while(str[realpos]==' ') realpos++; |
|
239 |
+ cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos); |
|
240 |
+ return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0]; |
|
239 | 241 |
} |
240 | 242 |
|
241 | 243 |
/* |
... | ... |
@@ -288,17 +290,18 @@ int regex_list_match(struct regex_matcher* matcher,const char* real_url,const ch |
288 | 288 |
rc = 0; |
289 | 289 |
|
290 | 290 |
for(i = 0; i < matcher->root_hosts_cnt; i++) { |
291 |
- /* needs to match terminating \0 too */ |
|
292 |
- rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len+1,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL); |
|
291 |
+ /* doesn't need to match terminating \0*/ |
|
292 |
+ rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL); |
|
293 | 293 |
cli_ac_freedata(&mdata); |
294 | 294 |
if(rc) { |
295 | 295 |
char c; |
296 | 296 |
const char* matched = strchr(*info,':'); |
297 | 297 |
const size_t match_len = matched ? strlen(matched+1) : 0; |
298 |
- if(match_len == buffer_len || /* full match */ |
|
298 |
+ if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') && |
|
299 |
+ (match_len == buffer_len || /* full match */ |
|
299 | 300 |
(match_len < buffer_len && |
300 | 301 |
((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) |
301 |
- /* subdomain matched*/) { |
|
302 |
+ /* subdomain matched*/)) { |
|
302 | 303 |
|
303 | 304 |
cli_dbgmsg("Got a match: %s with %s\n",buffer,*info); |
304 | 305 |
break; |
... | ... |
@@ -427,9 +430,8 @@ static int add_regex_list_element(struct cli_matcher* root,const char* pattern,c |
427 | 427 |
massert(root); |
428 | 428 |
massert(pattern); |
429 | 429 |
|
430 |
- len = strlen(pattern)+1; |
|
431 |
- /* need to match \0 too, so we are sure |
|
432 |
- * matches only happen at end of string */ |
|
430 |
+ len = strlen(pattern); |
|
431 |
+ /* need not to match \0 too */ |
|
433 | 432 |
new->type = 0; |
434 | 433 |
new->sigid = 0; |
435 | 434 |
new->parts = 0; |