Browse code

more improvements to the algorithm.

git-svn: trunk@3257

Török Edvin authored on 2007/10/01 06:00:05
Showing 3 changed files
... ...
@@ -1,3 +1,7 @@
1
+Sun Sep 30 23:18:11 EEST 2007 (edwin)
2
+-------------------------------------
3
+  * libclamav/phishcheck.c, regex_list.c: more improvements to the algorithm.
4
+
1 5
 Fri Sep 28 21:02:43 EEST 2007 (edwin)
2 6
 -------------------------------------
3 7
   * libclamav/regex_list.c: fix off by one substring logic.
... ...
@@ -151,8 +151,9 @@ static const char cctld_regex[] = "^"iana_cctld"$";
151 151
 static const char dotnet[] = ".net";
152 152
 static const char adonet[] = "ado.net";
153 153
 static const char aspnet[] = "asp.net";
154
-static const char lt[]="<";
155
-static const char gt[]=">";
154
+/* cleanupURL() replaces ';' with ' ', so the trailing ';' of each entity is omitted here */
155
+static const char lt[]="&lt";
156
+static const char gt[]="&gt";
156 157
 static const char cid[] = "cid:";
157 158
 static const char src_text[] = "src";
158 159
 static const char href_text[] = "href";
... ...
@@ -723,9 +724,10 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
723 723
 		str_replace(begin,end,'\\','/');
724 724
 		/* some broken MUAs put > in the href, and then
725 725
 		 * we get a false positive, so remove them */
726
-		str_replace(begin,end,'<','/');
727
-		str_replace(begin,end,'>','/');
728
-		str_strip(&begin,&end,"\"",1);
726
+		str_replace(begin,end,'<',' ');
727
+		str_replace(begin,end,'>',' ');
728
+		str_replace(begin,end,'\"',' ');
729
+		str_replace(begin,end,';',' ');
729 730
 		str_strip(&begin,&end,lt,lt_len);
730 731
 		str_strip(&begin,&end,gt,gt_len);
731 732
 		/* convert hostname to lowercase, but only hostname! */
... ...
@@ -222,20 +222,22 @@ static void fatal_error(struct regex_matcher* matcher)
222 222
 
223 223
 static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info, const char* buffer, size_t pos)
224 224
 {
225
+	const char* str;
225 226
 	size_t realpos = 0;
226 227
 	if(!info) {
227
-		return buffer[pos];
228
+		return (pos <= strlen(buffer)) ? buffer[pos>0 ? pos-1:0] : '\0';
228 229
 	}
229
-	cli_dbgmsg("calc_pos_with_skip: skip:%u, %u - %u \"%s\",\"%s\"\n",pos,info->host_start,info->host_end,info->pre_displayLink.data,buffer);
230
+	str = info->pre_displayLink.data;
231
+	cli_dbgmsg("calc_pos_with_skip: skip:%u, %u - %u \"%s\",\"%s\"\n", pos, info->host_start, info->host_end, str, buffer);
230 232
 	pos += info->host_start;
231
-	while(!isalnum(info->pre_displayLink.data[realpos])) realpos++;
232
-	for(; pos>0; pos--) {
233
-		while(info->pre_displayLink.data[realpos]==' ') realpos++;
233
+	while(str[realpos] && !isalnum(str[realpos])) realpos++;
234
+	for(; str[realpos] && (pos>0); pos--) {
235
+		while(str[realpos]==' ') realpos++;
234 236
 		realpos++;
235 237
 	}
236
-	while(info->pre_displayLink.data[realpos]==' ') realpos++;
237
-	cli_dbgmsg("calc_pos_with_skip:%s\n",info->pre_displayLink.data+realpos);	
238
-	return info->pre_displayLink.data[realpos>0?realpos-1:0];
238
+	while(str[realpos]==' ') realpos++;
239
+	cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);	
240
+	return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
239 241
 }
240 242
 
241 243
 /*
... ...
@@ -288,17 +290,18 @@ int regex_list_match(struct regex_matcher* matcher,const char* real_url,const ch
288 288
 			rc = 0;
289 289
 
290 290
 			for(i = 0; i < matcher->root_hosts_cnt; i++) {
291
-				/* needs to match terminating \0 too */
292
-				rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len+1,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL);
291
+				/* doesn't need to match terminating \0*/
292
+				rc = cli_ac_scanbuff((unsigned char*)buffer,buffer_len,info, &matcher->root_hosts[i] ,&mdata,0,0,0,-1,NULL);
293 293
 				cli_ac_freedata(&mdata);
294 294
 				if(rc) {
295 295
 					char c;
296 296
 					const char* matched = strchr(*info,':');	
297 297
 					const size_t match_len = matched ? strlen(matched+1) : 0;
298
-					if(match_len == buffer_len || /* full match */
298
+					if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
299
+						(match_len == buffer_len || /* full match */
299 300
 					        (match_len < buffer_len &&
300 301
 						((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) 
301
-						/* subdomain matched*/) {
302
+						/* subdomain matched*/)) {
302 303
 
303 304
 						cli_dbgmsg("Got a match: %s with %s\n",buffer,*info);
304 305
 						break;
... ...
@@ -427,9 +430,8 @@ static int add_regex_list_element(struct cli_matcher* root,const char* pattern,c
427 427
        massert(root);
428 428
        massert(pattern);
429 429
 
430
-       len = strlen(pattern)+1;
431
-       /* need to match \0 too, so we are sure
432
-	* matches only happen at end of string */
430
+       len = strlen(pattern);
431
+       /* no need to match the terminating \0 */
433 432
        new->type = 0;
434 433
        new->sigid = 0;
435 434
        new->parts = 0;