Browse code

when domain matches, preserve full subdomain (bb #721)

git-svn: trunk@3560

Török Edvin authored on 2008/01/31 03:44:07
Showing 3 changed files
... ...
@@ -1,3 +1,8 @@
1
+Wed Jan 30 20:23:20 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/phishcheck.c, regex_list.c: when domain matchers, preserve full
4
+  subdomain(bb #721)
5
+
1 6
 Tue Jan 29 17:50:05 GMT 2008 (njh)
2 7
 ----------------------------------
3 8
   * libclamav/tnef.c:	Handle trailing CR and change handling of truncated
... ...
@@ -242,6 +242,7 @@ static const short int hextable[256] = {
242 242
 
243 243
 /* Prototypes*/
244 244
 static void string_init_c(struct string* dest,char* data);
245
+static int string_assign_concatenated(struct string* dest, const char* prefix, const char* begin, const char* end);
245 246
 static void string_assign_null(struct string* dest);
246 247
 static char *rfind(char *start, char c, size_t len);
247 248
 static char hex2int(const unsigned char* src);
... ...
@@ -298,19 +299,32 @@ static void string_init_c(struct string* dest,char* data)
298 298
 	dest->ref = NULL;
299 299
 }
300 300
 
301
+/* assigns to @dest the string made from concatenating @prefix with the string between @begin and @end */
302
+static int string_assign_concatenated(struct string* dest, const char* prefix, const char* begin, const char* end)
303
+{
304
+	const size_t prefix_len = strlen(prefix);
305
+	char* ret = cli_malloc(prefix_len + end - begin + 1);
306
+	if(!ret)
307
+		return CL_EMEM;
308
+	strncpy(ret, prefix, prefix_len);
309
+	strncpy(ret+prefix_len, begin, end-begin);
310
+	ret[prefix_len+end-begin]='\0';
311
+	string_free(dest);
312
+	string_init_c(dest, ret);
313
+	return CL_SUCCESS;
314
+}
315
+
301 316
 /* make a copy of the string between start -> end*/
302 317
 static int string_assign_dup(struct string* dest,const char* start,const char* end)
303 318
 {
304
-	char*	    ret  = cli_malloc(end-start+1);
319
+	char* ret  = cli_malloc(end-start+1);
305 320
 	if(!ret)
306 321
 		return CL_EMEM;
307 322
 	strncpy(ret,start,end-start);
308 323
 	ret[end-start]='\0';
309 324
 
310 325
 	string_free(dest);
311
-	dest->data=ret;
312
-	dest->refcount=1;
313
-	dest->ref=NULL;
326
+	string_init_c(dest, ret);
314 327
 	return CL_SUCCESS;
315 328
 }
316 329
 
... ...
@@ -745,11 +759,10 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
745 745
 		}
746 746
 		if(!isReal) {
747 747
 			str_fixup_spaces(&begin,&end);
748
-			if (( rc = string_assign_dup(URL,begin,end+1) )) {
748
+			if (( rc = string_assign_dup(URL, begin, end+1) )) {
749 749
 				return rc;
750 750
 			}
751 751
 		}
752
-		/*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/
753 752
 	}
754 753
 	return 0;
755 754
 }
... ...
@@ -765,6 +778,7 @@ static int found_possibly_unwanted(cli_ctx* ctx)
765 765
 
766 766
 int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
767 767
 {
768
+	/* TODO: get_host and then apply regex, etc. */
768 769
 	int i;
769 770
 	struct phishcheck* pchk = (struct phishcheck*) ctx->engine->phishcheck;
770 771
 	/* check for status of whitelist fatal error, etc. */
... ...
@@ -1003,6 +1017,7 @@ static enum phish_status cleanupURLs(struct url_check* urls)
1003 1003
 {
1004 1004
 	if(urls->flags&CLEANUP_URL) {
1005 1005
 		cleanupURL(&urls->realLink,NULL,1);
1006
+
1006 1007
 		cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0);
1007 1008
 		if(!urls->displayLink.data || !urls->realLink.data)
1008 1009
 			return CL_PHISH_NODECISION;
... ...
@@ -1024,7 +1039,7 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
1024 1024
 	if(!start || !end) {
1025 1025
 		string_assign_null(host);
1026 1026
 	}
1027
-	else if(( rc = string_assign_dup(host,start,end) )) {
1027
+	else if(( rc = string_assign_concatenated(host, ".", start, end) )) {
1028 1028
 		return rc;
1029 1029
 	}
1030 1030
 
... ...
@@ -1110,6 +1125,8 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1110 1110
 		return rc < 0 ? rc : CL_PHISH_CLEAN;
1111 1111
 	}
1112 1112
 
1113
+	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
1114
+		urls->displayLink.data);
1113 1115
 	if(whitelist_check(engine, urls, 0))
1114 1116
 		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */
1115 1117
 
... ...
@@ -238,7 +238,7 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info
238 238
 		realpos++;
239 239
 	}
240 240
 	while(str[realpos]==' ') realpos++;
241
-	cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);	
241
+	cli_dbgmsg("calc_pos_with_skip:%s\n",str+realpos);
242 242
 	return (pos>0 && !str[realpos]) ? '\0' : str[realpos>0?realpos-1:0];
243 243
 }
244 244
 
... ...
@@ -257,6 +257,7 @@ static inline size_t get_char_at_pos_with_skip(const struct pre_fixup_info* info
257 257
  */
258 258
 int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* display_url,const struct pre_fixup_info* pre_fixup,int hostOnly,const char** info,int is_whitelist)
259 259
 {
260
+	char* orig_real_url = real_url;
260 261
 	massert(matcher);
261 262
 	massert(real_url);
262 263
 	massert(display_url);
... ...
@@ -264,6 +265,9 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
264 264
 	if(!matcher->list_inited)
265 265
 		return 0;
266 266
 	massert(matcher->list_built);
267
+	/* skip initial '.' inserted by get_host */
268
+	if(real_url[0] == '.') real_url++;
269
+	if(display_url[0] == '.') display_url++;
267 270
 	{
268 271
 		size_t real_len    = strlen(real_url);
269 272
 		size_t display_len = strlen(display_url);
... ...
@@ -280,7 +284,7 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
280 280
 		buffer[real_len]= (!is_whitelist && hostOnly) ? '\0' : ':';
281 281
 		if(!hostOnly || is_whitelist) {
282 282
 			strncpy(buffer+real_len+1,display_url,display_len);
283
-			if(is_whitelist) 
283
+			if(is_whitelist)
284 284
 				buffer[buffer_len - 1] = '/';
285 285
 			buffer[buffer_len]=0;
286 286
 		}
... ...
@@ -297,30 +301,40 @@ int regex_list_match(struct regex_matcher* matcher,char* real_url,const char* di
297 297
 				cli_ac_freedata(&mdata);
298 298
 				if(rc) {
299 299
 					char c;
300
-					const char* matched = strchr(*info,':');	
300
+					const char* matched = strchr(*info,':');
301 301
 					const size_t match_len = matched ? strlen(matched+1) : 0;
302 302
 					if(((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len+1))==' ' || c=='\0' || c=='/' || c=='?') &&
303 303
 						(match_len == buffer_len || /* full match */
304 304
 					        (match_len < buffer_len &&
305
-						((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) ) 
305
+						((c=get_char_at_pos_with_skip(pre_fixup,buffer,buffer_len-match_len))=='.' || (c==' ')) )
306 306
 						/* subdomain matched*/)) {
307 307
 
308
-						cli_dbgmsg("Got a match: %s with %s\n",buffer,*info);
309
-						cli_dbgmsg("Before inserting .: %s\n",real_url);
308
+						cli_dbgmsg("Got a match: %s with %s\n", buffer, *info);
309
+						cli_dbgmsg("Before inserting .: %s\n", orig_real_url);
310 310
 						if(real_len >= match_len + 1) {
311
-							real_url[real_len-match_len-1]='.';
312
-							cli_dbgmsg("After inserting .: %s\n",real_url);
311
+							const size_t pos = real_len - match_len - 1;
312
+							if(real_url[pos] != '.') {
313
+								cli_dbgmsg("No dot here:%s\n",real_url+pos);
314
+								/* we need to shift left, and insert a '.'
315
+								 * we have an extra '.' at the beginning inserted by get_host to have room,
316
+								 * orig_real_url has to be used here, 
317
+								 * because we want to overwrite that extra '.' */
318
+								size_t orig_real_len = strlen(orig_real_url);
319
+								real_url = orig_real_url;
320
+								memmove(real_url, real_url+1, orig_real_len-match_len-1);
321
+								real_url[orig_real_len-match_len-1]='.';
322
+								cli_dbgmsg("After inserting .: %s\n", real_url);
323
+							}
313 324
 						}
314 325
 						break;
315 326
 					}
316
-					cli_dbgmsg("Ignoring false match: %s with %s,%c\n",buffer,*info,c);
327
+					cli_dbgmsg("Ignoring false match: %s with %s, mismatched character: %c\n", buffer, *info, c);
317 328
 					rc=0;
318 329
 				}
319 330
 			}
320 331
 		} else
321 332
 			rc = 0;
322
-    
323
-		if(!rc) 
333
+		if(!rc)
324 334
 			rc = match_node(hostOnly ? matcher->root_regex_hostonly : matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
325 335
 		free(buffer);
326 336
 		if(!rc)