Browse code

avoid false positives with outbind:// URLs

git-svn: trunk@3260

Török Edvin authored on 2007/10/03 05:19:31
Showing 3 changed files
... ...
@@ -1,3 +1,7 @@
1
+Tue Oct  2 21:29:03 EEST 2007 (edwin)
2
+-------------------------------------
3
+  * libclamav/phishcheck.[ch]: avoid false positives with outbind:// URLs
4
+
1 5
 Mon Oct  1 14:05:35 BST 2007 (njh)
2 6
 ----------------------------------
3 7
   * clamav-milter:	More informative message when SPF record is passed
... ...
@@ -117,8 +117,6 @@ Checks if realLink is http, but displayedLink is https or vice versa.
117 117
 
118 118
 10. Hostname of real URL is extracted.
119 119
 
120
-11. Skip cid: displayedLink urls (images embedded in mails).
121
-
122 120
 12. Numeric IP detection.
123 121
 If url is a numeric IP, then -> phish.
124 122
 Maybe we should do DNS lookup?
... ...
@@ -154,7 +152,6 @@ static const char aspnet[] = "asp.net";
154 154
 /* ; is replaced by ' ' so omit it here*/
155 155
 static const char lt[]="&lt";
156 156
 static const char gt[]="&gt";
157
-static const char cid[] = "cid:";
158 157
 static const char src_text[] = "src";
159 158
 static const char href_text[] = "href";
160 159
 static const char mailto[] = "mailto:";
... ...
@@ -162,7 +159,6 @@ static const char https[]="https://";
162 162
 
163 163
 static const size_t href_text_len = sizeof(href_text);
164 164
 static const size_t src_text_len = sizeof(src_text);
165
-static const size_t cid_len = sizeof(cid)-1;
166 165
 static const size_t dotnet_len = sizeof(dotnet)-1;
167 166
 static const size_t adonet_len = sizeof(adonet)-1;
168 167
 static const size_t aspnet_len = sizeof(aspnet)-1;
... ...
@@ -223,7 +219,7 @@ static const size_t https_len  = sizeof(https)-1;
223 223
 #define URI_fragmentaddress2 URI_URI2
224 224
 #define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
225 225
 
226
-#define URI_CHECK_PROTOCOLS "(http|https|ftp)://.+"
226
+#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
227 227
 
228 228
 /*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please.
229 229
  * there is fragmentaddress1, and 2  to work around the ISO limitation of 509 bytes max length for string constants*/
... ...
@@ -898,6 +894,7 @@ static void free_regex(regex_t* p)
898 898
 
899 899
 int phishing_init(struct cl_engine* engine)
900 900
 {
901
+	char *url_regex, *realurl_regex;
901 902
 	struct phishcheck* pchk;
902 903
 	if(!engine->phishcheck) {
903 904
 		pchk = engine->phishcheck = cli_malloc(sizeof(struct phishcheck));
... ...
@@ -934,20 +931,33 @@ int phishing_init(struct cl_engine* engine)
934 934
 		engine->phishcheck = NULL;
935 935
 		return CL_EFORMAT;	
936 936
 	}
937
-	pchk->url_regex = str_compose("^ *("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3"|"URI_CHECK_PROTOCOLS") *$");
938
-	if(build_regex(&pchk->preg,pchk->url_regex,1)) {
937
+	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
938
+	if(build_regex(&pchk->preg,url_regex,1)) {
939
+		free_regex(&pchk->preg_cctld);
940
+		free_regex(&pchk->preg_tld);
941
+		free(url_regex);
942
+		free(pchk);
943
+		engine->phishcheck = NULL;
944
+		return CL_EFORMAT;
945
+	}
946
+	free(url_regex);
947
+	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
948
+	if(build_regex(&pchk->preg_realurl, realurl_regex,1)) {
939 949
 		free_regex(&pchk->preg_cctld);
940 950
 		free_regex(&pchk->preg_tld);
941
-		free(pchk->url_regex);
951
+		free_regex(&pchk->preg);
952
+		free(url_regex);
953
+		free(realurl_regex);
942 954
 		free(pchk);
943 955
 		engine->phishcheck = NULL;
944 956
 		return CL_EFORMAT;
945 957
 	}
958
+	free(realurl_regex);
946 959
 	if(build_regex(&pchk->preg_numeric,numeric_url_regex,1)) {
947 960
 		free_regex(&pchk->preg_cctld);
948 961
 		free_regex(&pchk->preg_tld);
949 962
 		free_regex(&pchk->preg);
950
-		free(pchk->url_regex);
963
+		free_regex(&pchk->preg_realurl);
951 964
 		free(pchk);
952 965
 		engine->phishcheck = NULL;
953 966
 		return CL_EFORMAT;
... ...
@@ -967,10 +977,7 @@ void phishing_done(struct cl_engine* engine)
967 967
 		free_regex(&pchk->preg_cctld);
968 968
 		free_regex(&pchk->preg_tld);
969 969
 		free_regex(&pchk->preg_numeric);
970
-		if(pchk->url_regex) {
971
-			free(pchk->url_regex);
972
-			pchk->url_regex = NULL;
973
-		}
970
+		free_regex(&pchk->preg_realurl);
974 971
 		pchk->is_disabled = 1;
975 972
 	}
976 973
 	whitelist_done(engine);
... ...
@@ -985,13 +992,21 @@ void phishing_done(struct cl_engine* engine)
985 985
 
986 986
 /*
987 987
  * Only those URLs are identified as URLs for which phishing detection can be performed.
988
- * This means that no attempt is made to properly recognize 'cid:' URLs
989 988
  */
990 989
 static int isURL(const struct phishcheck* pchk,const char* URL)
991 990
 {
992 991
 	return URL ? !cli_regexec(&pchk->preg,URL,0,NULL,0) : 0;
993 992
 }
994 993
 
994
+/*
995
+ * Check if this is a real URL, i.e. whether it has a known URL scheme (http, https, ftp, mailto).
996
+ * This prevents false positives with outbind:// and blocked:: links.
997
+ */
998
+static int isRealURL(const struct phishcheck* pchk,const char* URL)
999
+{
1000
+	return URL ? !cli_regexec(&pchk->preg_realurl,URL,0,NULL,0) : 0;
1001
+}
1002
+
995 1003
 static int isNumericURL(const struct phishcheck* pchk,const char* URL)
996 1004
 {
997 1005
 	return URL ? !cli_regexec(&pchk->preg_numeric,URL,0,NULL,0) : 0;
... ...
@@ -1146,6 +1161,14 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1146 1146
 	if(whitelist_check(engine,urls,0))
1147 1147
 		return CL_PHISH_WHITELISTED;/* if url is whitelist don't perform further checks */
1148 1148
 
1149
+	if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) )&&
1150
+			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1151
+			  !(phishy&PHISHY_NUMERIC_IP))) {
1152
+		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
1153
+		free_if_needed(&host_url);
1154
+		return CL_PHISH_TEXTURL;
1155
+	}
1156
+
1149 1157
 	if(urls->flags&DOMAINLIST_REQUIRED && domainlist_match(engine,urls->realLink.data,urls->displayLink.data,NULL,0,&urls->flags))
1150 1158
 		phishy |= DOMAIN_LISTED;
1151 1159
 	else {
... ...
@@ -1226,23 +1249,11 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1226 1226
 		return CL_PHISH_HOST_NOT_LISTED;
1227 1227
 	}
1228 1228
 
1229
-	if(!strncmp(urls->displayLink.data,cid,cid_len))/* cid: image */{
1230
-		free_if_needed(&host_url);
1231
-		return CL_PHISH_CLEAN_CID;
1232
-	}
1233
-
1234 1229
 	if(whitelist_check(engine,&host_url,1)) {
1235 1230
 		free_if_needed(&host_url);
1236 1231
 		return CL_PHISH_HOST_WHITELISTED;
1237 1232
 	}
1238 1233
 
1239
-	if((!isURL(pchk, urls->displayLink.data) || !isURL(pchk, urls->realLink.data) )&&
1240
-			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1241
-			  !(phishy&PHISHY_NUMERIC_IP))) {
1242
-		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
1243
-		free_if_needed(&host_url);
1244
-		return CL_PHISH_TEXTURL;
1245
-	}
1246 1234
 
1247 1235
 	if(urls->flags&HOST_SUFFICIENT) {
1248 1236
 		if(!strcmp(urls->realLink.data,urls->displayLink.data)) {
... ...
@@ -56,11 +56,11 @@ struct string {
56 56
 
57 57
 struct phishcheck {
58 58
 	regex_t preg;
59
+	regex_t preg_realurl;
59 60
 	regex_t preg_tld;
60 61
 	regex_t preg_cctld;
61 62
 	regex_t preg_numeric;
62 63
 	regex_t preg_hexurl;
63
-	char*    url_regex;
64 64
 	int      is_disabled;
65 65
 };
66 66