Browse code

Remove the "all domain scan" feature from phishcheck (--no-phishing-restrictedscan). Nobody is using it. We don't care why a URL is clean; just state that it is clean. Various cleanups resulting from this. Prepare for selectively turning on sub-features.

git-svn: trunk@3369

Török Edvin authored on 2007/11/15 05:53:42
Showing 8 changed files
... ...
@@ -1,3 +1,12 @@
1
+Wed Nov 14 21:56:33 EET 2007 (edwin)
2
+------------------------------------
3
+  * libclamav/phishcheck.[ch]: 
4
+    remove "all domain scan" feature from phishcheck
5
+    (--no-phishing-restrictedscan). Nobody is using it.
6
+    Don't care why an url is clean, just state it is clean.
7
+    Various cleanups resulting from this.
8
+    Prepare to introduce selective turn on of sub-features.
9
+
1 10
 Mon Nov 12 17:47:21 GMT 2007 (njh)
2 11
 ----------------------------------
3 12
   * libclamav/vba_extract.c:	Removed more unused code
... ...
@@ -434,19 +434,6 @@ int acceptloop_th(int *socketds, int nsockets, struct cl_engine *engine, unsigne
434 434
 
435 435
     if(cfgopt(copt,"PhishingScanURLs")->enabled) {
436 436
 
437
-	if(cfgopt(copt,"PhishingRestrictedScan")->enabled) {
438
-	    /* we don't scan urls from all domains, just those listed in
439
-	     * .pdb file. This is the safe default
440
-	     */
441
-	    options |= CL_SCAN_PHISHING_DOMAINLIST;
442
-	} else {
443
-	    /* This is a false positive prone option, since newsletters, etc.
444
-	     * often contain links that will be classified as phishing attempts,
445
-	     * even though the site they link to isn't a phish site.
446
-	     */
447
-	    logg("Phishing: Checking all URLs, regardless of domain (FP prone).\n");
448
-	}
449
-
450 437
 	if(cfgopt(copt,"PhishingAlwaysBlockCloak")->enabled) {
451 438
 	    options |= CL_SCAN_PHISHING_BLOCKCLOAK; 
452 439
 	    logg("Phishing: Always checking for cloaked urls\n");
... ...
@@ -174,10 +174,6 @@ int scanmanager(const struct optstruct *opt)
174 174
 
175 175
     if(!opt_check(opt,"no-phishing-scan-urls"))
176 176
 	dboptions |= CL_DB_PHISHING_URLS;
177
-    if(!opt_check(opt,"no-phishing-restrictedscan")) {
178
-	/* not scanning all domains, check only URLs with domains from .pdb */
179
-	options |= CL_SCAN_PHISHING_DOMAINLIST;
180
-    }
181 177
     if(opt_check(opt,"phishing-ssl")) {
182 178
 	options |= CL_SCAN_PHISHING_BLOCKSSL;
183 179
     }
... ...
@@ -255,11 +255,6 @@ Scan URLs found in mails for phishing attempts using heuristics. This will class
255 255
 .br
256 256
 Default: yes
257 257
 .TP
258
-\fBPhishingRestrictedScan BOOL\fR
259
-Use phishing detection only for domains listed in the .pdb database. It is not recommended to have this option turned off, because scanning of all domains may lead to many false positives!
260
-.br
261
-Default: yes
262
-.TP
263 258
 \fBPhishingAlwaysBlockSSLMismatch BOOL\fR
264 259
 Always block SSL mismatches in URLs, even if the URL isn't in the database. This can lead to false positives.
265 260
 .br
... ...
@@ -237,12 +237,6 @@ LocalSocket /tmp/clamd.socket
237 237
 # Default: yes
238 238
 #PhishingScanURLs yes
239 239
 
240
-# Use phishing detection only for domains listed in the .pdb database. It is
241
-# not recommended to have this option turned off, because scanning of all
242
-# domains may lead to many false positives!
243
-# Default: yes
244
-#PhishingRestrictedScan yes
245
-
246 240
 # Always block SSL mismatches in URLs, even if the URL isn't in the database.
247 241
 # This can lead to false positives.
248 242
 #
... ...
@@ -85,14 +85,13 @@ extern "C"
85 85
 #define CL_SCAN_MAILURL		    0x80
86 86
 #define CL_SCAN_BLOCKMAX	    0x100
87 87
 #define CL_SCAN_ALGORITHMIC	    0x200
88
-#define CL_SCAN_PHISHING_DOMAINLIST 0x400
89 88
 #define CL_SCAN_PHISHING_BLOCKSSL   0x800 /* ssl mismatches, not ssl by itself*/
90 89
 #define CL_SCAN_PHISHING_BLOCKCLOAK 0x1000
91 90
 #define CL_SCAN_ELF		    0x2000
92 91
 #define CL_SCAN_PDF		    0x4000
93 92
 
94 93
 /* recommended scan settings */
95
-#define CL_SCAN_STDOPT		(CL_SCAN_ARCHIVE | CL_SCAN_MAIL | CL_SCAN_OLE2 | CL_SCAN_HTML | CL_SCAN_PE | CL_SCAN_ALGORITHMIC | CL_SCAN_ELF | CL_SCAN_PHISHING_DOMAINLIST) 
94
+#define CL_SCAN_STDOPT		(CL_SCAN_ARCHIVE | CL_SCAN_MAIL | CL_SCAN_OLE2 | CL_SCAN_HTML | CL_SCAN_PE | CL_SCAN_ALGORITHMIC | CL_SCAN_ELF)
96 95
 
97 96
 /* aliases for backward compatibility */
98 97
 #define CL_RAW		CL_SCAN_RAW
... ...
@@ -64,34 +64,30 @@
64 64
 * Phishing design documentation,
65 65
 (initially written at http://wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
66 66
 
67
-TODO:update this doc
68
-
69
-*Warning*: if flag *--phish-scan-alldomains* (or equivalent clamd/clamav-milter config option) isn't given, then phishing scanning is done only for domains listed in daily.pdb.
70
-If your daily.pdb is empty, then by default NO PHISHING is DONE, UNLESS you give the *--phish-scan-alldomains*
71
-This is just a side-effect, daily.pdb is empty, because it isn't yet officialy in daily.cvd.
67
+TODO: update this doc whenever behaviour changes
72 68
 
73 69
 phishingCheck() determines if @displayedLink is  a legit representation of @realLink.
74 70
 
75 71
 Steps:
76 72
 
77
-1. if _realLink_ *==* _displayLink_ => *CLEAN*
73
+1. if _realLink_ == _displayLink_ => CLEAN
78 74
 
79 75
 2. url cleanup (normalization)
80 76
 - whitespace elimination
77
+ strip all spaces, and leading and trailing garbage.
78
+ When matching we have to take into account whether we stripped any spaces or not.
79
+ See str_fixup_spaces.
81 80
 - html entity conversion
81
+- handle hex-encoded characters
82 82
 - convert hostname to lowercase
83 83
 - normalize \ to /
84
-If there is a dot after the last space, then all spaces are replaced with dots,
85
-otherwise spaces are stripped.
86
-So both: 'Go to yahoo.com', and 'Go to e b a y . c o m', and 'Go to ebay. com' will work.
87
-
88 84
 
89 85
 3. Match the URLs against a _whitelist_:
90 86
 a _realLink_, _displayedLink_ pair is matched against the _whitelist_.
91 87
 the _whitelist_ is a list of pairs of realLink, displayedLink. Any of the elements of those pairs can be a _regex_.
92 88
  if url *is found* in _whitelist_ --> *CLEAN*
93 89
 
94
-4. URL is looked up in the _domainlist_, unless disabled via flags (_--phish-scan-alldomains_).
90
+4. URL is looked up in the _domainlist_
95 91
 The _domainlist_ is a list of pairs of realLink, displayedLink (any of which can be regex).
96 92
 This is the list of domains we do phishing detection for (such as ebay,paypal,chase,....)
97 93
 We can't decide at this point whether to stop processing, so we just set a flag.
... ...
@@ -120,7 +116,6 @@ Checks if realLink is http, but displayedLink is https or viceversa.
120 120
 12. Numeric IP detection.
121 121
 If url is a numeric IP, then -> phish.
122 122
 Maybe we should do DNS lookup?
123
-Maybe we should disable numericIP checks for --phish-scan-alldomains?
124 123
 
125 124
 13. isURL(displayedLink).
126 125
 Checks if displayedLink is really a url.
... ...
@@ -227,21 +222,21 @@ static const char numeric_url_regex[] = "^ *"URI_numeric_fragmentaddress" *$";
227 227
 
228 228
 /* generated by contrib/phishing/generate_tables.c */
229 229
 static const short int hextable[256] = {
230
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
231
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
232
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
233
-       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
234
-       0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
235
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
236
-       0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
237
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
238
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
239
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
240
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
241
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
242
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
243
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
244
-       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
230
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
231
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
232
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
233
+       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
234
+       0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
235
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
236
+       0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
237
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
238
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
239
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
240
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
241
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
242
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
243
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
244
+       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
245 245
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0
246 246
 };
247 247
 
... ...
@@ -352,13 +347,13 @@ static int build_regex(regex_t* preg,const char* regex,int nosub)
352 352
 	cli_dbgmsg("Phishcheck: Compiling regex: %s\n",regex);
353 353
 	rc = cli_regcomp(preg,regex,REG_EXTENDED|REG_ICASE|(nosub ? REG_NOSUB :0));
354 354
 	if(rc) {
355
-	
355
+
356 356
 #ifdef	C_WINDOWS
357 357
 		cli_errmsg("Phishcheck: Error in compiling regex, disabling phishing checks\n");
358 358
 #else
359 359
 		size_t buflen =	cli_regerror(rc,preg,NULL,0);
360 360
 		char *errbuf = cli_malloc(buflen);
361
-		
361
+
362 362
 		if(errbuf) {
363 363
 			cli_regerror(rc,preg,errbuf,buflen);
364 364
 			cli_errmsg("Phishcheck: Error in compiling regex:%s\nDisabling phishing checks\n",errbuf);
... ...
@@ -682,7 +677,7 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
682 682
 	char *begin = URL->data;
683 683
 	const char *end;
684 684
 	size_t len;
685
-	
685
+
686 686
 	clear_msb(begin);
687 687
 	/*if(begin == NULL)
688 688
 		return;*/
... ...
@@ -782,16 +777,13 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
782 782
 		if(hrefs->contents[i]) {
783 783
 			struct url_check urls;
784 784
 			enum phish_status rc;
785
-			urls.always_check_flags = DOMAINLIST_REQUIRED;/* required to work correctly */
786 785
 			urls.flags	 = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS;
787 786
 			urls.link_type   = 0;
788 787
 			if(!strncmp((char*)hrefs->tag[i],src_text,src_text_len)) {
789 788
 				if (!(urls.flags&CHECK_IMG_URL))
790 789
 				continue;
791
-				urls.link_type |= LINKTYPE_IMAGE; 
790
+				urls.link_type |= LINKTYPE_IMAGE;
792 791
 			}
793
-			if (ctx->options&CL_SCAN_PHISHING_DOMAINLIST)
794
-				urls.flags |= DOMAINLIST_REQUIRED;
795 792
 			if (ctx->options & CL_SCAN_PHISHING_BLOCKSSL) {
796 793
 				urls.always_check_flags |= CHECK_SSL;
797 794
 			}
... ...
@@ -823,20 +815,6 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
823 823
 			switch(rc)/*TODO: support flags from ctx->options,*/
824 824
 				{
825 825
 					case CL_PHISH_CLEAN:
826
-					case CL_PHISH_CLEANUP_OK:
827
-					case CL_PHISH_HOST_OK:
828
-					case CL_PHISH_DOMAIN_OK:
829
-					case CL_PHISH_REDIR_OK:
830
-					case CL_PHISH_HOST_REDIR_OK:
831
-					case CL_PHISH_DOMAIN_REDIR_OK:
832
-					case CL_PHISH_HOST_REVERSE_OK:
833
-					case CL_PHISH_DOMAIN_REVERSE_OK:
834
-					case CL_PHISH_WHITELISTED:
835
-					case CL_PHISH_HOST_WHITELISTED:
836
-					case CL_PHISH_MAILTO_OK:
837
-					case CL_PHISH_TEXTURL:
838
-					case CL_PHISH_HOST_NOT_LISTED:
839
-					case CL_PHISH_CLEAN_CID:
840 826
 						continue;
841 827
 /*						break;*/
842 828
 					case CL_PHISH_HEX_URL:
... ...
@@ -935,7 +913,7 @@ int phishing_init(struct cl_engine* engine)
935 935
 		free_regex(&pchk->preg_cctld);
936 936
 		free(pchk);
937 937
 		engine->phishcheck = NULL;
938
-		return CL_EFORMAT;	
938
+		return CL_EFORMAT;
939 939
 	}
940 940
 	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
941 941
 	if(build_regex(&pchk->preg,url_regex,1)) {
... ...
@@ -992,7 +970,7 @@ void phishing_done(struct cl_engine* engine)
992 992
 		cli_dbgmsg("Freeing phishcheck struct\n");
993 993
 		free(pchk);
994 994
 		engine->phishcheck = NULL;
995
-	}		
995
+	}
996 996
 	cli_dbgmsg("Phishcheck cleaned up\n");
997 997
 }
998 998
 
... ...
@@ -1029,7 +1007,7 @@ static enum phish_status cleanupURLs(struct url_check* urls)
1029 1029
 		if(!urls->displayLink.data || !urls->realLink.data)
1030 1030
 			return CL_PHISH_NODECISION;
1031 1031
 		if(!strcmp(urls->realLink.data,urls->displayLink.data))
1032
-			return CL_PHISH_CLEANUP_OK;
1032
+			return CL_PHISH_CLEAN;
1033 1033
 	}
1034 1034
 	return CL_PHISH_NODECISION;
1035 1035
 }
... ...
@@ -1046,33 +1024,31 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
1046 1046
 	if(!start || !end) {
1047 1047
 		string_assign_null(host);
1048 1048
 	}
1049
-	else {
1050
-		if(( rc = string_assign_dup(host,start,end) ))
1051
-			return rc;
1049
+	else if(( rc = string_assign_dup(host,start,end) )) {
1050
+		return rc;
1052 1051
 	}
1052
+
1053 1053
 	cli_dbgmsg("Phishcheck:host:%s\n", host->data);
1054
-	if(!isReal) {
1055
-		url->pre_fixup.host_start = start - URL;
1056
-		url->pre_fixup.host_end = end - URL;
1057
-	}
1058
-	if(!host->data)
1059
-		return CL_PHISH_CLEANUP_OK;
1060
-	if(*phishy&REAL_IS_MAILTO)
1061
-		return CL_PHISH_MAILTO_OK;
1062
-	if(strchr(host->data,' ')) {
1063
-		string_free(host);
1064
-		return CL_PHISH_TEXTURL;
1054
+
1055
+	if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
1056
+		/* no host,
1057
+		 * link without domain, such as: href="/isapi.dll?...
1058
+		 * mailto:
1059
+		 * spaces in hostname
1060
+		 */
1061
+		return CL_PHISH_CLEAN;
1065 1062
 	}
1066 1063
 	if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) {
1067 1064
 		/* uses a regex here, so that we don't accidentally block 0xacab.net style hosts */
1068
-		string_free(host);
1069 1065
 		return CL_PHISH_HEX_URL;
1070 1066
 	}
1071
-	if(isReal && host->data[0]=='\0')
1072
-		return CL_PHISH_CLEAN;/* link without domain, such as: href="/isapi.dll?... */
1073 1067
 	if(isNumeric(host->data)) {
1074 1068
 		*phishy |= PHISHY_NUMERIC_IP;
1075 1069
 	}
1070
+	if(!isReal) {
1071
+		url->pre_fixup.host_start = start - URL;
1072
+		url->pre_fixup.host_end = end - URL;
1073
+	}
1076 1074
 	return CL_PHISH_NODECISION;
1077 1075
 }
1078 1076
 
... ...
@@ -1111,45 +1087,15 @@ static int whitelist_check(const struct cl_engine* engine,struct url_check* urls
1111 1111
 	return whitelist_match(engine,urls->realLink.data,urls->displayLink.data,hostOnly);
1112 1112
 }
1113 1113
 
1114
-static int isPhishing(enum phish_status rc)
1115
-{
1116
-	switch(rc) {
1117
-		case CL_PHISH_CLEAN:
1118
-		case CL_PHISH_CLEANUP_OK:
1119
-		case CL_PHISH_WHITELISTED:
1120
-		case CL_PHISH_HOST_WHITELISTED:
1121
-		case CL_PHISH_HOST_OK:
1122
-		case CL_PHISH_DOMAIN_OK:
1123
-		case CL_PHISH_REDIR_OK:
1124
-		case CL_PHISH_HOST_REDIR_OK:
1125
-		case CL_PHISH_DOMAIN_REDIR_OK:
1126
-		case CL_PHISH_HOST_REVERSE_OK:
1127
-		case CL_PHISH_DOMAIN_REVERSE_OK:
1128
-		case CL_PHISH_MAILTO_OK:
1129
-		case CL_PHISH_TEXTURL:
1130
-		case CL_PHISH_HOST_NOT_LISTED:
1131
-		case CL_PHISH_CLEAN_CID:
1132
-			return 0;
1133
-		case CL_PHISH_HEX_URL:
1134
-		case CL_PHISH_CLOAKED_NULL:
1135
-		case CL_PHISH_SSL_SPOOF:
1136
-		case CL_PHISH_CLOAKED_UIU:
1137
-		case CL_PHISH_NUMERIC_IP:
1138
-		case CL_PHISH_NOMATCH:
1139
-			return 1;
1140
-		default:
1141
-			return 1;
1142
-	}
1143
-}
1144 1114
 /* urls can't contain null pointer, caller must ensure this */
1145 1115
 static enum phish_status phishingCheck(const struct cl_engine* engine,struct url_check* urls)
1146 1116
 {
1147 1117
 	struct url_check host_url;
1148
-	enum phish_status rc=CL_PHISH_NODECISION;
1118
+	int rc = CL_PHISH_NODECISION;
1149 1119
 	int phishy=0;
1150 1120
 	const struct phishcheck* pchk = (const struct phishcheck*) engine->phishcheck;
1151 1121
 
1152
-	if(!urls->realLink.data)
1122
+	if(!urls->realLink.data || urls->displayLink.data[0]=='\0')
1153 1123
 		return CL_PHISH_CLEAN;
1154 1124
 
1155 1125
 	cli_dbgmsg("Phishcheck:Checking url %s->%s\n", urls->realLink.data,
... ...
@@ -1159,59 +1105,43 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1159 1159
 		return CL_PHISH_CLEAN;/* displayed and real URL are identical -> clean */
1160 1160
 
1161 1161
 	if((rc = cleanupURLs(urls))) {
1162
-		if(isPhishing(rc))/* not allowed to decide this is phishing */
1163
-			return CL_PHISH_CLEAN;
1164
-		return rc;/* URLs identical after cleanup */
1162
+		/* it can only return an error, or say its clean;
1163
+		 * it is not allowed to decide it is phishing */
1164
+		return rc < 0 ? rc : CL_PHISH_CLEAN;
1165 1165
 	}
1166 1166
 
1167
-	if(whitelist_check(engine,urls,0))
1168
-		return CL_PHISH_WHITELISTED;/* if url is whitelist don't perform further checks */
1167
+	if(whitelist_check(engine, urls, 0))
1168
+		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */
1169 1169
 
1170
-	if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) )&&
1170
+	if((!isURL(pchk, urls->displayLink.data) || !isRealURL(pchk, urls->realLink.data) ) &&
1171 1171
 			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1172 1172
 			  !(phishy&PHISHY_NUMERIC_IP))) {
1173 1173
 		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
1174
-		return CL_PHISH_TEXTURL;
1174
+		return CL_PHISH_CLEAN;
1175 1175
 	}
1176 1176
 
1177
-	if(urls->flags&DOMAINLIST_REQUIRED && domainlist_match(engine,urls->realLink.data,urls->displayLink.data,NULL,0,&urls->flags))
1177
+	if(domainlist_match(engine, urls->realLink.data, urls->displayLink.data, NULL, 0, &urls->flags)) {
1178 1178
 		phishy |= DOMAIN_LISTED;
1179
-	else {
1179
+	} else {
1180 1180
 		/* although entire url is not listed, the host might be,
1181 1181
 		 * so defer phishing decisions till we know if host is listed*/
1182 1182
 	}
1183 1183
 
1184
-	
1185 1184
 	url_check_init(&host_url);
1186 1185
 
1187
-	if((rc = url_get_host(pchk, urls,&host_url,DOMAIN_DISPLAY,&phishy))) {
1186
+	if((rc = url_get_host(pchk, urls, &host_url, DOMAIN_DISPLAY, &phishy))) {
1188 1187
 		free_if_needed(&host_url);
1189
-		if(isPhishing(rc))
1190
-			return CL_PHISH_CLEAN;
1191
-		return rc;
1188
+		return rc < 0 ? rc : CL_PHISH_CLEAN;
1192 1189
 	}
1193 1190
 
1194
-
1195
-	if(urls->flags&DOMAINLIST_REQUIRED) {
1196
-		if(!(phishy&DOMAIN_LISTED)) {
1197
-			if(domainlist_match(engine,host_url.displayLink.data,host_url.realLink.data,&urls->pre_fixup,1,&urls->flags))
1198
-				phishy |= DOMAIN_LISTED;
1199
-			else {
1200
-			}
1201
-		}
1191
+	if(!(phishy&DOMAIN_LISTED) &&
1192
+		!domainlist_match(engine,host_url.displayLink.data,host_url.realLink.data,&urls->pre_fixup,1,&urls->flags)) {
1193
+			return CL_PHISH_CLEAN; /* domain not listed */
1202 1194
 	}
1203 1195
 
1204 1196
 	/* link type filtering must occur after last domainlist_match */
1205 1197
 	if(urls->link_type & LINKTYPE_IMAGE && !(urls->flags&CHECK_IMG_URL))
1206
-		return CL_PHISH_HOST_NOT_LISTED;/* its listed, but this link type is filtered */
1207
-
1208
-	if(urls->flags & DOMAINLIST_REQUIRED && !(phishy & DOMAIN_LISTED) ) {
1209
-		urls->flags &= urls->always_check_flags;
1210
-		if(!urls->flags) {
1211
-				free_if_needed(&host_url);
1212
-				return CL_PHISH_HOST_NOT_LISTED;
1213
-			}
1214
-		}
1198
+		return CL_PHISH_CLEAN;/* its listed, but this link type is filtered */
1215 1199
 
1216 1200
 	if(urls->flags&CHECK_CLOAKING) {
1217 1201
 		/*Checks if URL is cloaked.
... ...
@@ -1227,63 +1157,41 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1227 1227
 		}
1228 1228
 	}
1229 1229
 
1230
-
1231
-	if(urls->displayLink.data[0]=='\0') {
1232
-		free_if_needed(&host_url);
1233
-		return CL_PHISH_CLEAN;
1234
-	}
1235
-
1236 1230
 	if(urls->flags&CHECK_SSL && isSSL(urls->displayLink.data) && !isSSL(urls->realLink.data)) {
1237 1231
 		free_if_needed(&host_url);
1238 1232
 		return CL_PHISH_SSL_SPOOF;
1239 1233
 	}
1240 1234
 
1241
-	if(!urls->flags&CHECK_CLOAKING && urls->flags & DOMAINLIST_REQUIRED && !(phishy&DOMAIN_LISTED) ) {
1242
-		free_if_needed(&host_url);
1243
-		return CL_PHISH_HOST_NOT_LISTED;
1244
-	}
1245
-
1246 1235
 	if((rc = url_get_host(pchk, urls,&host_url,DOMAIN_REAL,&phishy)))
1247 1236
 	{
1248 1237
 		free_if_needed(&host_url);
1249
-		return rc;
1238
+		return rc < 0 ? rc : CL_PHISH_CLEAN;
1250 1239
 	}
1251 1240
 
1252
-	if(urls->flags&DOMAINLIST_REQUIRED && !(phishy&DOMAIN_LISTED)) {
1241
+	if(whitelist_check(engine,&host_url,1)) {
1253 1242
 		free_if_needed(&host_url);
1254
-		return CL_PHISH_HOST_NOT_LISTED;
1243
+		return CL_PHISH_CLEAN;
1255 1244
 	}
1256 1245
 
1257
-	if(whitelist_check(engine,&host_url,1)) {
1246
+	if(!strcmp(urls->realLink.data,urls->displayLink.data)) {
1258 1247
 		free_if_needed(&host_url);
1259
-		return CL_PHISH_HOST_WHITELISTED;
1248
+		return CL_PHISH_CLEAN;
1260 1249
 	}
1261 1250
 
1262
-
1263
-	if(urls->flags&HOST_SUFFICIENT) {
1264
-		if(!strcmp(urls->realLink.data,urls->displayLink.data)) {
1251
+	{
1252
+		struct url_check domain_url;
1253
+		url_check_init(&domain_url);
1254
+		url_get_domain(pchk, &host_url,&domain_url);
1255
+		if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
1265 1256
 			free_if_needed(&host_url);
1266
-			return CL_PHISH_HOST_OK;
1267
-		}
1268
-
1269
-
1270
-		if(urls->flags&DOMAIN_SUFFICIENT) {
1271
-			struct url_check domain_url;
1272
-			url_check_init(&domain_url);
1273
-			url_get_domain(pchk, &host_url,&domain_url);
1274
-			if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
1275
-				free_if_needed(&host_url);
1276
-				free_if_needed(&domain_url);
1277
-				return CL_PHISH_DOMAIN_OK;
1278
-			}
1279 1257
 			free_if_needed(&domain_url);
1258
+			return CL_PHISH_CLEAN;
1280 1259
 		}
1260
+		free_if_needed(&domain_url);
1261
+	}
1281 1262
 
1282
-		free_if_needed(&host_url);
1283
-	}/*HOST_SUFFICIENT*/
1263
+	free_if_needed(&host_url);
1284 1264
 	/*we failed to find a reason why the 2 URLs are different, this is definitely phishing*/
1285
-	if(urls->flags&DOMAINLIST_REQUIRED && !(phishy&DOMAIN_LISTED))
1286
-		return CL_PHISH_HOST_NOT_LISTED;
1287 1265
 	return phishy_map(phishy,CL_PHISH_NOMATCH);
1288 1266
 }
1289 1267
 
... ...
@@ -1292,28 +1200,6 @@ static const char* phishing_ret_toString(enum phish_status rc)
1292 1292
 	switch(rc) {
1293 1293
 		case CL_PHISH_CLEAN:
1294 1294
 			return "Clean";
1295
-		case CL_PHISH_CLEANUP_OK:
1296
-			return "URLs match after cleanup";
1297
-		case CL_PHISH_WHITELISTED:
1298
-			return "URL is whitelisted";
1299
-		case CL_PHISH_HOST_WHITELISTED:
1300
-			return "host part of URL is whitelist";
1301
-		case CL_PHISH_HOST_OK:
1302
-			return "Hosts match";
1303
-		case CL_PHISH_DOMAIN_OK:
1304
-			return "Domains match";
1305
-		case CL_PHISH_REDIR_OK:
1306
-			return "After redirecting realURL, they match";
1307
-		case CL_PHISH_HOST_REDIR_OK:
1308
-			return "After redirecting realURL, hosts match";
1309
-		case CL_PHISH_DOMAIN_REDIR_OK:
1310
-			return "After redirecting the domains match";
1311
-		case CL_PHISH_MAILTO_OK:
1312
-			return "URL is mailto";
1313
-		case CL_PHISH_NUMERIC_IP:
1314
-			return "IP address encountered in hostname";
1315
-		case CL_PHISH_TEXTURL:
1316
-			return "Displayed link is not an URL, can't check if phishing or not";
1317 1295
 		case CL_PHISH_CLOAKED_NULL:
1318 1296
 			return "Link URL is cloaked (null byte %00)";
1319 1297
 		case CL_PHISH_CLOAKED_UIU:
... ...
@@ -1323,10 +1209,6 @@ static const char* phishing_ret_toString(enum phish_status rc)
1323 1323
 			return "Visible links is SSL, real link is not";
1324 1324
 		case CL_PHISH_NOMATCH:
1325 1325
 			return "URLs are way too different";
1326
-		case CL_PHISH_HOST_NOT_LISTED:
1327
-			return "Host not listed in .pdb -> not checked";
1328
-		case CL_PHISH_CLEAN_CID:
1329
-			return "Embedded image in mail -> clean";
1330 1326
 		case CL_PHISH_HEX_URL:
1331 1327
 			return "Embedded hex urls";
1332 1328
 		default:
... ...
@@ -23,30 +23,17 @@
23 23
 #include "regex/regex.h"
24 24
 
25 25
 #define CL_PHISH_BASE 100
26
-enum phish_status {CL_PHISH_NODECISION=0,CL_PHISH_CLEAN=CL_PHISH_BASE, CL_PHISH_CLEANUP_OK,CL_PHISH_HOST_OK, CL_PHISH_DOMAIN_OK,
27
-	CL_PHISH_HOST_NOT_LISTED,
28
-	CL_PHISH_REDIR_OK, CL_PHISH_HOST_REDIR_OK, CL_PHISH_DOMAIN_REDIR_OK,
29
-	CL_PHISH_HOST_REVERSE_OK,CL_PHISH_DOMAIN_REVERSE_OK,
30
-	CL_PHISH_WHITELISTED,CL_PHISH_HOST_WHITELISTED,
31
-	CL_PHISH_CLEAN_CID,
32
-	CL_PHISH_TEXTURL, CL_PHISH_MAILTO_OK,
33
-	CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP,CL_PHISH_HEX_URL,CL_PHISH_CLOAKED_NULL,CL_PHISH_SSL_SPOOF, CL_PHISH_NOMATCH};
34
-
35
-#define HOST_SUFFICIENT   1
36
-#define DOMAIN_SUFFICIENT (HOST_SUFFICIENT | 2)
37
-#define DO_REVERSE_LOOKUP 4
38
-#define CHECK_REDIR       8
39
-#define CHECK_SSL         16
40
-#define CHECK_CLOAKING    32
41
-#define CLEANUP_URL       64
42
-#define CHECK_DOMAIN_REVERSE 128
43
-#define CHECK_IMG_URL        256
44
-#define DOMAINLIST_REQUIRED  512
45
-/* img checking disabled by default */
26
+enum phish_status {CL_PHISH_NODECISION=0, CL_PHISH_CLEAN=CL_PHISH_BASE,
27
+	CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP, CL_PHISH_HEX_URL, CL_PHISH_CLOAKED_NULL, CL_PHISH_SSL_SPOOF, CL_PHISH_NOMATCH};
28
+
29
+#define CHECK_SSL         1
30
+#define CHECK_CLOAKING    2
31
+#define CLEANUP_URL       4
32
+#define CHECK_IMG_URL     8
46 33
 
47 34
 #define LINKTYPE_IMAGE     1
48 35
 
49
-#define CL_PHISH_ALL_CHECKS (CLEANUP_URL|DOMAIN_SUFFICIENT|CHECK_SSL|CHECK_CLOAKING|CHECK_IMG_URL)
36
+#define CL_PHISH_ALL_CHECKS (CLEANUP_URL|CHECK_SSL|CHECK_CLOAKING|CHECK_IMG_URL)
50 37
 
51 38
 struct string {
52 39
 	int refcount;