Browse code

document URL blacklisting, and whitelisting (bb #1458).

git-svn: trunk@4953

Török Edvin authored on 2009/03/16 22:41:23
Showing 4 changed files
... ...
@@ -1,3 +1,8 @@
1
+Mon Mar 16 15:41:17 EET 2009 (edwin)
2
+------------------------------------
3
+ * docs/phishsigs_howto.tex, libclamav/phishcheck.c: document URL
4
+ blacklisting, and whitelisting (bb #1458).
5
+
1 6
 Mon Mar 16 14:44:25 EET 2009 (edwin)
2 7
 ------------------------------------
3 8
  * clamdtop/clamdtop.c: fix warning
4 9
Binary files a/docs/phishsigs_howto.pdf and b/docs/phishsigs_howto.pdf differ
... ...
@@ -37,12 +37,7 @@ H[Filter]:DisplayedHostname[:FuncLevelSpec]
37 37
  		\item or a subdomain of the specified hostname
38 38
  		\item to avoid false matches in case of subdomain matches, the engine checks that there is a dot (\verb+.+) or a space (\verb+ +) before the matched portion
39 39
 	\end{itemize}
40
- \item [{Filter}] an (optional) 3-digit hexadecimal number representing flags that should be filtered.
41
-	\begin{itemize}
42
- 		\item flag filtering only makes sense in .pdb files. (however clamav won't complain if you put flags in .wdb files, it will just skip them)
43
- 		\item for details on how to construct a flag number see section \prettyref{sec:Flags}
44
-	\end{itemize}
45
-
40
+ \item [{Filter}] is ignored for R and H for compatibility reasons
46 41
  \item [{\textsc{RealURL}}] is the URL the user is sent to, example: \emph{href} attribute of an html anchor (\emph{<a> tag})
47 42
  \item [{\textsc{DisplayedURL}}] is the URL description displayed to the user, where it is \emph{claimed} they are sent, example: contents of an html anchor (\emph{<a> tag})
48 43
  \item [{DisplayedHostname}] is the hostname portion of the \textsc{DisplayedURL}
... ...
@@ -53,6 +48,36 @@ H[Filter]:DisplayedHostname[:FuncLevelSpec]
53 53
 	\end{itemize}
54 54
 \end{description}
55 55
 
56
+\subsection{GDB format}
57
+This file contains URL hashes in the following format:
58
+\begin{verbatim}
59
+S:P:HostPrefix[:FuncLevelSpec]
60
+S:F:Sha256hash[:FuncLevelSpec]
61
+S1:P:HostPrefix[:FuncLevelSpec]
62
+S1:F:Sha256hash[:FuncLevelSpec]
63
+S2:P:HostPrefix[:FuncLevelSpec]
64
+S2:F:Sha256hash[:FuncLevelSpec]
65
+\end{verbatim}
66
+
67
+\begin{description}
68
+ \item [{S:}]
69
+  	These are hashes for Google Safe Browsing - malware sites, and should not be used for other purposes.
70
+ \item [{S2:}]
71
+	These are hashes for Google Safe Browsing - phishing sites, and should not be used for other purposes.
72
+ \item [{S1:}]
73
+	Hashes for blacklisting phishing sites.
74
+	Virus name: Phishing.URL.Blacklisted
75
+ \item [{HostPrefix}]
76
+	4-byte prefix of the sha256 hash of the last 2 or 3 components of the hostname.
77
+If the prefix doesn't match, no further lookups are performed.
78
+ \item [{Sha256hash}]
79
+	sha256 hash of the canonicalized URL, or a sha256 hash of its prefix/suffix according to the Google Safe Browsing ``Performing Lookups'' rules. There should be a corresponding \verb+:P:HostPrefix+ entry for the hash to be taken into consideration.
80
+\end{description}
81
+
82
+To see which hash/URL matched, look at the \verb+clamscan --debug+ output, and look for the following strings:
83
+\verb+Looking up hash+, \verb+prefix matched+, and \verb+Hash matched+.
84
+Local whitelisting of .gdb entries can be done by creating .wdb entries.
85
+
56 86
 \subsection{WDB format}
57 87
 This file contains whitelisted URL pairs.
58 88
 It contains lines in the following format:
... ...
@@ -61,6 +61,7 @@
61 61
 #define DOMAIN_LISTED		 8
62 62
 #define PHISHY_CLOAKED_NULL	16
63 63
 
64
+
64 65
 /*
65 66
 * Phishing design documentation,
66 67
 (initially written at http://wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
... ...
@@ -1395,7 +1396,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1395 1395
 {
1396 1396
 	struct url_check host_url;
1397 1397
 	int rc = CL_PHISH_NODECISION;
1398
-	int phishy=0;
1398
+	int phishy=0, blacklisted=0;
1399 1399
 	const struct phishcheck* pchk = (const struct phishcheck*) engine->phishcheck;
1400 1400
 
1401 1401
 	if(!urls->realLink.data || urls->displayLink.data[0]=='\0')
... ...
@@ -1413,11 +1414,13 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1413 1413
 	}
1414 1414
 
1415 1415
 	if(( rc = url_hash_match(engine->domainlist_matcher, urls->realLink.data, strlen(urls->realLink.data)) )) {
1416
-	    if (rc == CL_PHISH_CLEAN)
1416
+	    if (rc == CL_PHISH_CLEAN) {
1417 1417
 		cli_dbgmsg("not analyzing, not a real url: %s\n", urls->realLink.data);
1418
-	    else
1418
+		return CL_PHISH_CLEAN;
1419
+	    } else {
1419 1420
 		cli_dbgmsg("Hash matched for: %s\n", urls->realLink.data);
1420
-	    return rc;
1421
+		blacklisted = rc;
1422
+	    }
1421 1423
 	}
1422 1424
 
1423 1425
 	if((rc = cleanupURLs(urls))) {
... ...
@@ -1433,12 +1436,16 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1433 1433
 			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1434 1434
 			  !(phishy&PHISHY_NUMERIC_IP))) {
1435 1435
 		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
1436
-		return CL_PHISH_CLEAN;
1436
+		if (!blacklisted)
1437
+		    return CL_PHISH_CLEAN;
1437 1438
 	}
1438 1439
 
1439 1440
 	if(whitelist_check(engine, urls, 0))
1440 1441
 		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */
1441 1442
 
1443
+	if (blacklisted)
1444
+	    return blacklisted;
1445
+
1442 1446
 	url_check_init(&host_url);
1443 1447
 
1444 1448
 	if((rc = url_get_host(urls, &host_url, DOMAIN_DISPLAY, &phishy))) {