GitList

Browse code

ignore invalid URLs containing double dots, optimization: cut URL after hostname

git-svn: trunk@3569

Török Edvin authored on 2008/02/01 01:05:20
Showing 4 changed files

ChangeLog index 865237b..594f6b1 100644
docs/phishsigs_howto.pdf index 68d05b8..2640f24 100644
docs/phishsigs_howto.tex index 2a2e6b1..11aae25 100644
libclamav/phishcheck.c index 108a3c2..bd440f5 100644

@@ -1,3 +1,8 @@
                     +Thu Jan 31 17:44:35 EET 2008 (edwin)
                     +------------------------------------
                     +  * libclamav/phishcheck.c, docs/phishsigs_howto.tex: ignore invalid URLs
                     +  containing double dots, optimization: cut URL after hostname
                     +
                      Thu Jan 31 16:33:56 CET 2008 (tk)
                      ---------------------------------
                        * libclamav/vba_extract.c: minor code tidy; drop broken sigtouint32()

docs/phishsigs_howto.pdf

History View file @ f12c2e6

Binary files a/docs/phishsigs_howto.pdf and b/docs/phishsigs_howto.pdf differ

docs/phishsigs_howto.tex

History View file @ f12c2e6

@@ -237,7 +237,10 @@ Furthermore you can restrict what checks are to be performed by specifying the 3
                      \subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}}
                      The html parser extracts pairs of \textsc{realURL}/\textsc{displayedURL}
                     -based on the following rules:
                     +based on the following rules.
                     +
                     +In version 0.93: After URLs have been extracted, they are normalized, and cut after the hostname.
                     +\verb+http://test.example.com/path/somecgi?queryparameters+ becomes \verb+http://test.example.com/+
                      \begin{description}
                      \item [{a}] (anchor) the \emph{href} is the \textsc{realURL}, its \emph{contents}
@@ -588,4 +591,4 @@ Then see what urls are being checked, see if any of them is in a
                      whitelist, see if all urls are detected, etc.
                     -\end{document}
                     \ No newline at end of file
                     +\end{document}

libclamav/phishcheck.c

History View file @ f12c2e6

@@ -173,46 +173,34 @@ static const size_t https_len  = sizeof(https)-1;
                      #define URI_safe_nodot  "-$_@&"
                      #define URI_safe	"-$_@.&"
                      #define URI_extra	"!*\"'(),"
                     -#define URI_reserved    "=;/#?: "
                     -#define URI_national    "{}|[]\\^~"
                     -#define URI_punctuation "<>"
                      #define URI_hex		 "[0-9a-fA-f]"
                      #define URI_escape      "%"URI_hex"{2}"
                      #define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
                      #define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
                     -#define URI_xalphas URI_xalpha"+"
                      #define URI_xalphas_nodot URI_xalpha_nodot"*"
                      #define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot""
                      #define URI_xpalpha URI_xalpha"|\\+"
                      #define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
                     -#define URI_xpalphas "("URI_xpalpha")+"
                      #define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
                     -#define optional_URI_xpalphas "("URI_xpalpha"|=)*"
                      #define URI_scheme URI_ialpha
                      #define URI_tld iana_tld
                      #define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
                     -#define URI_path2 URI_tld
                     -#define URI_path3 "(/"optional_URI_xpalphas")*"
                     -
                     -#define URI_search "("URI_xalphas")*"
                     -#define URI_fragmentid URI_xalphas
                      #define URI_IP_digits "["URI_digit"]{1,3}"
                     -#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
                     -#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
                     -#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
                     +#define URI_path_start "[/?:]?"
                     +#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
                     +#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
                     +#define URI_numeric_fragmentaddress URI_numeric_URI
                      #define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
                     -#define URI_URI2 URI_path2
                     -#define URI_URI3 URI_path3"(\\?" URI_search")?"
                     +#define URI_URI2 URI_tld
                      #define URI_fragmentaddress1 URI_URI1
                     -#define URI_fragmentaddress2 URI_URI2
                     -#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
                     +#define URI_fragmentaddress2 URI_URI2""URI_path_start
                      #define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
@@ -680,6 +668,9 @@ str_fixup_spaces(char **begin, const char **end)
                      	/* strip leading/trailing garbage */
                      	while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++;
                      	while(!isalnum(send[0]) && send >= sbegin) send--;
                     +
                     +	/* keep terminating slash character*/
                     +	if(send[1] == '/') send++;
                      	*begin = sbegin;
                      	*end = send;
                      }
@@ -715,7 +706,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
                      	}
                      	while(isspace(*end))
                      		end--;
                     -	/*TODO: convert \ to /, and stuff like that*/
                      	/* From mailscanner, my comments enclosed in {} */
                      	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
                      		string_assign_null(URL);
@@ -727,6 +717,32 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
                      		int rc;
                      		str_replace(begin,end,'\\','/');
                     +		/* find beginning of hostname, because:
                     +		 * - we want to keep only protocol, host, and
                     +		 *  strip path & query parameter(s)
                     +		 * - we want to make hostname lowercase*/
                     +		host_begin = strchr(begin,':');
                     +		while(host_begin && (host_begin < end) && (host_begin[1] == '/'))  host_begin++;
                     +		if(!host_begin) host_begin=begin;
                     +		else host_begin++;
                     +		host_len = strcspn(host_begin,":/?");
                     +	        if(host_begin + host_len > end + 1) {
                     +			/* prevent hostname extending beyond end, it can happen
                     +			 * if we have spaces at the end, we don't want those part of
                     +			 * the hostname */
                     +			host_len = end - host_begin + 1;
                     +		} else {
                     +			/* cut the URL after the hostname */
                     +			/* @end points to last character we want to be part of the URL */
                     +			end = host_begin + host_len - 1;
                     +		}
                     +		/* terminate URL with a slash, except when we're at end of string */
                     +		if(host_begin[host_len]) {
                     +			host_begin[host_len] = '/';
                     +			end++;
                     +		}
                     +		/* convert hostname to lowercase, but only hostname! */
                     +		str_make_lowercase(host_begin, host_len);
                      		/* some broken MUAs put > in the href, and then
                      		 * we get a false positive, so remove them */
                      		str_replace(begin,end,'<',' ');
@@ -735,13 +751,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
                      		str_replace(begin,end,';',' ');
                      		str_strip(&begin,&end,lt,lt_len);
                      		str_strip(&begin,&end,gt,gt_len);
                     -		/* convert hostname to lowercase, but only hostname! */
                     -		host_begin = strchr(begin,':');
                     -		while(host_begin && host_begin[1]=='/') host_begin++;
                     -		if(!host_begin) host_begin=begin;
                     -		else host_begin++;
                     -		host_len = strcspn(host_begin,"/?");
                     -		str_make_lowercase(host_begin,host_len);
                      		/* convert %xx to real value */
                      		str_hex_to_char(&begin,&end);
                      		if(isReal) {
@@ -929,7 +938,7 @@ int phishing_init(struct cl_engine* engine)
                      		engine->phishcheck = NULL;
                      		return CL_EFORMAT;
                      	}
                     -	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
                     +	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
                      	if(build_regex(&pchk->preg,url_regex,1)) {
                      		free_regex(&pchk->preg_cctld);
                      		free_regex(&pchk->preg_tld);
@@ -939,7 +948,7 @@ int phishing_init(struct cl_engine* engine)
                      		return CL_EFORMAT;
                      	}
                      	free(url_regex);
                     -	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
                     +	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
                      	if(build_regex(&pchk->preg_realurl, realurl_regex,1)) {
                      		free_regex(&pchk->preg_cctld);
                      		free_regex(&pchk->preg_tld);
@@ -1017,7 +1026,6 @@ static enum phish_status cleanupURLs(struct url_check* urls)
                      {
                      	if(urls->flags&CLEANUP_URL) {
                      		cleanupURL(&urls->realLink,NULL,1);
                     -
                      		cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0);
                      		if(!urls->displayLink.data || !urls->realLink.data)
                      			return CL_PHISH_NODECISION;
@@ -1045,12 +1053,14 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
                      	cli_dbgmsg("Phishcheck:host:%s\n", host->data);
                     -	if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
                     +	if(!host->data || (isReal && (host->data[0]=='\0' || strstr(host->data,".."))) || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
                      		/* no host,
                      		 * link without domain, such as: href="/isapi.dll?...
                      		 * mailto:
                      		 * spaces in hostname
                     +		 * double dots
                      		 */
                     +		cli_dbgmsg("Phishcheck:skipping invalid host\n");
                      		return CL_PHISH_CLEAN;
                      	}
                      	if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) {
@@ -1127,6 +1137,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
                      	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
                      		urls->displayLink.data);
                     +
                      	if(whitelist_check(engine, urls, 0))
                      		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */