Browse code

ignore invalid URLs containing double dots, optimization: cut URL after hostname

git-svn: trunk@3569

Török Edvin authored on 2008/02/01 01:05:20
Showing 4 changed files
... ...
@@ -1,3 +1,8 @@
1
+Thu Jan 31 17:44:35 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/phishcheck.c, docs/phishsigs_howto.tex: ignore invalid URLs
4
+  containing double dots, optimization: cut URL after hostname
5
+
1 6
 Thu Jan 31 16:33:56 CET 2008 (tk)
2 7
 ---------------------------------
3 8
   * libclamav/vba_extract.c: minor code tidy; drop broken sigtouint32()
4 9
Binary files a/docs/phishsigs_howto.pdf and b/docs/phishsigs_howto.pdf differ
... ...
@@ -237,7 +237,10 @@ Furthermore you can restrict what checks are to be performed by specifying the 3
237 237
 \subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}}
238 238
 
239 239
 The html parser extracts pairs of \textsc{realURL}/\textsc{displayedURL}
240
-based on the following rules:
240
+based on the following rules.
241
+
242
+In version 0.93, after URLs have been extracted, they are normalized and cut after the hostname:
243
+\verb+http://test.example.com/path/somecgi?queryparameters+ becomes \verb+http://test.example.com/+
241 244
 
242 245
 \begin{description}
243 246
 \item [{a}] (anchor) the \emph{href} is the \textsc{realURL}, its \emph{contents}
... ...
@@ -588,4 +591,4 @@ Then see what urls are being checked, see if any of them is in a
588 588
 whitelist, see if all urls are detected, etc.
589 589
 
590 590
 
591
-\end{document}
592 591
\ No newline at end of file
592
+\end{document}
... ...
@@ -173,46 +173,34 @@ static const size_t https_len  = sizeof(https)-1;
173 173
 #define URI_safe_nodot  "-$_@&"
174 174
 #define URI_safe	"-$_@.&"
175 175
 #define URI_extra	"!*\"'(),"
176
-#define URI_reserved    "=;/#?: "
177
-#define URI_national    "{}|[]\\^~"
178
-#define URI_punctuation "<>"
179 176
 
180 177
 #define URI_hex		 "[0-9a-fA-F]" /* exactly one hexadecimal digit, either case (was A-f, which also matched G-Z and punctuation) */
181 178
 #define URI_escape      "%"URI_hex"{2}"
182 179
 #define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
183 180
 #define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
184 181
 
185
-#define URI_xalphas URI_xalpha"+"
186 182
 #define URI_xalphas_nodot URI_xalpha_nodot"*"
187 183
 
188 184
 #define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot""
189 185
 #define URI_xpalpha URI_xalpha"|\\+"
190 186
 #define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
191
-#define URI_xpalphas "("URI_xpalpha")+"
192 187
 #define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
193
-#define optional_URI_xpalphas "("URI_xpalpha"|=)*"
194 188
 
195 189
 #define URI_scheme URI_ialpha
196 190
 #define URI_tld iana_tld
197 191
 #define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
198
-#define URI_path2 URI_tld
199
-#define URI_path3 "(/"optional_URI_xpalphas")*"
200
-
201
-#define URI_search "("URI_xalphas")*"
202
-#define URI_fragmentid URI_xalphas
203 192
 
204 193
 #define URI_IP_digits "["URI_digit"]{1,3}"
205
-#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
206
-#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
207
-#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
194
+#define URI_path_start "[/?:]?"
195
+#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start
196
+#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path
197
+#define URI_numeric_fragmentaddress URI_numeric_URI
208 198
 
209 199
 #define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
210
-#define URI_URI2 URI_path2
211
-#define URI_URI3 URI_path3"(\\?" URI_search")?"
200
+#define URI_URI2 URI_tld
212 201
 
213 202
 #define URI_fragmentaddress1 URI_URI1
214
-#define URI_fragmentaddress2 URI_URI2
215
-#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
203
+#define URI_fragmentaddress2 URI_URI2""URI_path_start
216 204
 
217 205
 #define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+"
218 206
 
... ...
@@ -680,6 +668,9 @@ str_fixup_spaces(char **begin, const char **end)
680 680
 	/* strip leading/trailing garbage */
681 681
 	while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++;
682 682
 	while(!isalnum(send[0]) && send >= sbegin) send--;
683
+
684
+	/* keep terminating slash character*/
685
+	if(send[1] == '/') send++;
683 686
 	*begin = sbegin;
684 687
 	*end = send;
685 688
 }
... ...
@@ -715,7 +706,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
715 715
 	}
716 716
 	while(isspace(*end))
717 717
 		end--;
718
-	/*TODO: convert \ to /, and stuff like that*/
719 718
 	/* From mailscanner, my comments enclosed in {} */
720 719
 	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) {
721 720
 		string_assign_null(URL);
... ...
@@ -727,6 +717,32 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
727 727
 		int rc;
728 728
 
729 729
 		str_replace(begin,end,'\\','/');
730
+		/* find beginning of hostname, because:
731
+		 * - we want to keep only protocol, host, and 
732
+		 *  strip path & query parameter(s) 
733
+		 * - we want to make hostname lowercase*/
734
+		host_begin = strchr(begin,':');
735
+		while(host_begin && (host_begin < end) && (host_begin[1] == '/'))  host_begin++;
736
+		if(!host_begin) host_begin=begin;
737
+		else host_begin++;
738
+		host_len = strcspn(host_begin,":/?");
739
+	        if(host_begin + host_len > end + 1) {
740
+			/* prevent hostname extending beyond end, it can happen
741
+			 * if we have spaces at the end, we don't want those part of 
742
+			 * the hostname */
743
+			host_len = end - host_begin + 1;
744
+		} else {
745
+			/* cut the URL after the hostname */
746
+			/* @end points to last character we want to be part of the URL */
747
+			end = host_begin + host_len - 1;
748
+		}
749
+		/* terminate URL with a slash, except when we're at end of string */
750
+		if(host_begin[host_len]) {
751
+			host_begin[host_len] = '/';
752
+			end++;
753
+		}
754
+		/* convert hostname to lowercase, but only hostname! */
755
+		str_make_lowercase(host_begin, host_len);
730 756
 		/* some broken MUAs put > in the href, and then
731 757
 		 * we get a false positive, so remove them */
732 758
 		str_replace(begin,end,'<',' ');
... ...
@@ -735,13 +751,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal)
735 735
 		str_replace(begin,end,';',' ');
736 736
 		str_strip(&begin,&end,lt,lt_len);
737 737
 		str_strip(&begin,&end,gt,gt_len);
738
-		/* convert hostname to lowercase, but only hostname! */
739
-		host_begin = strchr(begin,':');
740
-		while(host_begin && host_begin[1]=='/') host_begin++;
741
-		if(!host_begin) host_begin=begin;
742
-		else host_begin++;
743
-		host_len = strcspn(host_begin,"/?");
744
-		str_make_lowercase(host_begin,host_len);
745 738
 		/* convert %xx to real value */
746 739
 		str_hex_to_char(&begin,&end);
747 740
 		if(isReal) {
... ...
@@ -929,7 +938,7 @@ int phishing_init(struct cl_engine* engine)
929 929
 		engine->phishcheck = NULL;
930 930
 		return CL_EFORMAT;
931 931
 	}
932
-	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
932
+	url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$");
933 933
 	if(build_regex(&pchk->preg,url_regex,1)) {
934 934
 		free_regex(&pchk->preg_cctld);
935 935
 		free_regex(&pchk->preg_tld);
... ...
@@ -939,7 +948,7 @@ int phishing_init(struct cl_engine* engine)
939 939
 		return CL_EFORMAT;
940 940
 	}
941 941
 	free(url_regex);
942
-	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$");
942
+	realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$");
943 943
 	if(build_regex(&pchk->preg_realurl, realurl_regex,1)) {
944 944
 		free_regex(&pchk->preg_cctld);
945 945
 		free_regex(&pchk->preg_tld);
... ...
@@ -1017,7 +1026,6 @@ static enum phish_status cleanupURLs(struct url_check* urls)
1017 1017
 {
1018 1018
 	if(urls->flags&CLEANUP_URL) {
1019 1019
 		cleanupURL(&urls->realLink,NULL,1);
1020
-
1021 1020
 		cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0);
1022 1021
 		if(!urls->displayLink.data || !urls->realLink.data)
1023 1022
 			return CL_PHISH_NODECISION;
... ...
@@ -1045,12 +1053,14 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str
1045 1045
 
1046 1046
 	cli_dbgmsg("Phishcheck:host:%s\n", host->data);
1047 1047
 
1048
-	if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
1048
+	if(!host->data || (isReal && (host->data[0]=='\0' || strstr(host->data,".."))) || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) {
1049 1049
 		/* no host,
1050 1050
 		 * link without domain, such as: href="/isapi.dll?...
1051 1051
 		 * mailto:
1052 1052
 		 * spaces in hostname
1053
+		 * double dots
1053 1054
 		 */
1055
+		cli_dbgmsg("Phishcheck:skipping invalid host\n");
1054 1056
 		return CL_PHISH_CLEAN;
1055 1057
 	}
1056 1058
 	if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) {
... ...
@@ -1127,6 +1137,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1127 1127
 
1128 1128
 	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
1129 1129
 		urls->displayLink.data);
1130
+
1130 1131
 	if(whitelist_check(engine, urls, 0))
1131 1132
 		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */
1132 1133