git-svn: trunk@3569
Török Edvin authored on 2008/02/01 01:05:20... | ... |
@@ -1,3 +1,8 @@ |
1 |
+Thu Jan 31 17:44:35 EET 2008 (edwin) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav/phishcheck.c, docs/phishsigs_howto.tex: ignore invalid URLs |
|
4 |
+ containing double dots, optimization: cut URL after hostname |
|
5 |
+ |
|
1 | 6 |
Thu Jan 31 16:33:56 CET 2008 (tk) |
2 | 7 |
--------------------------------- |
3 | 8 |
* libclamav/vba_extract.c: minor code tidy; drop broken sigtouint32() |
... | ... |
@@ -237,7 +237,10 @@ Furthermore you can restrict what checks are to be performed by specifying the 3 |
237 | 237 |
\subsubsection{Extraction of \textsc{realURL}, \textsc{displayedURL} from HTML tags\label{sub:Extraction-of-realURL,}} |
238 | 238 |
|
239 | 239 |
The html parser extracts pairs of \textsc{realURL}/\textsc{displayedURL} |
240 |
-based on the following rules: |
|
240 |
+based on the following rules. |
|
241 |
+ |
|
242 |
+In version 0.93, after URLs have been extracted, they are normalized and cut after the hostname. For example:
|
243 |
+\verb+http://test.example.com/path/somecgi?queryparameters+ becomes \verb+http://test.example.com/+.
|
241 | 244 |
|
242 | 245 |
\begin{description} |
243 | 246 |
\item [{a}] (anchor) the \emph{href} is the \textsc{realURL}, its \emph{contents} |
... | ... |
@@ -588,4 +591,4 @@ Then see what urls are being checked, see if any of them is in a |
588 | 588 |
whitelist, see if all urls are detected, etc. |
589 | 589 |
|
590 | 590 |
|
591 |
-\end{document} |
|
592 | 591 |
\ No newline at end of file |
592 |
+\end{document} |
... | ... |
@@ -173,46 +173,34 @@ static const size_t https_len = sizeof(https)-1; |
173 | 173 |
#define URI_safe_nodot "-$_@&" |
174 | 174 |
#define URI_safe "-$_@.&" |
175 | 175 |
#define URI_extra "!*\"'()," |
176 |
-#define URI_reserved "=;/#?: " |
|
177 |
-#define URI_national "{}|[]\\^~" |
|
178 |
-#define URI_punctuation "<>" |
|
179 | 176 |
|
180 | 177 |
/* Bracket expression matching exactly one hexadecimal digit (either case).
 * Used to build URI_escape ("%" followed by two hex digits).
 * The upper-case range must end at 'F': a range ending at lower-case 'f'
 * would also accept G-Z and the punctuation between 'Z' and 'a'. */
#define URI_hex "[0-9a-fA-F]"
181 | 178 |
#define URI_escape "%"URI_hex"{2}" |
182 | 179 |
#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */ |
183 | 180 |
#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")" |
184 | 181 |
|
185 |
-#define URI_xalphas URI_xalpha"+" |
|
186 | 182 |
#define URI_xalphas_nodot URI_xalpha_nodot"*" |
187 | 183 |
|
188 | 184 |
#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot"" |
189 | 185 |
#define URI_xpalpha URI_xalpha"|\\+" |
190 | 186 |
#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+" |
191 |
-#define URI_xpalphas "("URI_xpalpha")+" |
|
192 | 187 |
#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+" |
193 |
-#define optional_URI_xpalphas "("URI_xpalpha"|=)*" |
|
194 | 188 |
|
195 | 189 |
#define URI_scheme URI_ialpha |
196 | 190 |
#define URI_tld iana_tld |
197 | 191 |
#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*" |
198 |
-#define URI_path2 URI_tld |
|
199 |
-#define URI_path3 "(/"optional_URI_xpalphas")*" |
|
200 |
- |
|
201 |
-#define URI_search "("URI_xalphas")*" |
|
202 |
-#define URI_fragmentid URI_xalphas |
|
203 | 192 |
|
204 | 193 |
#define URI_IP_digits "["URI_digit"]{1,3}" |
205 |
-#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?" |
|
206 |
-#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?" |
|
207 |
-#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?" |
|
194 |
+#define URI_path_start "[/?:]?" |
|
195 |
+#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}"URI_path_start |
|
196 |
+#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path |
|
197 |
+#define URI_numeric_fragmentaddress URI_numeric_URI |
|
208 | 198 |
|
209 | 199 |
#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1 |
210 |
-#define URI_URI2 URI_path2 |
|
211 |
-#define URI_URI3 URI_path3"(\\?" URI_search")?" |
|
200 |
+#define URI_URI2 URI_tld |
|
212 | 201 |
|
213 | 202 |
#define URI_fragmentaddress1 URI_URI1 |
214 |
-#define URI_fragmentaddress2 URI_URI2 |
|
215 |
-#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?" |
|
203 |
+#define URI_fragmentaddress2 URI_URI2""URI_path_start |
|
216 | 204 |
|
217 | 205 |
#define URI_CHECK_PROTOCOLS "(http|https|ftp|mailto)://.+" |
218 | 206 |
|
... | ... |
@@ -680,6 +668,9 @@ str_fixup_spaces(char **begin, const char **end) |
680 | 680 |
/* strip leading/trailing garbage */ |
681 | 681 |
while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++; |
682 | 682 |
while(!isalnum(send[0]) && send >= sbegin) send--; |
683 |
+ |
|
684 |
+ /* keep terminating slash character*/ |
|
685 |
+ if(send[1] == '/') send++; |
|
683 | 686 |
*begin = sbegin; |
684 | 687 |
*end = send; |
685 | 688 |
} |
... | ... |
@@ -715,7 +706,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) |
715 | 715 |
} |
716 | 716 |
while(isspace(*end)) |
717 | 717 |
end--; |
718 |
- /*TODO: convert \ to /, and stuff like that*/ |
|
719 | 718 |
/* From mailscanner, my comments enclosed in {} */ |
720 | 719 |
if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) { |
721 | 720 |
string_assign_null(URL); |
... | ... |
@@ -727,6 +717,32 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) |
727 | 727 |
int rc; |
728 | 728 |
|
729 | 729 |
str_replace(begin,end,'\\','/'); |
730 |
+ /* find beginning of hostname, because: |
|
731 |
+ * - we want to keep only protocol, host, and |
|
732 |
+ * strip path & query parameter(s) |
|
733 |
+ * - we want to make hostname lowercase*/ |
|
734 |
+ host_begin = strchr(begin,':'); |
|
735 |
+ while(host_begin && (host_begin < end) && (host_begin[1] == '/')) host_begin++; |
|
736 |
+ if(!host_begin) host_begin=begin; |
|
737 |
+ else host_begin++; |
|
738 |
+ host_len = strcspn(host_begin,":/?"); |
|
739 |
+ if(host_begin + host_len > end + 1) { |
|
740 |
+ /* prevent hostname extending beyond end, it can happen |
|
741 |
+ * if we have spaces at the end, we don't want those part of |
|
742 |
+ * the hostname */ |
|
743 |
+ host_len = end - host_begin + 1; |
|
744 |
+ } else { |
|
745 |
+ /* cut the URL after the hostname */ |
|
746 |
+ /* @end points to last character we want to be part of the URL */ |
|
747 |
+ end = host_begin + host_len - 1; |
|
748 |
+ } |
|
749 |
+ /* terminate URL with a slash, except when we're at end of string */ |
|
750 |
+ if(host_begin[host_len]) { |
|
751 |
+ host_begin[host_len] = '/'; |
|
752 |
+ end++; |
|
753 |
+ } |
|
754 |
+ /* convert hostname to lowercase, but only hostname! */ |
|
755 |
+ str_make_lowercase(host_begin, host_len); |
|
730 | 756 |
/* some broken MUAs put > in the href, and then |
731 | 757 |
* we get a false positive, so remove them */ |
732 | 758 |
str_replace(begin,end,'<',' '); |
... | ... |
@@ -735,13 +751,6 @@ cleanupURL(struct string *URL,struct string *pre_URL, int isReal) |
735 | 735 |
str_replace(begin,end,';',' '); |
736 | 736 |
str_strip(&begin,&end,lt,lt_len); |
737 | 737 |
str_strip(&begin,&end,gt,gt_len); |
738 |
- /* convert hostname to lowercase, but only hostname! */ |
|
739 |
- host_begin = strchr(begin,':'); |
|
740 |
- while(host_begin && host_begin[1]=='/') host_begin++; |
|
741 |
- if(!host_begin) host_begin=begin; |
|
742 |
- else host_begin++; |
|
743 |
- host_len = strcspn(host_begin,"/?"); |
|
744 |
- str_make_lowercase(host_begin,host_len); |
|
745 | 738 |
/* convert %xx to real value */ |
746 | 739 |
str_hex_to_char(&begin,&end); |
747 | 740 |
if(isReal) { |
... | ... |
@@ -929,7 +938,7 @@ int phishing_init(struct cl_engine* engine) |
929 | 929 |
engine->phishcheck = NULL; |
930 | 930 |
return CL_EFORMAT; |
931 | 931 |
} |
932 |
- url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3")) *$"); |
|
932 |
+ url_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_fragmentaddress1,URI_fragmentaddress2")) *$"); |
|
933 | 933 |
if(build_regex(&pchk->preg,url_regex,1)) { |
934 | 934 |
free_regex(&pchk->preg_cctld); |
935 | 935 |
free_regex(&pchk->preg_tld); |
... | ... |
@@ -939,7 +948,7 @@ int phishing_init(struct cl_engine* engine) |
939 | 939 |
return CL_EFORMAT; |
940 | 940 |
} |
941 | 941 |
free(url_regex); |
942 |
- realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|("URI_path1,URI_fragmentaddress2,URI_fragmentaddress3")) *$"); |
|
942 |
+ realurl_regex = str_compose("^ *(("URI_CHECK_PROTOCOLS")|(",URI_path1,URI_fragmentaddress2")) *$"); |
|
943 | 943 |
if(build_regex(&pchk->preg_realurl, realurl_regex,1)) { |
944 | 944 |
free_regex(&pchk->preg_cctld); |
945 | 945 |
free_regex(&pchk->preg_tld); |
... | ... |
@@ -1017,7 +1026,6 @@ static enum phish_status cleanupURLs(struct url_check* urls) |
1017 | 1017 |
{ |
1018 | 1018 |
if(urls->flags&CLEANUP_URL) { |
1019 | 1019 |
cleanupURL(&urls->realLink,NULL,1); |
1020 |
- |
|
1021 | 1020 |
cleanupURL(&urls->displayLink,&urls->pre_fixup.pre_displayLink,0); |
1022 | 1021 |
if(!urls->displayLink.data || !urls->realLink.data) |
1023 | 1022 |
return CL_PHISH_NODECISION; |
... | ... |
@@ -1045,12 +1053,14 @@ static int url_get_host(const struct phishcheck* pchk, struct url_check* url,str |
1045 | 1045 |
|
1046 | 1046 |
cli_dbgmsg("Phishcheck:host:%s\n", host->data); |
1047 | 1047 |
|
1048 |
- if(!host->data || (isReal && host->data[0]=='\0') || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) { |
|
1048 |
+ if(!host->data || (isReal && (host->data[0]=='\0' || strstr(host->data,".."))) || *phishy&REAL_IS_MAILTO || strchr(host->data,' ')) { |
|
1049 | 1049 |
/* no host, |
1050 | 1050 |
* link without domain, such as: href="/isapi.dll?... |
1051 | 1051 |
* mailto: |
1052 | 1052 |
* spaces in hostname |
1053 |
+ * double dots |
|
1053 | 1054 |
*/ |
1055 |
+ cli_dbgmsg("Phishcheck:skipping invalid host\n"); |
|
1054 | 1056 |
return CL_PHISH_CLEAN; |
1055 | 1057 |
} |
1056 | 1058 |
if(url->flags&CHECK_CLOAKING && !cli_regexec(&pchk->preg_hexurl,host->data,0,NULL,0)) { |
... | ... |
@@ -1127,6 +1137,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url |
1127 | 1127 |
|
1128 | 1128 |
cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data, |
1129 | 1129 |
urls->displayLink.data); |
1130 |
+ |
|
1130 | 1131 |
if(whitelist_check(engine, urls, 0)) |
1131 | 1132 |
return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */ |
1132 | 1133 |
|