Browse code

improve url extraction algorithm.

git-svn: trunk@3233

Török Edvin authored on 2007/09/21 06:05:11
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Thu Sep 20 23:19:20 EEST 2007(edwin)
2
+------------------------------------
3
+  * libclamav/phishcheck.c: improve url extraction algorithm.
4
+
1 5
 Thu Sep 20 11:21:14 BST 2007 (njh)
2 6
 ----------------------------------
3 7
   * clamav-milter/clamav-milter.c:	Plug leak on SPF error handling
... ...
@@ -657,30 +657,26 @@ static void clear_msb(char* begin)
657 657
  * <a href="www.yahoo.com">Check out yahoo.com</a>
658 658
  * Here we add a ., so we get: check.out.yahoo.com (it won't trigger)
659 659
  *
660
- * Rule for adding .: if substring from right contains dot, then add dot,
660
+ * Old Rule for adding .: if substring from right contains dot, then add dot,
661 661
  *	otherwise strip space
662
+ * New Rule: strip all spaces
663
+ *  strip leading and trailing garbage (non-alphanumeric characters)
662 664
  *
663 665
  */
664 666
 static void
665 667
 str_fixup_spaces(char **begin, const char **end)
666 668
 {
667
-	char *space = strchr(*begin, ' ');
668
-
669
-	if(space == NULL)
669
+	char* sbegin = *begin;
670
+	const char* send = *end;
671
+	if(!sbegin || !send || send < sbegin)
670 672
 		return;
671
-
672
-	/* strip any number of spaces after / */
673
-	while((space > *begin) && (space[-1] == '/') && (space[0] == ' ') && (space < *end)) {
674
-		memmove(space, space+1, *end-space+1);
675
-		(*end)--;
676
-	}
677
-
678
-	for(space = rfind(*begin,' ',*end-*begin);space && space[0]!='.' && space<*end;space++)
679
-		;
680
-	if(space && space[0]=='.')
681
-		str_replace(*begin,*end,' ','.');
682
-	else
683
-		str_strip(begin,end," ",1);
673
+	/* strip spaces */
674
+	str_strip(&sbegin, &send, " ",1);
675
+	/* strip leading/trailing garbage */
676
+	while(!isalnum(sbegin[0]) && sbegin <= send) sbegin++;
677
+	while(!isalnum(send[0]) && send >= sbegin) send--;
678
+	*begin = sbegin;
679
+	*end = send;
684 680
 }
685 681
 
686 682
 /* allocates memory */
... ...
@@ -1210,6 +1206,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1210 1210
 	if((!isURL(pchk, urls->displayLink.data) || !isURL(pchk, urls->realLink.data) )&&
1211 1211
 			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1212 1212
 			  !(phishy&PHISHY_NUMERIC_IP))) {
1213
+		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
1213 1214
 		free_if_needed(&host_url);
1214 1215
 		return CL_PHISH_TEXTURL;
1215 1216
 	}