Browse code

implement md5 URL match

git-svn: trunk@4050

Török Edvin authored on 2008/08/01 23:50:02
Showing 1 changed file
... ...
@@ -52,7 +52,7 @@
52 52
 #include "iana_cctld.h"
53 53
 #include "scanners.h"
54 54
 #include "md5.h"
55
-
55
+#include <assert.h>
56 56
 
57 57
 #define DOMAIN_REAL 1
58 58
 #define DOMAIN_DISPLAY 0
... ...
@@ -739,6 +739,7 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
739 739
 
740 740
 	if(!ctx->found_possibly_unwanted)
741 741
 		*ctx->virname=NULL;
742
+#if 0
742 743
 	FILE *f = fopen("/home/edwin/quarantine/urls","r");
743 744
 	if(!f)
744 745
 		abort();
... ...
@@ -771,6 +772,7 @@ int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
771 771
 	}
772 772
 	fclose(f);
773 773
 	return 0;
774
+#endif
774 775
 	for(i=0;i<hrefs->count;i++)
775 776
 		if(hrefs->contents[i]) {
776 777
 			struct url_check urls;
... ...
@@ -1180,19 +1182,103 @@ static int whitelist_check(const struct cl_engine* engine,struct url_check* urls
1180 1180
 	return whitelist_match(engine,urls->realLink.data,urls->displayLink.data,hostOnly);
1181 1181
 }
1182 1182
 
1183
-static int hash_match(const struct regex_matcher *rlist, const char *url, size_t len)
1183
+static int hash_match(const struct regex_matcher *rlist, const char *host, size_t hlen, const char *path, size_t plen)
1184 1184
 {
1185
-	unsigned char md5_dig[16];
1186
-	cli_md5_ctx md5;
1185
+#if 0
1186
+	char s[1024];
1187
+	strncpy(s, host, hlen);
1188
+	strncpy(s+hlen, path, plen);
1189
+	s[hlen+plen] = '\0';
1190
+	cli_dbgmsg("hash lookup for: %s\n",s);
1191
+#endif
1192
+	if(rlist->hashes.bm_patterns) {
1193
+		unsigned char md5_dig[16];
1194
+		cli_md5_ctx md5;
1195
+
1196
+		cli_md5_init(&md5);
1197
+		cli_md5_update(&md5, host, hlen);
1198
+		cli_md5_update(&md5, path, plen);
1199
+		cli_md5_final(md5_dig, &md5);
1200
+		if(cli_bm_scanbuff(md5_dig, 16, NULL, &rlist->hashes,0,0,-1) == CL_VIRUS) {
1201
+			return CL_VIRUS;
1202
+		}
1203
+	}
1204
+	return CL_SUCCESS;
1205
+}
1187 1206
 
1188
-	if(!rlist->hashes.bm_patterns)
1189
-		return CL_CLEAN;
1207
+#define URL_MAX_LEN 1024
1208
+#define COMPONENTS 4
1209
+static int url_hash_match(const struct regex_matcher *rlist, const char *inurl, size_t len)
1210
+{
1211
+	char urlbuff[URL_MAX_LEN+3];/* htmlnorm truncates at 1024 bytes + terminating null + slash + host end null */
1212
+	char *url;
1213
+	const char *urlend = urlbuff + len;
1214
+	char *host_begin;
1215
+	size_t host_len, path_len;
1216
+	char *path_begin;
1217
+	const char *component;
1218
+	const char *lp[COMPONENTS+1];
1219
+	size_t pp[COMPONENTS+2];
1220
+	size_t j, k, ji, ki;
1221
+
1222
+	if(!inurl)
1223
+		return CL_EMEM;
1224
+	strncpy(urlbuff, inurl, URL_MAX_LEN);
1225
+	urlbuff[URL_MAX_LEN] = urlbuff[URL_MAX_LEN+1] = urlbuff[URL_MAX_LEN+2] = '\0';
1226
+	url = urlbuff;
1227
+	str_hex_to_char(&url, &urlend);
1228
+	len = urlend - url;
1229
+	host_begin = strchr(url,':');
1230
+	if(!host_begin)
1231
+		return CL_PHISH_CLEAN;
1232
+	++host_begin;
1233
+	while((host_begin < urlend) && *host_begin == '/') ++host_begin;
1234
+	while(*host_begin == '.' && host_begin < urlend) ++host_begin;
1235
+	host_len = strcspn(host_begin, ":/?");
1236
+	path_begin = host_begin + host_len;
1237
+	if(host_len < len) {
1238
+		memmove(path_begin + 2, path_begin + 1, len - host_len);
1239
+		*path_begin++ = '/';
1240
+		*path_begin++ = '\0';
1241
+	} else path_begin = url+len;
1242
+	if(url + len >= path_begin) {
1243
+		path_len = url + len - path_begin + 1;
1244
+	} else
1245
+		path_len = 0;
1246
+	str_make_lowercase(host_begin, host_len);
1190 1247
 
1191
-	cli_md5_init(&md5);
1192
-	cli_md5_update(&md5, url, len);
1193
-	cli_md5_final(md5_dig, &md5);
1194
-	if(cli_bm_scanbuff(md5_dig, 16, NULL, &rlist->hashes,0,0,-1) == CL_VIRUS) {
1195
-		return CL_VIRUS;
1248
+	j=COMPONENTS;
1249
+	component = strrchr(host_begin, '.');
1250
+	while(component && j > 0) {
1251
+		do {
1252
+			--component;
1253
+		} while(*component != '.' && component > host_begin);
1254
+		if(*component != '.')
1255
+			component = NULL;
1256
+		if(component)
1257
+			lp[j--] = component + 1;
1258
+	}
1259
+	lp[j] = host_begin;
1260
+
1261
+	pp[0] = path_len;
1262
+	pp[1] = strcspn(path_begin, "?");
1263
+	if(pp[1] != pp[0]) k = 2;
1264
+	else k = 1;
1265
+	pp[k++] = 0;
1266
+	while(k < COMPONENTS+2) {
1267
+		const char *p = strchr(path_begin + pp[k-1] + 1, '/');
1268
+		if(p && p > path_begin) {
1269
+			pp[k++] = p - path_begin;
1270
+		} else
1271
+			break;
1272
+	}
1273
+
1274
+	for(ji=j;ji < COMPONENTS+1; ji++) {
1275
+		for(ki=0;ki < k; ki++) {
1276
+			assert(pp[ki] < path_len);
1277
+			if(hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki]) == CL_VIRUS)
1278
+				return CL_VIRUS;
1279
+		}
1196 1280
 	}
1197 1281
 	return CL_SUCCESS;
1198 1282
 }
... ...
@@ -1214,6 +1300,16 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1214 1214
 	if(!strcmp(urls->realLink.data,urls->displayLink.data))
1215 1215
 		return CL_PHISH_CLEAN;/* displayed and real URL are identical -> clean */
1216 1216
 
1217
+	if(!isURL(pchk, urls->realLink.data, 0)) {
1218
+		cli_dbgmsg("Real 'url' is not url:%s\n",urls->realLink.data);
1219
+		return CL_PHISH_CLEAN;
1220
+	}
1221
+
1222
+	if(url_hash_match(engine->domainlist_matcher, urls->realLink.data, strlen(urls->realLink.data)) == CL_VIRUS) {
1223
+		cli_dbgmsg("Hash matched for: %s\n", urls->realLink.data);
1224
+		return CL_PHISH_HASH;
1225
+	}
1226
+
1217 1227
 	if((rc = cleanupURLs(urls))) {
1218 1228
 		/* it can only return an error, or say its clean;
1219 1229
 		 * it is not allowed to decide it is phishing */
... ...
@@ -1223,7 +1319,7 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1223 1223
 	cli_dbgmsg("Phishcheck:URL after cleanup: %s->%s\n", urls->realLink.data,
1224 1224
 		urls->displayLink.data);
1225 1225
 
1226
-	if((!isURL(pchk, urls->displayLink.data, 1) || !isURL(pchk, urls->realLink.data, 0) ) &&
1226
+	if((!isURL(pchk, urls->displayLink.data, 1) ) &&
1227 1227
 			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(pchk, urls->displayLink.data)) ||
1228 1228
 			  !(phishy&PHISHY_NUMERIC_IP))) {
1229 1229
 		cli_dbgmsg("Displayed 'url' is not url:%s\n",urls->displayLink.data);
... ...
@@ -1233,10 +1329,6 @@ static enum phish_status phishingCheck(const struct cl_engine* engine,struct url
1233 1233
 	if(whitelist_check(engine, urls, 0))
1234 1234
 		return CL_PHISH_CLEAN;/* if url is whitelisted don't perform further checks */
1235 1235
 
1236
-	if(hash_match(engine->domainlist_matcher, urls->realLink.data, strlen(urls->realLink.data)) == CL_VIRUS) {
1237
-		cli_dbgmsg("Hash matched for: %s\n", urls->realLink.data);
1238
-		return CL_PHISH_HASH;
1239
-	}
1240 1236
 	url_check_init(&host_url);
1241 1237
 
1242 1238
 	if((rc = url_get_host(pchk, urls, &host_url, DOMAIN_DISPLAY, &phishy))) {
... ...
@@ -1324,6 +1416,8 @@ static const char* phishing_ret_toString(enum phish_status rc)
1324 1324
 			return "URLs are way too different";
1325 1325
 		case CL_PHISH_HEX_URL:
1326 1326
 			return "Embedded hex urls";
1327
+		case CL_PHISH_HASH:
1328
+			return "Blacklisted";
1327 1329
 		default:
1328 1330
 			return "Unknown return code";
1329 1331
 	}