Browse code

add comments, no functionality change.

git-svn: trunk@4954

Török Edvin authored on 2009/03/16 22:53:57
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Mon Mar 16 15:53:52 EET 2009 (edwin)
2
+------------------------------------
3
+ * libclamav/phishcheck.c: add comments, no functionality change.
4
+
1 5
 Mon Mar 16 15:41:17 EET 2009 (edwin)
2 6
 ------------------------------------
3 7
  * docs/phishsigs_howto.tex, libclamav/phishcheck.c: document URL
... ...
@@ -1235,18 +1235,22 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1235 1235
 	urlbuff[dest_len] = urlbuff[dest_len+1] = urlbuff[dest_len+2] = '\0';
1236 1236
 	url = urlbuff;
1237 1237
 
1238
+	/* canonicalize only real URLs, with a protocol */
1238 1239
 	host_begin = strchr(url, ':');
1239 1240
 	if(!host_begin)
1240 1241
 		return CL_PHISH_CLEAN;
1241 1242
 	++host_begin;
1242 1243
 
1244
+	/* ignore username in URL */
1243 1245
 	p = strchr(host_begin, '@');
1244 1246
 	if (p)
1245 1247
 	    host_begin = p+1;
1246 1248
 	url = host_begin;
1249
+	/* repeatedly % unescape characters */
1247 1250
 	str_hex_to_char(&url, &urlend);
1248 1251
 	host_begin = url;
1249 1252
 	len = urlend - url;
1253
+	/* skip to beginning of hostname */
1250 1254
 	while((host_begin < urlend) && *host_begin == '/') ++host_begin;
1251 1255
 	while(*host_begin == '.' && host_begin < urlend) ++host_begin;
1252 1256
 
... ...
@@ -1255,11 +1259,13 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1255 1255
 	while (p < urlend) {
1256 1256
 	    if (p+2 < urlend && *p == '/' && p[1] == '.' ) {
1257 1257
 		if (p[2] == '/') {
1258
+		    /* remove /./ */
1258 1259
 		    if (p + 3 < urlend)
1259 1260
 			memmove(p+1, p+3, urlend - p - 3);
1260 1261
 		    urlend -= 2;
1261 1262
 		}
1262 1263
 		else if (p[2] == '.' && (p[3] == '/' || p[3] == '\0') && last) {
1264
+		    /* remove /component/../ */
1263 1265
 		    if (p+4 < urlend)
1264 1266
 			memmove(last+1, p+4, urlend - p - 4);
1265 1267
 		    urlend -= 3 + (p - last);
... ...
@@ -1276,6 +1282,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1276 1276
 	while (p < urlend && p+2 < url + dest_len) {
1277 1277
 	    unsigned char c = *p;
1278 1278
 	    if (c <= 32 || c >= 127 || c == '%' || c == '#') {
1279
+		/* % escape control characters, space, non-ASCII bytes, '%' and '#' */
1279 1280
 		const char hexchars[] = "0123456789ABCDEF";
1280 1281
 		memmove(p+3, p+1, urlend - p - 1);
1281 1282
 		*p++ = '%';
... ...
@@ -1288,9 +1295,11 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1288 1288
 	*p = '\0';
1289 1289
 	urlend = p;
1290 1290
 	len = urlend - url;
1291
+	/* determine end of hostname */
1291 1292
 	host_len = strcspn(host_begin, ":/?");
1292 1293
 	path_begin = host_begin + host_len;
1293 1294
 	if(host_len < len) {
1295
+		/* url without path, use a single / */
1294 1296
 		memmove(path_begin + 2, path_begin + 1, len - host_len);
1295 1297
 		*path_begin++ = '/';
1296 1298
 		*path_begin++ = '\0';
... ...
@@ -1299,6 +1308,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1299 1299
 		path_len = url + len - path_begin + 1;
1300 1300
 		p = strchr(path_begin, '#');
1301 1301
 		if (p) {
1302
+		    /* ignore anchor */
1302 1303
 		    *p = '\0';
1303 1304
 		    path_len = p - path_begin;
1304 1305
 		}
... ...
@@ -1307,6 +1317,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
1307 1307
 		path_len = 0;
1308 1308
 		*path = "";
1309 1309
 	}
1310
+	/* lowercase the hostname (only host_len bytes starting at host_begin) */
1310 1311
 	str_make_lowercase(host_begin, host_len);
1311 1312
 	*host = host_begin;
1312 1313
 	*hostlen = host_len;
... ...
@@ -1330,6 +1341,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1330 1330
 	unsigned count;
1331 1331
 
1332 1332
 	if(!rlist || !rlist->sha256_hashes.bm_patterns) {
1333
+		/* no hashes loaded -> don't waste time canonicalizing and
1334
+		 * looking up */
1333 1335
 		return CL_SUCCESS;
1334 1336
 	}
1335 1337
 	if(!inurl)
... ...
@@ -1338,6 +1351,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1338 1338
 	rc = cli_url_canon(inurl, len, urlbuff, sizeof(urlbuff), &host_begin, &host_len, &path_begin, &path_len);
1339 1339
 	if (rc == CL_PHISH_CLEAN)
1340 1340
 	    return rc;
1341
+
1342
+	/* get last 5 components of hostname */
1341 1343
 	j=COMPONENTS;
1342 1344
 	component = strrchr(host_begin, '.');
1343 1345
 	while(component && j > 0) {
... ...
@@ -1351,6 +1366,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1351 1351
 	}
1352 1352
 	lp[j] = host_begin;
1353 1353
 
1354
+	/* get first 5 components of path */
1354 1355
 	pp[0] = path_len;
1355 1356
 	if(path_len) {
1356 1357
 		pp[1] = strcspn(path_begin, "?");
... ...
@@ -1376,6 +1392,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1376 1376
 				       rlist->hostkey_prefix.bm_patterns;
1377 1377
 		--ji;
1378 1378
 		assert(pp[ki] <= path_len);
1379
+		/* lookup prefix/suffix hashes of URL */
1379 1380
 		rc = hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki], 
1380 1381
 				need_prefixmatch ? &prefix_matched : NULL);
1381 1382
 		if(rc) {
... ...
@@ -1383,6 +1400,9 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
1383 1383
 		}
1384 1384
 		count++;
1385 1385
 		if (count == 2 && !prefix_matched && rlist->hostkey_prefix.bm_patterns) {
1386
+		    /* if hostkey is not matched, don't bother calculating
1387
+		     * hashes for other parts of the URL, they are not in the DB
1388
+		     */
1386 1389
 		    cli_dbgmsg("hostkey prefix not matched, short-circuiting lookups\n");
1387 1390
 		    return CL_SUCCESS;
1388 1391
 		}