...
|
...
|
@@ -1235,18 +1235,22 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1235
|
1235
|
urlbuff[dest_len] = urlbuff[dest_len+1] = urlbuff[dest_len+2] = '\0';
|
1236
|
1236
|
url = urlbuff;
|
1237
|
1237
|
|
|
1238
|
+ /* canonicalize only real URLs, with a protocol */
|
1238
|
1239
|
host_begin = strchr(url, ':');
|
1239
|
1240
|
if(!host_begin)
|
1240
|
1241
|
return CL_PHISH_CLEAN;
|
1241
|
1242
|
++host_begin;
|
1242
|
1243
|
|
|
1244
|
+ /* ignore username in URL */
|
1243
|
1245
|
p = strchr(host_begin, '@');
|
1244
|
1246
|
if (p)
|
1245
|
1247
|
host_begin = p+1;
|
1246
|
1248
|
url = host_begin;
|
|
1249
|
+ /* repeatedly % unescape characters */
|
1247
|
1250
|
str_hex_to_char(&url, &urlend);
|
1248
|
1251
|
host_begin = url;
|
1249
|
1252
|
len = urlend - url;
|
|
1253
|
+ /* skip to beginning of hostname */
|
1250
|
1254
|
while((host_begin < urlend) && *host_begin == '/') ++host_begin;
|
1251
|
1255
|
while(*host_begin == '.' && host_begin < urlend) ++host_begin;
|
1252
|
1256
|
|
...
|
...
|
@@ -1255,11 +1259,13 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1255
|
1255
|
while (p < urlend) {
|
1256
|
1256
|
if (p+2 < urlend && *p == '/' && p[1] == '.' ) {
|
1257
|
1257
|
if (p[2] == '/') {
|
|
1258
|
+ /* remove /./ */
|
1258
|
1259
|
if (p + 3 < urlend)
|
1259
|
1260
|
memmove(p+1, p+3, urlend - p - 3);
|
1260
|
1261
|
urlend -= 2;
|
1261
|
1262
|
}
|
1262
|
1263
|
else if (p[2] == '.' && (p[3] == '/' || p[3] == '\0') && last) {
|
|
1264
|
+ /* remove /component/../ */
|
1263
|
1265
|
if (p+4 < urlend)
|
1264
|
1266
|
memmove(last+1, p+4, urlend - p - 4);
|
1265
|
1267
|
urlend -= 3 + (p - last);
|
...
|
...
|
@@ -1276,6 +1282,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1276
|
1276
|
while (p < urlend && p+2 < url + dest_len) {
|
1277
|
1277
|
unsigned char c = *p;
|
1278
|
1278
|
if (c <= 32 || c >= 127 || c == '%' || c == '#') {
|
|
1279
|
+ /* %-escape bytes outside printable ASCII (<= 32 or >= 127), plus '%' and '#' */
|
1279
|
1280
|
const char hexchars[] = "0123456789ABCDEF";
|
1280
|
1281
|
memmove(p+3, p+1, urlend - p - 1);
|
1281
|
1282
|
*p++ = '%';
|
...
|
...
|
@@ -1288,9 +1295,11 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1288
|
1288
|
*p = '\0';
|
1289
|
1289
|
urlend = p;
|
1290
|
1290
|
len = urlend - url;
|
|
1291
|
+ /* determine end of hostname */
|
1291
|
1292
|
host_len = strcspn(host_begin, ":/?");
|
1292
|
1293
|
path_begin = host_begin + host_len;
|
1293
|
1294
|
if(host_len < len) {
|
|
1295
|
+ /* something follows the hostname: shift it right one byte and write "/" plus a NUL after the host */
|
1294
|
1296
|
memmove(path_begin + 2, path_begin + 1, len - host_len);
|
1295
|
1297
|
*path_begin++ = '/';
|
1296
|
1298
|
*path_begin++ = '\0';
|
...
|
...
|
@@ -1299,6 +1308,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1299
|
1299
|
path_len = url + len - path_begin + 1;
|
1300
|
1300
|
p = strchr(path_begin, '#');
|
1301
|
1301
|
if (p) {
|
|
1302
|
+ /* ignore anchor */
|
1302
|
1303
|
*p = '\0';
|
1303
|
1304
|
path_len = p - path_begin;
|
1304
|
1305
|
}
|
...
|
...
|
@@ -1307,6 +1317,7 @@ int cli_url_canon(const char *inurl, size_t len, char *urlbuff, size_t dest_len,
|
1307
|
1307
|
path_len = 0;
|
1308
|
1308
|
*path = "";
|
1309
|
1309
|
}
|
|
1310
|
+ /* lowercase the hostname (host_len bytes starting at host_begin) */
|
1310
|
1311
|
str_make_lowercase(host_begin, host_len);
|
1311
|
1312
|
*host = host_begin;
|
1312
|
1313
|
*hostlen = host_len;
|
...
|
...
|
@@ -1330,6 +1341,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
|
1330
|
1330
|
unsigned count;
|
1331
|
1331
|
|
1332
|
1332
|
if(!rlist || !rlist->sha256_hashes.bm_patterns) {
|
|
1333
|
+ /* no hashes loaded -> don't waste time canonicalizing and
|
|
1334
|
+ * looking up */
|
1333
|
1335
|
return CL_SUCCESS;
|
1334
|
1336
|
}
|
1335
|
1337
|
if(!inurl)
|
...
|
...
|
@@ -1338,6 +1351,8 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
|
1338
|
1338
|
rc = cli_url_canon(inurl, len, urlbuff, sizeof(urlbuff), &host_begin, &host_len, &path_begin, &path_len);
|
1339
|
1339
|
if (rc == CL_PHISH_CLEAN)
|
1340
|
1340
|
return rc;
|
|
1341
|
+
|
|
1342
|
+ /* get last 5 components of hostname */
|
1341
|
1343
|
j=COMPONENTS;
|
1342
|
1344
|
component = strrchr(host_begin, '.');
|
1343
|
1345
|
while(component && j > 0) {
|
...
|
...
|
@@ -1351,6 +1366,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
|
1351
|
1351
|
}
|
1352
|
1352
|
lp[j] = host_begin;
|
1353
|
1353
|
|
|
1354
|
+ /* get first 5 components of path */
|
1354
|
1355
|
pp[0] = path_len;
|
1355
|
1356
|
if(path_len) {
|
1356
|
1357
|
pp[1] = strcspn(path_begin, "?");
|
...
|
...
|
@@ -1376,6 +1392,7 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
|
1376
|
1376
|
rlist->hostkey_prefix.bm_patterns;
|
1377
|
1377
|
--ji;
|
1378
|
1378
|
assert(pp[ki] <= path_len);
|
|
1379
|
+ /* lookup prefix/suffix hashes of URL */
|
1379
|
1380
|
rc = hash_match(rlist, lp[ji], host_begin + host_len - lp[ji] + 1, path_begin, pp[ki],
|
1380
|
1381
|
need_prefixmatch ? &prefix_matched : NULL);
|
1381
|
1382
|
if(rc) {
|
...
|
...
|
@@ -1383,6 +1400,9 @@ static int url_hash_match(const struct regex_matcher *rlist, const char *inurl,
|
1383
|
1383
|
}
|
1384
|
1384
|
count++;
|
1385
|
1385
|
if (count == 2 && !prefix_matched && rlist->hostkey_prefix.bm_patterns) {
|
|
1386
|
+ /* if hostkey is not matched, don't bother calculating
|
|
1387
|
+ * hashes for other parts of the URL, they are not in the DB
|
|
1388
|
+ */
|
1386
|
1389
|
cli_dbgmsg("hostkey prefix not matched, short-circuiting lookups\n");
|
1387
|
1390
|
return CL_SUCCESS;
|
1388
|
1391
|
}
|