git-svn: trunk@2244
aCaB authored on 2006/09/13 04:38:39... | ... |
@@ -1,3 +1,9 @@ |
1 |
+Tue Sep 12 21:59:17 CEST 2006 (acab) |
|
2 |
+------------------------------------ |
|
3 |
+ * libclamav: Merge of the related part of the phishing module from |
|
4 |
+ Torok Edvin <edwintorok*gmail.com> |
|
5 |
+ Part of the Google Summer of Code program |
|
6 |
+ |
|
1 | 7 |
Tue Sep 12 20:42:04 CEST 2006 (acab) |
2 | 8 |
------------------------------------ |
3 | 9 |
* sigtool: Merge of the related part of the phishing module from |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
#! /bin/sh |
2 | 2 |
# Guess values for system-dependent variables and create Makefiles. |
3 |
-# Generated by GNU Autoconf 2.60. |
|
3 |
+# Generated by GNU Autoconf 2.60a. |
|
4 | 4 |
# |
5 | 5 |
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, |
6 | 6 |
# 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. |
... | ... |
@@ -724,36 +724,36 @@ ac_unique_file="clamscan/clamscan.c" |
724 | 724 |
# Factoring default headers for most tests. |
725 | 725 |
ac_includes_default="\ |
726 | 726 |
#include <stdio.h> |
727 |
-#if HAVE_SYS_TYPES_H |
|
727 |
+#ifdef HAVE_SYS_TYPES_H |
|
728 | 728 |
# include <sys/types.h> |
729 | 729 |
#endif |
730 |
-#if HAVE_SYS_STAT_H |
|
730 |
+#ifdef HAVE_SYS_STAT_H |
|
731 | 731 |
# include <sys/stat.h> |
732 | 732 |
#endif |
733 |
-#if STDC_HEADERS |
|
733 |
+#ifdef STDC_HEADERS |
|
734 | 734 |
# include <stdlib.h> |
735 | 735 |
# include <stddef.h> |
736 | 736 |
#else |
737 |
-# if HAVE_STDLIB_H |
|
737 |
+# ifdef HAVE_STDLIB_H |
|
738 | 738 |
# include <stdlib.h> |
739 | 739 |
# endif |
740 | 740 |
#endif |
741 |
-#if HAVE_STRING_H |
|
742 |
-# if !STDC_HEADERS && HAVE_MEMORY_H |
|
741 |
+#ifdef HAVE_STRING_H |
|
742 |
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H |
|
743 | 743 |
# include <memory.h> |
744 | 744 |
# endif |
745 | 745 |
# include <string.h> |
746 | 746 |
#endif |
747 |
-#if HAVE_STRINGS_H |
|
747 |
+#ifdef HAVE_STRINGS_H |
|
748 | 748 |
# include <strings.h> |
749 | 749 |
#endif |
750 |
-#if HAVE_INTTYPES_H |
|
750 |
+#ifdef HAVE_INTTYPES_H |
|
751 | 751 |
# include <inttypes.h> |
752 | 752 |
#endif |
753 |
-#if HAVE_STDINT_H |
|
753 |
+#ifdef HAVE_STDINT_H |
|
754 | 754 |
# include <stdint.h> |
755 | 755 |
#endif |
756 |
-#if HAVE_UNISTD_H |
|
756 |
+#ifdef HAVE_UNISTD_H |
|
757 | 757 |
# include <unistd.h> |
758 | 758 |
#endif" |
759 | 759 |
|
... | ... |
@@ -1576,7 +1576,7 @@ test -n "$ac_init_help" && exit $ac_status |
1576 | 1576 |
if $ac_init_version; then |
1577 | 1577 |
cat <<\_ACEOF |
1578 | 1578 |
configure |
1579 |
-generated by GNU Autoconf 2.60 |
|
1579 |
+generated by GNU Autoconf 2.60a |
|
1580 | 1580 |
|
1581 | 1581 |
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, |
1582 | 1582 |
2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. |
... | ... |
@@ -1590,7 +1590,7 @@ This file contains any messages produced by compilers while |
1590 | 1590 |
running configure, to aid debugging if configure makes a mistake. |
1591 | 1591 |
|
1592 | 1592 |
It was created by $as_me, which was |
1593 |
-generated by GNU Autoconf 2.60. Invocation command line was |
|
1593 |
+generated by GNU Autoconf 2.60a. Invocation command line was |
|
1594 | 1594 |
|
1595 | 1595 |
$ $0 $@ |
1596 | 1596 |
|
... | ... |
@@ -3164,7 +3164,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 |
3164 | 3164 |
# in a Makefile. We should not override ac_cv_exeext if it was cached, |
3165 | 3165 |
# so that the user can short-circuit this test for compilers unknown to |
3166 | 3166 |
# Autoconf. |
3167 |
-for ac_file in $ac_files |
|
3167 |
+for ac_file in $ac_files '' |
|
3168 | 3168 |
do |
3169 | 3169 |
test -f "$ac_file" || continue |
3170 | 3170 |
case $ac_file in |
... | ... |
@@ -3192,6 +3192,12 @@ done |
3192 | 3192 |
test "$ac_cv_exeext" = no && ac_cv_exeext= |
3193 | 3193 |
|
3194 | 3194 |
else |
3195 |
+ ac_file='' |
|
3196 |
+fi |
|
3197 |
+ |
|
3198 |
+{ echo "$as_me:$LINENO: result: $ac_file" >&5 |
|
3199 |
+echo "${ECHO_T}$ac_file" >&6; } |
|
3200 |
+if test -z "$ac_file"; then |
|
3195 | 3201 |
echo "$as_me: failed program was:" >&5 |
3196 | 3202 |
sed 's/^/| /' conftest.$ac_ext >&5 |
3197 | 3203 |
|
... | ... |
@@ -3203,8 +3209,6 @@ See \`config.log' for more details." >&2;} |
3203 | 3203 |
fi |
3204 | 3204 |
|
3205 | 3205 |
ac_exeext=$ac_cv_exeext |
3206 |
-{ echo "$as_me:$LINENO: result: $ac_file" >&5 |
|
3207 |
-echo "${ECHO_T}$ac_file" >&6; } |
|
3208 | 3206 |
|
3209 | 3207 |
# Check that the compiler produces executables we can run. If not, either |
3210 | 3208 |
# the compiler is broken, or we cross compile. |
... | ... |
@@ -5882,7 +5886,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes |
5882 | 5882 |
case $host in |
5883 | 5883 |
*-*-irix6*) |
5884 | 5884 |
# Find out which ABI we are using. |
5885 |
- echo '#line 5885 "configure"' > conftest.$ac_ext |
|
5885 |
+ echo '#line 5889 "configure"' > conftest.$ac_ext |
|
5886 | 5886 |
if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 |
5887 | 5887 |
(eval $ac_compile) 2>&5 |
5888 | 5888 |
ac_status=$? |
... | ... |
@@ -5994,7 +5998,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
5994 | 5994 |
lt_cv_cc_needs_belf=no |
5995 | 5995 |
fi |
5996 | 5996 |
|
5997 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
5997 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
5998 | 5998 |
conftest$ac_exeext conftest.$ac_ext |
5999 | 5999 |
ac_ext=c |
6000 | 6000 |
ac_cpp='$CPP $CPPFLAGS' |
... | ... |
@@ -6470,7 +6474,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
6470 | 6470 |
|
6471 | 6471 |
fi |
6472 | 6472 |
|
6473 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
6473 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
6474 | 6474 |
conftest$ac_exeext conftest.$ac_ext |
6475 | 6475 |
LDFLAGS="$save_LDFLAGS" |
6476 | 6476 |
|
... | ... |
@@ -6515,7 +6519,7 @@ chmod -w . |
6515 | 6515 |
save_CFLAGS="$CFLAGS" |
6516 | 6516 |
CFLAGS="$CFLAGS -o out/conftest2.$ac_objext" |
6517 | 6517 |
compiler_c_o=no |
6518 |
-if { (eval echo configure:6518: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then |
|
6518 |
+if { (eval echo configure:6522: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then |
|
6519 | 6519 |
# The compiler can only warn and ignore the option if not recognized |
6520 | 6520 |
# So say no if there are warnings |
6521 | 6521 |
if test -s out/conftest.err; then |
... | ... |
@@ -8076,7 +8080,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8076 | 8076 |
ac_cv_func_shl_load=no |
8077 | 8077 |
fi |
8078 | 8078 |
|
8079 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8079 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8080 | 8080 |
conftest$ac_exeext conftest.$ac_ext |
8081 | 8081 |
fi |
8082 | 8082 |
{ echo "$as_me:$LINENO: result: $ac_cv_func_shl_load" >&5 |
... | ... |
@@ -8155,7 +8159,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8155 | 8155 |
ac_cv_lib_dld_shl_load=no |
8156 | 8156 |
fi |
8157 | 8157 |
|
8158 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8158 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8159 | 8159 |
conftest$ac_exeext conftest.$ac_ext |
8160 | 8160 |
LIBS=$ac_check_lib_save_LIBS |
8161 | 8161 |
fi |
... | ... |
@@ -8256,7 +8260,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8256 | 8256 |
ac_cv_func_dlopen=no |
8257 | 8257 |
fi |
8258 | 8258 |
|
8259 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8259 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8260 | 8260 |
conftest$ac_exeext conftest.$ac_ext |
8261 | 8261 |
fi |
8262 | 8262 |
{ echo "$as_me:$LINENO: result: $ac_cv_func_dlopen" >&5 |
... | ... |
@@ -8335,7 +8339,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8335 | 8335 |
ac_cv_lib_dl_dlopen=no |
8336 | 8336 |
fi |
8337 | 8337 |
|
8338 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8338 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8339 | 8339 |
conftest$ac_exeext conftest.$ac_ext |
8340 | 8340 |
LIBS=$ac_check_lib_save_LIBS |
8341 | 8341 |
fi |
... | ... |
@@ -8415,7 +8419,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8415 | 8415 |
ac_cv_lib_svld_dlopen=no |
8416 | 8416 |
fi |
8417 | 8417 |
|
8418 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8418 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8419 | 8419 |
conftest$ac_exeext conftest.$ac_ext |
8420 | 8420 |
LIBS=$ac_check_lib_save_LIBS |
8421 | 8421 |
fi |
... | ... |
@@ -8495,7 +8499,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
8495 | 8495 |
ac_cv_lib_dld_dld_link=no |
8496 | 8496 |
fi |
8497 | 8497 |
|
8498 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
8498 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
8499 | 8499 |
conftest$ac_exeext conftest.$ac_ext |
8500 | 8500 |
LIBS=$ac_check_lib_save_LIBS |
8501 | 8501 |
fi |
... | ... |
@@ -8551,7 +8555,7 @@ else |
8551 | 8551 |
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 |
8552 | 8552 |
lt_status=$lt_dlunknown |
8553 | 8553 |
cat > conftest.$ac_ext <<EOF |
8554 |
-#line 8554 "configure" |
|
8554 |
+#line 8558 "configure" |
|
8555 | 8555 |
#include "confdefs.h" |
8556 | 8556 |
|
8557 | 8557 |
#if HAVE_DLFCN_H |
... | ... |
@@ -8649,7 +8653,7 @@ else |
8649 | 8649 |
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 |
8650 | 8650 |
lt_status=$lt_dlunknown |
8651 | 8651 |
cat > conftest.$ac_ext <<EOF |
8652 |
-#line 8652 "configure" |
|
8652 |
+#line 8656 "configure" |
|
8653 | 8653 |
#include "confdefs.h" |
8654 | 8654 |
|
8655 | 8655 |
#if HAVE_DLFCN_H |
... | ... |
@@ -10423,7 +10427,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
10423 | 10423 |
ac_cv_lib_socket_bind=no |
10424 | 10424 |
fi |
10425 | 10425 |
|
10426 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
10426 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
10427 | 10427 |
conftest$ac_exeext conftest.$ac_ext |
10428 | 10428 |
LIBS=$ac_check_lib_save_LIBS |
10429 | 10429 |
fi |
... | ... |
@@ -10504,7 +10508,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
10504 | 10504 |
ac_cv_lib_nsl_gethostent=no |
10505 | 10505 |
fi |
10506 | 10506 |
|
10507 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
10507 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
10508 | 10508 |
conftest$ac_exeext conftest.$ac_ext |
10509 | 10509 |
LIBS=$ac_check_lib_save_LIBS |
10510 | 10510 |
fi |
... | ... |
@@ -10622,7 +10626,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
10622 | 10622 |
eval "$as_ac_var=no" |
10623 | 10623 |
fi |
10624 | 10624 |
|
10625 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
10625 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
10626 | 10626 |
conftest$ac_exeext conftest.$ac_ext |
10627 | 10627 |
fi |
10628 | 10628 |
ac_res=`eval echo '${'$as_ac_var'}'` |
... | ... |
@@ -10897,7 +10901,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
10897 | 10897 |
eval "$as_ac_var=no" |
10898 | 10898 |
fi |
10899 | 10899 |
|
10900 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
10900 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
10901 | 10901 |
conftest$ac_exeext conftest.$ac_ext |
10902 | 10902 |
fi |
10903 | 10903 |
ac_res=`eval echo '${'$as_ac_var'}'` |
... | ... |
@@ -10954,21 +10958,21 @@ $ac_includes_default |
10954 | 10954 |
#include <fcntl.h> |
10955 | 10955 |
#include <sys/mman.h> |
10956 | 10956 |
|
10957 |
-#if !STDC_HEADERS && !HAVE_STDLIB_H |
|
10957 |
+#if !defined STDC_HEADERS && !defined HAVE_STDLIB_H |
|
10958 | 10958 |
char *malloc (); |
10959 | 10959 |
#endif |
10960 | 10960 |
|
10961 | 10961 |
/* This mess was copied from the GNU getpagesize.h. */ |
10962 |
-#if !HAVE_GETPAGESIZE |
|
10962 |
+#ifndef HAVE_GETPAGESIZE |
|
10963 | 10963 |
/* Assume that all systems that can run configure have sys/param.h. */ |
10964 |
-# if !HAVE_SYS_PARAM_H |
|
10964 |
+# ifndef HAVE_SYS_PARAM_H |
|
10965 | 10965 |
# define HAVE_SYS_PARAM_H 1 |
10966 | 10966 |
# endif |
10967 | 10967 |
|
10968 | 10968 |
# ifdef _SC_PAGESIZE |
10969 | 10969 |
# define getpagesize() sysconf(_SC_PAGESIZE) |
10970 | 10970 |
# else /* no _SC_PAGESIZE */ |
10971 |
-# if HAVE_SYS_PARAM_H |
|
10971 |
+# ifdef HAVE_SYS_PARAM_H |
|
10972 | 10972 |
# include <sys/param.h> |
10973 | 10973 |
# ifdef EXEC_PAGESIZE |
10974 | 10974 |
# define getpagesize() EXEC_PAGESIZE |
... | ... |
@@ -11300,7 +11304,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
11300 | 11300 |
ac_cv_func_fseeko=no |
11301 | 11301 |
fi |
11302 | 11302 |
|
11303 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
11303 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
11304 | 11304 |
conftest$ac_exeext conftest.$ac_ext |
11305 | 11305 |
fi |
11306 | 11306 |
{ echo "$as_me:$LINENO: result: $ac_cv_func_fseeko" >&5 |
... | ... |
@@ -11696,7 +11700,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
11696 | 11696 |
ac_cv_lib_z_inflateEnd=no |
11697 | 11697 |
fi |
11698 | 11698 |
|
11699 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
11699 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
11700 | 11700 |
conftest$ac_exeext conftest.$ac_ext |
11701 | 11701 |
LIBS=$ac_check_lib_save_LIBS |
11702 | 11702 |
fi |
... | ... |
@@ -11786,7 +11790,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
11786 | 11786 |
ac_cv_lib_z_inflateEnd=no |
11787 | 11787 |
fi |
11788 | 11788 |
|
11789 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
11789 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
11790 | 11790 |
conftest$ac_exeext conftest.$ac_ext |
11791 | 11791 |
LIBS=$ac_check_lib_save_LIBS |
11792 | 11792 |
fi |
... | ... |
@@ -11888,7 +11892,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
11888 | 11888 |
ac_cv_lib_bz2_bzReadOpen=no |
11889 | 11889 |
fi |
11890 | 11890 |
|
11891 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
11891 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
11892 | 11892 |
conftest$ac_exeext conftest.$ac_ext |
11893 | 11893 |
LIBS=$ac_check_lib_save_LIBS |
11894 | 11894 |
fi |
... | ... |
@@ -12146,7 +12150,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
12146 | 12146 |
ac_cv_lib_sn_sigscan_sn_sigscan_initdb=no |
12147 | 12147 |
fi |
12148 | 12148 |
|
12149 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
12149 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
12150 | 12150 |
conftest$ac_exeext conftest.$ac_ext |
12151 | 12151 |
LIBS=$ac_check_lib_save_LIBS |
12152 | 12152 |
fi |
... | ... |
@@ -12402,7 +12406,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
12402 | 12402 |
ac_cv_lib_resolv___dn_expand=no |
12403 | 12403 |
fi |
12404 | 12404 |
|
12405 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
12405 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
12406 | 12406 |
conftest$ac_exeext conftest.$ac_ext |
12407 | 12407 |
LIBS=$ac_check_lib_save_LIBS |
12408 | 12408 |
fi |
... | ... |
@@ -12484,7 +12488,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
12484 | 12484 |
ac_cv_lib_resolv_dn_expand=no |
12485 | 12485 |
fi |
12486 | 12486 |
|
12487 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
12487 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
12488 | 12488 |
conftest$ac_exeext conftest.$ac_ext |
12489 | 12489 |
LIBS=$ac_check_lib_save_LIBS |
12490 | 12490 |
fi |
... | ... |
@@ -12915,7 +12919,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
12915 | 12915 |
ac_cv_lib_gmp___gmpz_init=no |
12916 | 12916 |
fi |
12917 | 12917 |
|
12918 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
12918 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
12919 | 12919 |
conftest$ac_exeext conftest.$ac_ext |
12920 | 12920 |
LIBS=$ac_check_lib_save_LIBS |
12921 | 12921 |
fi |
... | ... |
@@ -12999,7 +13003,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
12999 | 12999 |
ac_cv_lib_gmp_mpz_init=no |
13000 | 13000 |
fi |
13001 | 13001 |
|
13002 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
13002 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
13003 | 13003 |
conftest$ac_exeext conftest.$ac_ext |
13004 | 13004 |
LIBS=$ac_check_lib_save_LIBS |
13005 | 13005 |
fi |
... | ... |
@@ -13906,7 +13910,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
13906 | 13906 |
ac_cv_lib_milter_mi_stop=no |
13907 | 13907 |
fi |
13908 | 13908 |
|
13909 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
13909 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
13910 | 13910 |
conftest$ac_exeext conftest.$ac_ext |
13911 | 13911 |
LIBS=$ac_check_lib_save_LIBS |
13912 | 13912 |
fi |
... | ... |
@@ -13993,7 +13997,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
13993 | 13993 |
|
13994 | 13994 |
fi |
13995 | 13995 |
|
13996 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
13996 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
13997 | 13997 |
conftest$ac_exeext |
13998 | 13998 |
if test "${ac_cv_search_strlcpy+set}" = set; then |
13999 | 13999 |
break |
... | ... |
@@ -14088,7 +14092,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
14088 | 14088 |
ac_cv_lib_milter_mi_stop=no |
14089 | 14089 |
fi |
14090 | 14090 |
|
14091 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
14091 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
14092 | 14092 |
conftest$ac_exeext conftest.$ac_ext |
14093 | 14093 |
LIBS=$ac_check_lib_save_LIBS |
14094 | 14094 |
fi |
... | ... |
@@ -14407,7 +14411,7 @@ sed 's/^/| /' conftest.$ac_ext >&5 |
14407 | 14407 |
eval "$as_ac_var=no" |
14408 | 14408 |
fi |
14409 | 14409 |
|
14410 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
14410 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
14411 | 14411 |
conftest$ac_exeext conftest.$ac_ext |
14412 | 14412 |
fi |
14413 | 14413 |
ac_res=`eval echo '${'$as_ac_var'}'` |
... | ... |
@@ -14898,11 +14902,11 @@ echo "${ECHO_T}no" >&6; } |
14898 | 14898 |
LIBS=$save_LIBS |
14899 | 14899 |
fi |
14900 | 14900 |
|
14901 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
14901 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
14902 | 14902 |
conftest$ac_exeext conftest.$ac_ext |
14903 | 14903 |
fi |
14904 | 14904 |
|
14905 |
-rm -f core conftest.err conftest.$ac_objext \ |
|
14905 |
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ |
|
14906 | 14906 |
conftest$ac_exeext conftest.$ac_ext |
14907 | 14907 |
else |
14908 | 14908 |
have_wrappers=no |
... | ... |
@@ -15717,10 +15721,10 @@ main () |
15717 | 15717 |
#ifndef __cplusplus |
15718 | 15718 |
/* Ultrix mips cc rejects this. */ |
15719 | 15719 |
typedef int charset[2]; |
15720 |
- const charset x; |
|
15720 |
+ const charset cs; |
|
15721 | 15721 |
/* SunOS 4.1.1 cc rejects this. */ |
15722 |
- char const *const *ccp; |
|
15723 |
- char **p; |
|
15722 |
+ char const *const *pcpcc; |
|
15723 |
+ char **ppc; |
|
15724 | 15724 |
/* NEC SVR4.0.2 mips cc rejects this. */ |
15725 | 15725 |
struct point {int x, y;}; |
15726 | 15726 |
static struct point const zero = {0,0}; |
... | ... |
@@ -15729,11 +15733,11 @@ main () |
15729 | 15729 |
an arm of an if-expression whose if-part is not a constant |
15730 | 15730 |
expression */ |
15731 | 15731 |
const char *g = "string"; |
15732 |
- ccp = &g + (g ? g-g : 0); |
|
15732 |
+ pcpcc = &g + (g ? g-g : 0); |
|
15733 | 15733 |
/* HPUX 7.0 cc rejects these. */ |
15734 |
- ++ccp; |
|
15735 |
- p = (char**) ccp; |
|
15736 |
- ccp = (char const *const *) p; |
|
15734 |
+ ++pcpcc; |
|
15735 |
+ ppc = (char**) pcpcc; |
|
15736 |
+ pcpcc = (char const *const *) ppc; |
|
15737 | 15737 |
{ /* SCO 3.2v4 cc rejects this. */ |
15738 | 15738 |
char *t; |
15739 | 15739 |
char const *s = 0 ? (char *) 0 : (char const *) 0; |
... | ... |
@@ -15760,7 +15764,7 @@ main () |
15760 | 15760 |
const int foo = 10; |
15761 | 15761 |
if (!foo) return 0; |
15762 | 15762 |
} |
15763 |
- return !x[0] && !zero.x; |
|
15763 |
+ return !cs[0] && !zero.x; |
|
15764 | 15764 |
#endif |
15765 | 15765 |
|
15766 | 15766 |
; |
... | ... |
@@ -15925,7 +15929,8 @@ cat >>conftest.$ac_ext <<_ACEOF |
15925 | 15925 |
int |
15926 | 15926 |
main () |
15927 | 15927 |
{ |
15928 |
-#if !BYTE_ORDER || !BIG_ENDIAN || !LITTLE_ENDIAN |
|
15928 |
+#if ! (defined BYTE_ORDER && defined BIG_ENDIAN && defined LITTLE_ENDIAN \ |
|
15929 |
+ && BYTE_ORDER && BIG_ENDIAN && LITTLE_ENDIAN) |
|
15929 | 15930 |
bogus endian macros |
15930 | 15931 |
#endif |
15931 | 15932 |
|
... | ... |
@@ -17023,7 +17028,7 @@ exec 6>&1 |
17023 | 17023 |
# values after options handling. |
17024 | 17024 |
ac_log=" |
17025 | 17025 |
This file was extended by $as_me, which was |
17026 |
-generated by GNU Autoconf 2.60. Invocation command line was |
|
17026 |
+generated by GNU Autoconf 2.60a. Invocation command line was |
|
17027 | 17027 |
|
17028 | 17028 |
CONFIG_FILES = $CONFIG_FILES |
17029 | 17029 |
CONFIG_HEADERS = $CONFIG_HEADERS |
... | ... |
@@ -17052,7 +17057,7 @@ current configuration. |
17052 | 17052 |
Usage: $0 [OPTIONS] [FILE]... |
17053 | 17053 |
|
17054 | 17054 |
-h, --help print this help, then exit |
17055 |
- -V, --version print version number, then exit |
|
17055 |
+ -V, --version print version number and configuration settings, then exit |
|
17056 | 17056 |
-q, --quiet do not print progress messages |
17057 | 17057 |
-d, --debug don't remove temporary files |
17058 | 17058 |
--recheck update $as_me by reconfiguring in the same conditions |
... | ... |
@@ -17076,7 +17081,7 @@ _ACEOF |
17076 | 17076 |
cat >>$CONFIG_STATUS <<_ACEOF |
17077 | 17077 |
ac_cs_version="\\ |
17078 | 17078 |
config.status |
17079 |
-configured by $0, generated by GNU Autoconf 2.60, |
|
17079 |
+configured by $0, generated by GNU Autoconf 2.60a, |
|
17080 | 17080 |
with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" |
17081 | 17081 |
|
17082 | 17082 |
Copyright (C) 2006 Free Software Foundation, Inc. |
... | ... |
@@ -142,6 +142,14 @@ libclamav_la_SOURCES = \ |
142 | 142 |
uuencode.c \ |
143 | 143 |
uuencode.h \ |
144 | 144 |
pst.c \ |
145 |
- pst.h |
|
145 |
+ pst.h \ |
|
146 |
+ phishcheck.c \ |
|
147 |
+ phishcheck.h \ |
|
148 |
+ phish_domaincheck_db.c \ |
|
149 |
+ phish_domaincheck_db.h \ |
|
150 |
+ phish_whitelist.c \ |
|
151 |
+ phish_whitelist.h \ |
|
152 |
+ regex_list.c \ |
|
153 |
+ regex_list.h |
|
146 | 154 |
|
147 | 155 |
lib_LTLIBRARIES = libclamav.la |
... | ... |
@@ -86,7 +86,8 @@ am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \ |
86 | 86 |
packlibs.lo fsg.lo line.lo untar.lo unzip.lo special.lo \ |
87 | 87 |
binhex.lo is_tar.lo tnef.lo unrar15.lo unrarvm.lo unrar.lo \ |
88 | 88 |
unrarfilter.lo unrarppm.lo unrar20.lo unrarcmd.lo pdf.lo \ |
89 |
- spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo |
|
89 |
+ spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo phishcheck.lo \ |
|
90 |
+ phish_domaincheck_db.lo phish_whitelist.lo regex_list.lo |
|
90 | 91 |
libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS) |
91 | 92 |
DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) |
92 | 93 |
depcomp = $(SHELL) $(top_srcdir)/depcomp |
... | ... |
@@ -341,7 +342,15 @@ libclamav_la_SOURCES = \ |
341 | 341 |
uuencode.c \ |
342 | 342 |
uuencode.h \ |
343 | 343 |
pst.c \ |
344 |
- pst.h |
|
344 |
+ pst.h \ |
|
345 |
+ phishcheck.c \ |
|
346 |
+ phishcheck.h \ |
|
347 |
+ phish_domaincheck_db.c \ |
|
348 |
+ phish_domaincheck_db.h \ |
|
349 |
+ phish_whitelist.c \ |
|
350 |
+ phish_whitelist.h \ |
|
351 |
+ regex_list.c \ |
|
352 |
+ regex_list.h |
|
345 | 353 |
|
346 | 354 |
lib_LTLIBRARIES = libclamav.la |
347 | 355 |
all: all-am |
... | ... |
@@ -440,10 +449,14 @@ distclean-compile: |
440 | 440 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pdf.Plo@am__quote@ |
441 | 441 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pe.Plo@am__quote@ |
442 | 442 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/petite.Plo@am__quote@ |
443 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_domaincheck_db.Plo@am__quote@ |
|
444 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_whitelist.Plo@am__quote@ |
|
445 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phishcheck.Plo@am__quote@ |
|
443 | 446 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pst.Plo@am__quote@ |
444 | 447 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/qtmd.Plo@am__quote@ |
445 | 448 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/readdb.Plo@am__quote@ |
446 | 449 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rebuildpe.Plo@am__quote@ |
450 |
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex_list.Plo@am__quote@ |
|
447 | 451 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scanners.Plo@am__quote@ |
448 | 452 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sis.Plo@am__quote@ |
449 | 453 |
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/snprintf.Plo@am__quote@ |
... | ... |
@@ -23,8 +23,10 @@ |
23 | 23 |
typedef struct tag_arguments_tag |
24 | 24 |
{ |
25 | 25 |
int count; |
26 |
+ int scanContents; |
|
26 | 27 |
unsigned char **tag; |
27 | 28 |
unsigned char **value; |
29 |
+ struct blob **contents; |
|
28 | 30 |
} tag_arguments_t; |
29 | 31 |
|
30 | 32 |
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs); |
31 | 33 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,28 @@ |
0 |
+/* |
|
1 |
+ * Phishing module: iana tld list. |
|
2 |
+ * |
|
3 |
+ * Copyright (C) 2006 Torok Edvin <edwintorok@gmail.com> |
|
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#ifndef IANA_TLD_H |
|
23 |
+#define IANA_TLD_H |
|
24 |
+#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNRWYZ]|L[ABCIKRSTUVY]|M[ACDGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|AERO|ARPA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)" |
|
25 |
+#define iana_cctld "(a[dfilmoqrtuwxz]|b[bdeghijmorstwyz]|c[ahlmnosuy]|d[ejkmz]|e[cegrstu]|f[ijr]|g[abdeghilmnprtuwy]|h[nrtu]|i[delnqst]|j[emop]|k[eghimwz]|l[birstuv]|m[acglmnoqrstuvwxyz]|n[aegilopru]|om|p[aehkltwy]|qa|r[ow]|s[cdeginorz]|t[dghjklmnorvwz]|u[agyz]|v[enu]|ws|y[etu])" |
|
26 |
+#endif |
|
27 |
+ |
0 | 28 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,129 @@ |
0 |
+/* |
|
1 |
+ * Phishing module: domain list implementation. |
|
2 |
+ * |
|
3 | ||
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ * $Log: phish_domaincheck_db.c,v $ |
|
21 |
+ * Revision 1.1 2006/09/12 19:38:39 acab |
|
22 |
+ * Phishing module merge - libclamav |
|
23 |
+ * |
|
24 |
+ * Revision 1.3 2006/08/20 21:18:11 edwin |
|
25 |
+ * Added the script used to generate iana_tld.sh |
|
26 |
+ * Added checks for phish_domaincheck_db |
|
27 |
+ * Added phishing module design document from wiki (as discussed with aCaB). |
|
28 |
+ * Updated .wdb/.pdb format documentation (in regex_list.c) |
|
29 |
+ * Fixed some memory leaks in regex_list.c |
|
30 |
+ * IOW: cleanups before the deadline. |
|
31 |
+ * I consider my module to be ready for evaluation now. |
|
32 |
+ * |
|
33 |
+ * Revision 1.2 2006/08/09 16:26:44 edwin |
|
34 |
+ * Forgot to add these files |
|
35 |
+ * |
|
36 |
+ */ |
|
37 |
+ |
|
38 |
+ |
|
39 |
+#if HAVE_CONFIG_H |
|
40 |
+#include "clamav-config.h" |
|
41 |
+#endif |
|
42 |
+ |
|
43 |
+#ifdef CL_EXPERIMENTAL |
|
44 |
+ |
|
45 |
+#ifndef CL_DEBUG |
|
46 |
+#define NDEBUG |
|
47 |
+#endif |
|
48 |
+ |
|
49 |
+#ifdef CL_THREAD_SAFE |
|
50 |
+#ifndef _REENTRANT |
|
51 |
+#define _REENTRANT |
|
52 |
+#endif |
|
53 |
+#endif |
|
54 |
+ |
|
55 |
+#include <stdio.h> |
|
56 |
+#include <stdlib.h> |
|
57 |
+#include <errno.h> |
|
58 |
+#include <assert.h> |
|
59 |
+#include <string.h> |
|
60 |
+#include <strings.h> |
|
61 |
+#include <ctype.h> |
|
62 |
+ |
|
63 |
+#include <limits.h> |
|
64 |
+#include "clamav.h" |
|
65 |
+#include <sys/types.h> |
|
66 |
+ |
|
67 |
+/*#define USE_PCRE*/ |
|
68 |
+#include <regex.h> |
|
69 |
+ |
|
70 |
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) |
|
71 |
+#include <stddef.h> |
|
72 |
+#endif |
|
73 |
+ |
|
74 |
+#include "others.h" |
|
75 |
+#include "defaults.h" |
|
76 |
+#include "str.h" |
|
77 |
+#include "filetypes.h" |
|
78 |
+#include "mbox.h" |
|
79 |
+#include "phish_domaincheck_db.h" |
|
80 |
+#include "regex_list.h" |
|
81 |
+#include "matcher-ac.h" |
|
82 |
+ |
|
83 |
+ |
|
84 |
+static struct regex_matcher domainlist_matcher; |
|
85 |
+ |
|
86 |
+int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags) |
|
87 |
+{ |
|
88 |
+ const char* info; |
|
89 |
+ int rc = regex_list_match(&domainlist_matcher,real_url,display_url,hostOnly,&info); |
|
90 |
+ if(rc && info && info[0]) {/*match successful, and has custom flags*/ |
|
91 |
+ if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) { |
|
92 |
+ unsigned short notwantedflags=0; |
|
93 |
+ sscanf(info,"%hx",&notwantedflags); |
|
94 |
+ *flags &= ~notwantedflags;/* filter unwanted phishcheck flags */ |
|
95 |
+ } |
|
96 |
+ else { |
|
97 |
+ cli_warnmsg("Phishcheck:Unknown flag format in domainlist, 3 hex digits expected"); |
|
98 |
+ } |
|
99 |
+ } |
|
100 |
+ return rc; |
|
101 |
+} |
|
102 |
+ |
|
103 |
+int init_domainlist(void) |
|
104 |
+{ |
|
105 |
+ return init_regex_list(&domainlist_matcher); |
|
106 |
+} |
|
107 |
+ |
|
108 |
+int is_domainlist_ok(void) |
|
109 |
+{ |
|
110 |
+ return is_regex_ok(&domainlist_matcher); |
|
111 |
+} |
|
112 |
+ |
|
113 |
+int cli_loadpdb(FILE* fd,unsigned int options) |
|
114 |
+{ |
|
115 |
+ return load_regex_matcher(&domainlist_matcher,fd,options); |
|
116 |
+} |
|
117 |
+ |
|
118 |
+void domainlist_cleanup(void) |
|
119 |
+{ |
|
120 |
+ regex_list_cleanup(&domainlist_matcher); |
|
121 |
+} |
|
122 |
+ |
|
123 |
+void domainlist_done(void) |
|
124 |
+{ |
|
125 |
+ regex_list_done(&domainlist_matcher); |
|
126 |
+} |
|
127 |
+ |
|
128 |
+#endif |
... | ... |
@@ -19,6 +19,9 @@ |
19 | 19 |
* MA 02110-1301, USA. |
20 | 20 |
* |
21 | 21 |
*/ |
22 |
+ |
|
23 |
+#ifdef CL_EXPERIMENTAL |
|
24 |
+ |
|
22 | 25 |
#ifndef _PHISH_DOMAINCHECK_DB_H |
23 | 26 |
#define _PHISH_DOMAINCHECK_DB_H |
24 | 27 |
|
... | ... |
@@ -31,3 +34,5 @@ int is_domainlist_ok(void); |
31 | 31 |
int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags); |
32 | 32 |
|
33 | 33 |
#endif |
34 |
+ |
|
35 |
+#endif |
34 | 36 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,157 @@ |
0 |
+/* |
|
1 |
+ * Phishing module: whitelist implementation. |
|
2 |
+ * |
|
3 | ||
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ * $Log: phish_whitelist.c,v $ |
|
21 |
+ * Revision 1.1 2006/09/12 19:38:39 acab |
|
22 |
+ * Phishing module merge - libclamav |
|
23 |
+ * |
|
24 |
+ * Revision 1.16 2006/08/06 20:27:07 edwin |
|
25 |
+ * New option to enable phish scan for all domains (disabled by default). |
|
26 |
+ * You will now have to run clamscan --phish-scan-alldomains to have any phishes detected. |
|
27 |
+ * Updated phishcheck control flow to better incorporate the domainlist. |
|
28 |
+ * Updated manpage with new options. |
|
29 |
+ * |
|
30 |
+ * TODO:there is a still-reachable leak in regex_list.c |
|
31 |
+ * |
|
32 |
+ * Revision 1.15 2006/07/31 20:12:30 edwin |
|
33 |
+ * Preliminary support for domain databases (domains to check by phishmodule) |
|
34 |
+ * Better memory allocation failure handling in regex_list |
|
35 |
+ * |
|
36 |
+ */ |
|
37 |
+ |
|
38 |
+#if HAVE_CONFIG_H |
|
39 |
+#include "clamav-config.h" |
|
40 |
+#endif |
|
41 |
+ |
|
42 |
+#ifdef CL_EXPERIMENTAL |
|
43 |
+ |
|
44 |
+#ifndef CL_DEBUG |
|
45 |
+#define NDEBUG |
|
46 |
+#endif |
|
47 |
+ |
|
48 |
+#ifdef CL_THREAD_SAFE |
|
49 |
+#ifndef _REENTRANT |
|
50 |
+#define _REENTRANT |
|
51 |
+#endif |
|
52 |
+#endif |
|
53 |
+ |
|
54 |
+#include <stdio.h> |
|
55 |
+#include <stdlib.h> |
|
56 |
+#include <errno.h> |
|
57 |
+#include <assert.h> |
|
58 |
+#include <string.h> |
|
59 |
+#include <strings.h> |
|
60 |
+#include <ctype.h> |
|
61 |
+ |
|
62 |
+#include <limits.h> |
|
63 |
+#include "clamav.h" |
|
64 |
+#include <sys/types.h> |
|
65 |
+ |
|
66 |
+/*#define USE_PCRE*/ |
|
67 |
+#include <regex.h> |
|
68 |
+ |
|
69 |
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) |
|
70 |
+#include <stddef.h> |
|
71 |
+#endif |
|
72 |
+ |
|
73 |
+#include "others.h" |
|
74 |
+#include "defaults.h" |
|
75 |
+#include "str.h" |
|
76 |
+#include "filetypes.h" |
|
77 |
+#include "mbox.h" |
|
78 |
+#include "phish_whitelist.h" |
|
79 |
+#include "regex_list.h" |
|
80 |
+#include "matcher-ac.h" |
|
81 |
+ |
|
82 |
+ |
|
83 |
+static struct regex_matcher whitelist_matcher; |
|
84 |
+ |
|
85 |
+int whitelist_match(const char* real_url,const char* display_url,int hostOnly) |
|
86 |
+{ |
|
87 |
+ const char* info;/*unused*/ |
|
88 |
+ return regex_list_match(&whitelist_matcher,real_url,display_url,hostOnly,&info); |
|
89 |
+} |
|
90 |
+ |
|
91 |
+int init_whitelist(void) |
|
92 |
+{ |
|
93 |
+ return init_regex_list(&whitelist_matcher); |
|
94 |
+} |
|
95 |
+ |
|
96 |
+int is_whitelist_ok(void) |
|
97 |
+{ |
|
98 |
+ return is_regex_ok(&whitelist_matcher); |
|
99 |
+} |
|
100 |
+ |
|
101 |
+int cli_loadwdb(FILE* fd,unsigned int options) |
|
102 |
+{ |
|
103 |
+ return load_regex_matcher(&whitelist_matcher,fd,options); |
|
104 |
+} |
|
105 |
+ |
|
106 |
+void whitelist_cleanup(void) |
|
107 |
+{ |
|
108 |
+ regex_list_cleanup(&whitelist_matcher); |
|
109 |
+} |
|
110 |
+ |
|
111 |
+void whitelist_done(void) |
|
112 |
+{ |
|
113 |
+ regex_list_done(&whitelist_matcher); |
|
114 |
+} |
|
115 |
+ |
|
116 |
/*
 * Stand-alone test driver for the whitelist code.
 *
 * It is built only when WHITELIST_TEST is defined on the compiler command
 * line. Defining the macro unconditionally in this file would place a main()
 * symbol into the library and clash with every program that links against it.
 * The driver body is currently commented out, so main() simply returns 0.
 */
#ifdef WHITELIST_TEST
int main(int argc,char* argv[])
{
/*	struct tree_node* root=tree_node_alloc(NULL,1);
	const char* info;
	const unsigned char test[]="tesxt";
	setup_matcher();
	root->op=OP_ROOT;
	root->c=0;
	root->next=NULL;
	root->listend=1;
	dump_tree(root);
	add_pattern(&root,"test","1");
	dump_tree(root);
	add_pattern(&root,"tesv","2");
	dump_tree(root);
	add_pattern(&root,"tert","3");
	dump_tree(root);
	add_pattern(&root,"terr+","4");
	dump_tree(root);
	add_pattern(&root,"tes[xy]t","5");
	dump_tree(root);
	match_node(root,test,sizeof(test),&info);
	destroy_tree(root);
	if(info)
		printf("%s\n",info);
	else printf("not found\n");*/
	/*FILE* f=fopen("w.wdb","r");
	init_whitelist();
	load_whitelist(f);
	fclose(f);
	dump_tree(root_regex);
	build_whitelist();
	printf("%d\n",whitelist_match("http://www.google.ro","http://www.google.me.ro",0));
	whitelist_done();*/
	return 0;
}
#endif
|
155 |
+ |
|
156 |
+#endif |
... | ... |
@@ -20,8 +20,10 @@ |
20 | 20 |
* |
21 | 21 |
*/ |
22 | 22 |
|
23 |
-#ifndef _WHITELIST_H |
|
24 |
-#define _WHITELIST_H |
|
23 |
+#ifdef CL_EXPERIMENTAL |
|
24 |
+ |
|
25 |
+#ifndef _PHISH_WHITELIST_H |
|
26 |
+#define _PHISH_WHITELIST_H |
|
25 | 27 |
|
26 | 28 |
int cli_loadwdb(FILE* fd, unsigned int options); |
27 | 29 |
int build_whitelist(void); |
... | ... |
@@ -32,3 +34,5 @@ int is_whitelist_ok(void); |
32 | 32 |
int whitelist_match(const char* real_url,const char* display_url,int hostOnly); |
33 | 33 |
|
34 | 34 |
#endif |
35 |
+ |
|
36 |
+#endif |
35 | 37 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1258 @@ |
0 |
+/* |
|
1 |
+ * Detect phishing, based on URL spoofing detection. |
|
2 |
+ * |
|
3 | ||
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ * $Log: phishcheck.c,v $ |
|
21 |
+ * Revision 1.1 2006/09/12 19:38:39 acab |
|
22 |
+ * Phishing module merge - libclamav |
|
23 |
+ * |
|
24 |
+ * Revision 1.28 2006/09/09 09:49:27 edwin |
|
25 |
+ * Fix Solaris compilation problem |
|
26 |
+ * |
|
27 |
+ * Revision 1.27 2006/08/28 08:43:06 edwin |
|
28 |
+ * Fixed a few minor leaks. |
|
29 |
+ * Valgrind now says:"All heap blocks were freed -- no leaks are possible" |
|
30 |
+ * |
|
31 |
+ * Revision 1.26 2006/08/20 21:18:11 edwin |
|
32 |
+ * Added the script used to generate iana_tld.sh |
|
33 |
+ * Added checks for phish_domaincheck_db |
|
34 |
+ * Added phishing module design document from wiki (as discussed with aCaB). |
|
35 |
+ * Updated .wdb/.pdb format documentation (in regex_list.c) |
|
36 |
+ * Fixed some memory leaks in regex_list.c |
|
37 |
+ * IOW: cleanups before the deadline. |
|
38 |
+ * I consider my module to be ready for evaluation now. |
|
39 |
+ * |
|
40 |
+ * Revision 1.25 2006/08/19 21:08:47 edwin |
|
41 |
+ * Fixed:Forgot to add form tag handling when it contains images. |
|
42 |
+ * Various fixes to get rid of gcc warnings. |
|
43 |
+ * |
|
44 |
+ * Revision 1.24 2006/08/19 13:30:34 edwin |
|
45 |
+ * iana_tld.h was missing from the list of header files. |
|
46 |
+ * commentedout network code (unused currently) |
|
47 |
+ * |
|
48 |
+ * Revision 1.23 2006/08/17 20:31:43 edwin |
|
49 |
+ * Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off. |
|
50 |
+ * Fix a still reachable leak. Remove unneeded build_regex_list export. |
|
51 |
+ * |
|
52 |
+ * Revision 1.22 2006/08/12 14:35:34 edwin |
|
53 |
+ * Fix some compiler warnings. |
|
54 |
+ * Fix an assertion failure in regex_list. |
|
55 |
+ * Interpret display links that start with http|https|ftp, always as an URL. |
|
56 |
+ * |
|
57 |
+ * Revision 1.21 2006/08/06 20:27:07 edwin |
|
58 |
+ * New option to enable phish scan for all domains (disabled by default). |
|
59 |
+ * You will now have to run clamscan --phish-scan-alldomains to have any phishes detected. |
|
60 |
+ * Updated phishcheck control flow to better incorporate the domainlist. |
|
61 |
+ * Updated manpage with new options. |
|
62 |
+ * |
|
63 |
+ * TODO:there is a still-reachable leak in regex_list.c |
|
64 |
+ * |
|
65 |
+ * Revision 1.20 2006/08/01 20:19:14 edwin |
|
66 |
+ * Integrate domainlist check into phishcheck. Warning: enabled by default. |
|
67 |
+ * Regex bracket handling update. |
|
68 |
+ * Better regex paranthesized & alternate expression handling. |
|
69 |
+ * |
|
70 |
+ |
|
71 |
+ * |
|
72 |
+ * Revision 1.19 2006/07/31 20:12:30 edwin |
|
73 |
+ * Preliminary support for domain databases (domains to check by phishmodule) |
|
74 |
+ * Better memory allocation failure handling in regex_list |
|
75 |
+ * |
|
76 |
+ */ |
|
77 |
+ |
|
78 |
+#if HAVE_CONFIG_H |
|
79 |
+#include "clamav-config.h" |
|
80 |
+#endif |
|
81 |
+ |
|
82 |
+#ifdef CL_EXPERIMENTAL |
|
83 |
+ |
|
84 |
+#ifndef CL_DEBUG |
|
85 |
+#define NDEBUG |
|
86 |
+#endif |
|
87 |
+ |
|
88 |
+#ifdef CL_THREAD_SAFE |
|
89 |
+#ifndef _REENTRANT |
|
90 |
+#define _REENTRANT |
|
91 |
+#endif |
|
92 |
+#endif |
|
93 |
+ |
|
94 |
+#include <stdio.h> |
|
95 |
+#include <stdlib.h> |
|
96 |
+#include <errno.h> |
|
97 |
+#include <assert.h> |
|
98 |
+#include <string.h> |
|
99 |
+#include <strings.h> |
|
100 |
+#include <ctype.h> |
|
101 |
+#include <limits.h> |
|
102 |
+#include <clamav.h> |
|
103 |
+#include <netdb.h> |
|
104 |
+#include <netinet/in.h> |
|
105 |
+ |
|
106 |
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) |
|
107 |
+#include <stddef.h> |
|
108 |
+#endif |
|
109 |
+ |
|
110 |
+#include <sys/types.h> |
|
111 |
+#include <sys/socket.h> |
|
112 |
+#include <regex.h> |
|
113 |
+ |
|
114 |
+#include "others.h" |
|
115 |
+#include "defaults.h" |
|
116 |
+#include "str.h" |
|
117 |
+#include "filetypes.h" |
|
118 |
+#include "mbox.h" |
|
119 |
+#include "htmlnorm.h" |
|
120 |
+#include "phishcheck.h" |
|
121 |
+#include "phish_whitelist.h" |
|
122 |
+#include "phish_domaincheck_db.h" |
|
123 |
+#include "iana_tld.h" |
|
124 |
+ |
|
125 |
+#define DOMAIN_REAL 1 |
|
126 |
+#define DOMAIN_DISPLAY 0 |
|
127 |
+ |
|
128 |
+#define PHISHY_USERNAME_IN_URL 1 |
|
129 |
+#define PHISHY_NUMERIC_IP 2 |
|
130 |
+#define REAL_IS_MAILTO 4 |
|
131 |
+/* this is just a flag, so that the displayed url will be parsed as mailto too, for example |
|
132 |
+ * <a href='mailto:somebody@yahoo.com'>to:somebody@yahoo.com</a>*/ |
|
133 |
+#define DOMAIN_LISTED 8 |
|
134 |
+#define PHISHY_CLOAKED_NULL 16 |
|
135 |
+#define PHISHY_HEX_URL 32 |
|
136 |
+ |
|
137 |
+ |
|
138 |
+/* |
|
139 |
+* Phishing design documentation, |
|
140 |
+(initially written at http://wiki.clamav.net/index.php/phishing_design as discussed with aCaB) |
|
141 |
+ |
|
142 |
+*Warning*: if flag *--phish-scan-alldomains* (or equivalent clamd/clamav-milter config option) isn't given, then phishing scanning is done only for domains listed in daily.pdb. |
|
143 |
+If your daily.pdb is empty, then by default NO PHISHING is DONE, UNLESS you give the *--phish-scan-alldomains* |
|
144 |
+This is just a side-effect, daily.pdb is empty, because it isn't yet officially in daily.cvd. |
|
145 |
+ |
|
146 |
+phishingCheck() determines if @displayedLink is a legit representation of @realLink. |
|
147 |
+ |
|
148 |
+Steps: |
|
149 |
+ |
|
150 |
+1. if _realLink_ *==* _displayLink_ => *CLEAN* |
|
151 |
+ |
|
152 |
+2. url cleanup (normalization) |
|
153 |
+- whitespace elimination |
|
154 |
+- html entity conversion |
|
155 |
+- convert hostname to lowercase |
|
156 |
+- normalize \ to / |
|
157 |
+If there is a dot after the last space, then all spaces are replaced with dots, |
|
158 |
+otherwise spaces are stripped. |
|
159 |
+So both: 'Go to yahoo.com', and 'Go to e b a y . c o m', and 'Go to ebay. com' will work. |
|
160 |
+ |
|
161 |
+ |
|
162 |
+3. Matched the urls against a _whitelist_: |
|
163 |
+a _realLink_, _displayedLink_ pair is matched against the _whitelist_. |
|
164 |
+the _whitelist_ is a list of pairs of realLink, displayedLink. Any of the elements of those pairs can be a _regex_. |
|
165 |
+ if url *is found* in _whitelist_ --> *CLEAN* |
|
166 |
+ |
|
167 |
+4. URL is looked up in the _domainlist_, unless disabled via flags (_--phish-scan-alldomains_). |
|
168 |
+The _domainlist_ is a list of pairs of realLink, displayedLink (any of which can be regex). |
|
169 |
+This is the list of domains we do phishing detection for (such as ebay,paypal,chase,....) |
|
170 |
+We can't decide to stop processing here or not, so we just set a flag. |
|
171 |
+ |
|
172 |
+Note(*!*): the flags are modified by the domainlist checker. If a domain is found, then the flags associated with it filter the default compile-time flags. |
|
173 |
+ |
|
174 |
+5. _Hostname_ is extracted from the _displayed URL_. |
|
175 |
+It is checked against the _whitelist_, and _domainlist_. |
|
176 |
+ |
|
177 |
+6. Now we know if we want to stop processing. |
|
178 |
+If we are only scanning domains in the _domainlist_ (default behaviour), and the url/domain |
|
179 |
+isn't found in it, we return (and mark url as not_list/clean). |
|
180 |
+If we scan all domains, then the domainlist isn't even checked. |
|
181 |
+ |
|
182 |
+7. URL cloak check. |
|
183 |
+check for %00, and hex-encoded IPs in URL. |
|
184 |
+ |
|
185 |
+8. Skip empty displayedURLs |
|
186 |
+ |
|
187 |
+9. SSL mismatch detection. |
|
188 |
+Checks if realLink is http, but displayedLink is https or viceversa. |
|
189 |
+(by default the SSL detection is done for hrefs only, not for imgs) |
|
190 |
+ |
|
191 |
+10. Hostname of real URL is extracted. |
|
192 |
+ |
|
193 |
+11. Skip cid: displayedLink urls (images embedded in mails). |
|
194 |
+ |
|
195 |
+12. Numeric IP detection. |
|
196 |
+If url is a numeric IP, then -> phish. |
|
197 |
+Maybe we should do DNS lookup? |
|
198 |
+Maybe we should disable numericIP checks for --phish-scan-alldomains? |
|
199 |
+ |
|
200 |
+13. isURL(displayedLink). |
|
201 |
+Checks if displayedLink is really a url. |
|
202 |
+if not -> clean |
|
203 |
+ |
|
204 |
+14. Hostnames of real, displayedLink are compared. If equal -> clean |
|
205 |
+ |
|
206 |
+15. Extract domain names, and compare. If equal -> clean |
|
207 |
+ |
|
208 |
+16. Do DNS lookups/reverse lookups. Disabled now (too much load/too many lookups). * |
|
209 |
+ |
|
210 |
+For the Whitelist(.wdb)/Domainlist(.pdb) format see regex_list.c (search for Flags) |
|
211 |
+ * |
|
212 |
+ */ |
|
213 |
+static char empty_string[]=""; |
|
214 |
+ |
|
215 |
+void url_check_init(struct url_check* urls) |
|
216 |
+{ |
|
217 |
+ urls->realLink.refcount=0; |
|
218 |
+ urls->realLink.data=empty_string; |
|
219 |
+ urls->realLink.ref=NULL; |
|
220 |
+ urls->displayLink.refcount=0; |
|
221 |
+ urls->displayLink.data=empty_string; |
|
222 |
+ urls->displayLink.ref=NULL; |
|
223 |
+} |
|
224 |
+ |
|
225 |
+/* string reference counting implementation, |
|
226 |
+ * so that: we don't have to keep in mind who allocated what, and when needs to be freed, |
|
227 |
+ * and thus we won't leek memory*/ |
|
228 |
+ |
|
229 |
+inline void string_free(struct string* str) |
|
230 |
+{ |
|
231 |
+ for(;;){ |
|
232 |
+ str->refcount--; |
|
233 |
+ if(!str->refcount) { |
|
234 |
+ if(str->ref)/* don't free, this is a portion of another string */ |
|
235 |
+ str=str->ref;/* try to free that one*/ |
|
236 |
+ else { |
|
237 |
+ free(str->data); |
|
238 |
+ break; |
|
239 |
+ } |
|
240 |
+ } |
|
241 |
+ else break; |
|
242 |
+ } |
|
243 |
+} |
|
244 |
+ |
|
245 |
+/* always use the string_assign when assigning to a string, this makes sure the old one's refcount is decremented*/ |
|
246 |
+void string_assign(struct string* dest,struct string* src) |
|
247 |
+{ |
|
248 |
+ string_free(dest); |
|
249 |
+ src->refcount++; |
|
250 |
+ dest->data=src->data; |
|
251 |
+ dest->refcount=1; |
|
252 |
+ dest->ref=src; |
|
253 |
+} |
|
254 |
+ |
|
255 |
+/* data will be freed when string freed */ |
|
256 |
+void string_assign_c(struct string* dest,char* data) |
|
257 |
+{ |
|
258 |
+ string_free(dest); |
|
259 |
+ dest->data=data; |
|
260 |
+ dest->ref=NULL; |
|
261 |
+ dest->refcount=1; |
|
262 |
+} |
|
263 |
+ |
|
264 |
+/* same as above, but it doesn't free old string, use only for initialization |
|
265 |
+ * Doesn't allow NULL pointers, they are replaced by pointer to empty string |
|
266 |
+ * */ |
|
267 |
+inline void string_init_c(struct string* dest,char* data) |
|
268 |
+{ |
|
269 |
+ dest->refcount = 1; |
|
270 |
+ dest->data = data ? data : empty_string; |
|
271 |
+ dest->ref = NULL; |
|
272 |
+} |
|
273 |
+ |
|
274 |
+/* make a copy of the string between start -> end*/ |
|
275 |
+inline void string_assign_dup(struct string* dest,const char* start,const char* end) |
|
276 |
+{ |
|
277 |
+ char* ret = cli_malloc(end-start+1); |
|
278 |
+ strncpy(ret,start,end-start); |
|
279 |
+ ret[end-start]='\0'; |
|
280 |
+ |
|
281 |
+ string_free(dest); |
|
282 |
+ dest->data=ret; |
|
283 |
+ dest->refcount=1; |
|
284 |
+ dest->ref=NULL; |
|
285 |
+} |
|
286 |
+ |
|
287 |
+inline void string_assign_null(struct string* dest) |
|
288 |
+{ |
|
289 |
+ string_free(dest); |
|
290 |
+ dest->data=empty_string; |
|
291 |
+ dest->refcount=-1;/* don't free it! */ |
|
292 |
+ dest->ref=NULL; |
|
293 |
+} |
|
294 |
+ |
|
295 |
+/* this string uses portion of another string*/ |
|
296 |
+void string_assign_ref(struct string* dest,struct string* ref,char* data) |
|
297 |
+{ |
|
298 |
+ string_free(dest); |
|
299 |
+ ref->refcount++; |
|
300 |
+ dest->data=data; |
|
301 |
+ dest->refcount=1; |
|
302 |
+ dest->ref=ref; |
|
303 |
+} |
|
304 |
+ |
|
305 |
+inline void free_if_needed(struct url_check* url) |
|
306 |
+{ |
|
307 |
+ string_free(&url->realLink); |
|
308 |
+ string_free(&url->displayLink); |
|
309 |
+} |
|
310 |
+ |
|
311 |
+static int phish_disabled = 0;/* disabled due to fatal startup error */ |
|
312 |
+static int build_regex(regex_t** preg,const char* regex,int nosub) |
|
313 |
+{ |
|
314 |
+ int rc; |
|
315 |
+ *preg = cli_malloc(sizeof(**preg)); |
|
316 |
+ cli_dbgmsg("Compiling regex:%s\n",regex); |
|
317 |
+ rc = regcomp(*preg,regex,REG_EXTENDED|REG_ICASE|(nosub ? REG_NOSUB :0)); |
|
318 |
+ if(rc) { |
|
319 |
+ size_t buflen = regerror(rc,*preg,NULL,0); |
|
320 |
+ char* errbuf = cli_malloc(buflen); |
|
321 |
+ regerror(rc,*preg,errbuf,buflen); |
|
322 |
+ cli_errmsg("Error in compiling regex:%s\nDisabling phishing checks\n",errbuf); |
|
323 |
+ free(errbuf); |
|
324 |
+ free(*preg); |
|
325 |
+ *preg=NULL; |
|
326 |
+ phish_disabled=1; |
|
327 |
+ return 1; |
|
328 |
+ } |
|
329 |
+ return 0; |
|
330 |
+} |
|
331 |
+ |
|
332 |
+ |
|
333 |
+/*static regex_t* host_preg = NULL; |
|
334 |
+static const char* host_regex="cid:.+|mailto:(.+)|([[:alpha:]]+://)?(([^:/?]+@)+([^:/?]+)([:/?].+)?|([^@:/?]+)([:/?].+)?)"; <- this is slower than the function below |
|
335 |
+*/ |
|
336 |
+/* allocates memory */ |
|
337 |
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy) |
|
338 |
+{ |
|
339 |
+ const char mailto[] = "mailto:"; |
|
340 |
+ int ismailto = 0; |
|
341 |
+ const char* start; |
|
342 |
+ const char* end=NULL; |
|
343 |
+ if(!URL) { |
|
344 |
+ string_assign_null(dest); |
|
345 |
+ return; |
|
346 |
+ } |
|
347 |
+ start = strstr(URL,"://"); |
|
348 |
+ if(!start) { |
|
349 |
+ if(!strncmp(URL,mailto,sizeof(mailto)-1)) { |
|
350 |
+ start = URL + sizeof(mailto)-1; |
|
351 |
+ ismailto = 1; |
|
352 |
+ } |
|
353 |
+ else if (!isReal && *phishy&REAL_IS_MAILTO) { |
|
354 |
+ /* it is not required to use mailto: in the displayed url, they might use to:, or whatever */ |
|
355 |
+ end = URL+strlen(URL)+1; |
|
356 |
+ start = URL + strcspn(URL,": ")+1; |
|
357 |
+ if (start==end) |
|
358 |
+ start = URL; |
|
359 |
+ ismailto = 1; |
|
360 |
+ } |
|
361 |
+ else { |
|
362 |
+/* if(!strncmp(URL,"cid:",4)) {handled in phishcheck |
|
363 |
+ string_assign_null(dest); |
|
364 |
+ return;* cid: image, nothing to verify |
|
365 |
+ } |
|
366 |
+*/ |
|
367 |
+ start=URL;/*URL without protocol*/ |
|
368 |
+ if(isReal) |
|
369 |
+ cli_dbgmsg("PH:Real URL without protocol:%s\n",URL); |
|
370 |
+ else ismailto=2;/*no-protocol, might be mailto, @ is no problem*/ |
|
371 |
+ } |
|
372 |
+ } |
|
373 |
+ else start += 3;/* :// */ |
|
374 |
+ |
|
375 |
+ if(!ismailto || !isReal) { |
|
376 |
+ const char* realhost; |
|
377 |
+ do { |
|
378 |
+ end = start+strcspn(start,":/?"); |
|
379 |
+ realhost = strchr(start,'@'); |
|
380 |
+ if(start!=end && realhost>end) realhost = NULL;/*don't check beyond end of hostname*/ |
|
381 |
+ if(realhost) { |
|
382 |
+ const char* tld = strrchr(realhost,'.'); |
|
383 |
+ if(tld && isTLD(tld,tld-realhost-1)) |
|
384 |
+ *phishy |= PHISHY_USERNAME_IN_URL;/* if the url contains a username that is there just to fool people, |
|
385 |
+ like http://www.ebay.com@somevilplace.someevildomain.com/ */ |
|
386 |
+ start=realhost+1;/*skip the username*/ |
|
387 |
+ } |
|
388 |
+ } while(realhost);/*skip over multiple @ characters, text following last @ character is the real host*/ |
|
389 |
+ } |
|
390 |
+ else |
|
391 |
+ if (ismailto && isReal) |
|
392 |
+ *phishy |= REAL_IS_MAILTO; |
|
393 |
+ |
|
394 |
+ if(!end) { |
|
395 |
+ end = start+strcspn(start,":/?");/*especially important for mailto:somebody@yahoo.com?subject=...*/ |
|
396 |
+ if(!end) |
|
397 |
+ end = start + strlen(start); |
|
398 |
+ } |
|
399 |
+ |
|
400 |
+ string_assign_dup(dest,start,end); |
|
401 |
+} |
|
402 |
+ |
|
403 |
+static regex_t* preg = NULL; |
|
404 |
+static regex_t* preg_tld = NULL; |
|
405 |
+static regex_t* preg_cctld = NULL; |
|
406 |
+static regex_t* preg_numeric = NULL; |
|
407 |
+ |
|
408 |
+static const char tld_regex[] = "^"iana_tld"$"; |
|
409 |
+static const char cctld_regex[] = "^"iana_cctld"$"; |
|
410 |
+ |
|
411 |
+int isCountryCode(const char* str) |
|
412 |
+{ |
|
413 |
+ if(!preg_cctld) { |
|
414 |
+ if(build_regex(&preg_cctld,cctld_regex,1)) |
|
415 |
+ return -1; |
|
416 |
+ } |
|
417 |
+ return str ? !regexec(preg_cctld,str,0,NULL,0) : 0; |
|
418 |
+} |
|
419 |
+ |
|
420 |
+int isTLD(const char* str,int len) |
|
421 |
+{ |
|
422 |
+ if (!str) |
|
423 |
+ return 0; |
|
424 |
+ else { |
|
425 |
+ char* s = cli_malloc(len+1); |
|
426 |
+ int rc; |
|
427 |
+ strncpy(s,str,len); |
|
428 |
+ s[len]='\0'; |
|
429 |
+ if(!preg_tld) { |
|
430 |
+ if(build_regex(&preg_tld,tld_regex,1)) |
|
431 |
+ return -1; |
|
432 |
+ } |
|
433 |
+ rc = !regexec(preg_tld,s,0,NULL,0); |
|
434 |
+ free(s); |
|
435 |
+ return rc; |
|
436 |
+ } |
|
437 |
+} |
|
438 |
+ |
|
439 |
/*
 * memrchr isn't standard, so I use this.
 *
 * Search backwards for @c in start[0..len] (index @len itself is examined
 * too) and return a pointer to the last occurrence, or NULL if there is none.
 *
 * The scan is index based so no pointer below @start is ever formed. A @len
 * of (size_t)-1, which callers can produce with "offset - 1" when the offset
 * is 0, means "nothing to search" and yields NULL.
 */
char* rfind(char* start,char c,size_t len)
{
	size_t i = len+1;/* wraps to 0 for len == (size_t)-1 */
	while(i-- > 0) {
		if(start[i]==c)
			return start+i;
	}
	return NULL;
}
|
448 |
+ |
|
449 |
+void get_domain(struct string* dest,struct string* host) |
|
450 |
+{ |
|
451 |
+ char* domain; |
|
452 |
+ char* tld = strrchr(host->data,'.'); |
|
453 |
+ if(!tld) { |
|
454 |
+ cli_dbgmsg("PH:What? A host without a tld? (%s)\n",host->data); |
|
455 |
+ string_assign(dest,host); |
|
456 |
+ return; |
|
457 |
+ } |
|
458 |
+ if(isCountryCode(tld+1)) { |
|
459 |
+ const char* countrycode=tld+1; |
|
460 |
+ tld = rfind(host->data,'.',tld-host->data-1); |
|
461 |
+ if(!tld) { |
|
462 |
+ cli_dbgmsg("PH:Weird, a name with only 2 levels (%s)\n",host); |
|
463 |
+ string_assign(dest,host); |
|
464 |
+ return; |
|
465 |
+ } |
|
466 |
+ if(!isTLD(tld+1,countrycode-tld-1)) { |
|
467 |
+ string_assign_ref(dest,host,tld+1); |
|
468 |
+ return;/*it was a name like: subdomain.domain.uk, return domain.uk*/ |
|
469 |
+ } |
|
470 |
+ } |
|
471 |
+ /*we need to strip one more level, this is the actual domain*/ |
|
472 |
+ domain = rfind(host->data,'.',tld-host->data-1); |
|
473 |
+ if(!domain) { |
|
474 |
+ string_assign(dest,host); |
|
475 |
+ return;/* it was like sourceforge.net?*/ |
|
476 |
+ } |
|
477 |
+ string_assign_ref(dest,host,domain+1); |
|
478 |
+} |
|
479 |
+ |
|
480 |
+ |
|
481 |
+/* |
|
482 |
+int ip_reverse(struct url_check* urls,int isReal) |
|
483 |
+{ |
|
484 |
+ const char* host = isReal ? urls->realLink.data : urls->displayLink.data; |
|
485 |
+ struct hostent *he = gethostbyname (host); |
|
486 |
+ if (he) |
|
487 |
+ { |
|
488 |
+ char *addr = 0; |
|
489 |
+ switch (he->h_addrtype) |
|
490 |
+ { |
|
491 |
+ case AF_INET: |
|
492 |
+ addr = inet_ntoa (*(struct in_addr *) he->h_addr); |
|
493 |
+ break; |
|
494 |
+ } |
|
495 |
+ if (addr && strcmp (he->h_name, addr) == 0) |
|
496 |
+ { |
|
497 |
+ char *h_addr_copy = strdup (he->h_addr); |
|
498 |
+ if (h_addr_copy == NULL) |
|
499 |
+ he = NULL; |
|
500 |
+ else |
|
501 |
+ { |
|
502 |
+ he = gethostbyaddr (h_addr_copy, he->h_length, he->h_addrtype); |
|
503 |
+ free (h_addr_copy); |
|
504 |
+ } |
|
505 |
+ } |
|
506 |
+ if (he) |
|
507 |
+ string_assign_dup(isReal ? &urls->realLink : &urls->displayLink,he->h_name,he->h_name+strlen(he->h_name)); |
|
508 |
+ } |
|
509 |
+ return 0; |
|
510 |
+} |
|
511 |
+* frees its argument, and allocates memory* |
|
512 |
+void reverse_lookup(struct url_check* url,int isReal) |
|
513 |
+{ |
|
514 |
+ ip_reverse(url,isReal); |
|
515 |
+} |
|
516 |
+*/ |
|
517 |
/*
 * Return 1 when @host is a dotted-quad IPv4 address (four decimal fields,
 * each in 0..255, with nothing following), 0 otherwise.
 */
int isNumeric(const char* host)
{
	size_t len = strlen(host);
	int a,b,c,d,n=0;
	/* 1.2.3.4 -> 7*/
	/* 127.127.127.127 -> 15*/
	if(len<7 || len>15)
		return 0;
	/* %n records how much was consumed, so trailing text can be rejected */
	if(sscanf(host,"%d.%d.%d.%d%n",&a,&b,&c,&d,&n) != 4)
		return 0;
	if(n<0 || (size_t)n!=len)
		return 0;
	/* an octet is at most 255 */
	if(a>=0 && a<=255 && b>=0 && b<=255 && c>=0 && c<=255 && d>=0 && d<=255)
		return 1;
	return 0;
}
|
531 |
+ |
|
532 |
/* Return 1 when @URL starts with "https://" (case-sensitive), 0 otherwise or for NULL. */
int isSSL(const char* URL)
{
	static const char scheme[]="https://";
	if(!URL)
		return 0;
	return strncmp(scheme,URL,sizeof(scheme)-1) == 0;
}
|
537 |
+ |
|
538 |
static int hexinited=0;
static short int hextable[256];

/*
 * Combine the two characters at @src into one byte using hextable:
 * the entry for src[0] supplies the high nibble, the entry for src[1] the low one.
 * hextable must have been set up beforehand (asserted via hexinited).
 */
static inline char hex2int(const unsigned char* src)
{
	int high;
	int low;
	assert(hexinited);
	high = hextable[src[0]]<<4;
	low = hextable[src[1]];
	return high | low;
}
|
545 |
+ |
|
546 |
+ |
|
547 |
/*
 * Replace %xx escapes between *begin and *end by the byte hex2int() yields
 * for the two characters after the '%', in place.
 *
 * A leading escape is handled separately: its value is written over its last
 * character and *begin is advanced by two. For every later escape the value
 * overwrites the '%', the rest of the string is moved two characters to the
 * left and the end pointer shrinks by two. *begin and *end are updated to the
 * new bounds.
 */
static void str_hex_to_char(char** begin,const char** end)
{
	char* cur = *begin;
	const char* limit = *end;
	assert(limit>cur);
	/* convert leading %xx*/
	if (cur[0] == '%') {
		cur[2] = hex2int((unsigned char*)cur+1);
		cur += 2;
	}
	*begin = cur++;
	for(; cur+3 < limit; cur++) {
		while(cur+3<limit && cur[0]=='%') {
			const char* rest = cur+3;
			*cur = hex2int((unsigned char*)cur+1);
			/* close the two-character gap left by the escape */
			memmove(cur+1,rest,limit-rest+1);
			limit -= 2;
		}
	}
	*end = limit;
}
|
572 |
/*
 * Deletes @what from the string delimited by *begin and *end, in place.
 * @what_len: length of @what, excluding the terminating \0
 *
 * A leading occurrence is skipped by advancing *begin, trailing occurrences
 * by moving *end back, and occurrences in between are removed by moving the
 * rest of the string to the left. Nothing is done when the range is shorter
 * than @what.
 */
static void str_strip(char** begin,const char** end,const char* what,size_t what_len)
{
	char* cur = *begin;
	const char* limit = *end;
	const char* tail;
	size_t remaining = what_len;

	assert(limit>cur);
	if(limit < cur + what_len)
		return;

	/* strip leading @what */
	while(remaining && strncmp(cur,what,remaining) == 0) {
		cur += what_len;
		remaining = (remaining > what_len) ? remaining - what_len : 0;
	}

	/* strip trailing @what */
	for(tail = limit - what_len; tail>cur && strncmp(tail,what,what_len) == 0; tail -= what_len)
		limit -= what_len;

	*begin = cur++;
	for(; cur+what_len < limit; cur++) {
		while(cur+what_len<limit && strncmp(cur,what,what_len) == 0) {
			const char* rest = cur+what_len;
			/* move string */
			memmove(cur,rest,limit-rest+1);
			limit -= what_len;
		}
	}
	*end = limit;
}
|
608 |
+ |
|
609 |
+static const char dotnet[] = ".net"; |
|
610 |
+static const char adonet[] = "ado.net"; |
|
611 |
+static const char aspnet[] = "asp.net"; |
|
612 |
+static const char lt[]="<"; |
|
613 |
+static const char gt[]=">"; |
|
614 |
+static const size_t dotnet_len = sizeof(dotnet)-1; |
|
615 |
+static const size_t adonet_len = sizeof(adonet)-1; |
|
616 |
+static const size_t aspnet_len = sizeof(aspnet)-1; |
|
617 |
+static const size_t lt_len = sizeof(lt)-1; |
|
618 |
+static const size_t gt_len = sizeof(gt)-1; |
|
619 |
+ |
|
620 |
/* replace every occurrence of @c in [@str,@end) with @r */
static inline void str_replace(char* str,const char* end,char c,char r)
{
	char* cur = str;
	while(cur<end) {
		if(*cur==c)
			*cur=r;
		cur++;
	}
}
|
628 |
/*
 * Convert the first @len characters of @str to lowercase, in place.
 * Each character is passed to tolower() as an unsigned char value, because
 * the <ctype.h> functions are undefined for negative arguments other than EOF.
 */
static inline void str_make_lowercase(char* str,size_t len)
{
	for(;len;str++,len--) {
		*str = (char)tolower((unsigned char)*str);
	}
}
|
634 |
+ |
|
635 |
#define fix32(x) ((x)<32 ? 32 : (x))

/*
 * Clear the top bit of every character of the NUL-terminated string @begin;
 * any result below 32 is replaced by 32 (a space).
 */
static inline void clear_msb(char* begin)
{
	char* cur = begin;
	while(*cur) {
		const int low7 = (*cur)&0x7f;
		*cur = low7<32 ? 32 : low7;
		cur++;
	}
}
|
641 |
+ |
|
642 |
/*
 * Yahoo in particular puts links like this in mails:
 *   http:/ /mail.yahoo.com
 * So first step: delete the space between / /
 *
 * Next there could be links like this:
 *   <a href="phishlink">w w w . e b a y . c o m</a>
 * Here we need to strip spaces to get this picked up.
 *
 * Next there are links like:
 *   <a href="www.yahoo.com">Check out yahoo.com</a>
 * Here we add a ., so we get: check.out.yahoo.com (it won't trigger)
 *
 * Rule for adding .: if the substring from the right contains a dot, then
 * replace the spaces by dots, otherwise strip the spaces.
 */
static inline void str_fixup_spaces(char **begin,const char** end)
{
	char* sp = strchr(*begin,' ');
	/* strip any number of spaces after / */
	while(sp>*begin && sp[-1]=='/' && sp[0]==' ' && sp<*end) {
		memmove(sp,sp+1,*end-sp+1);
		(*end)--;
	}

	/* starting at the last space, walk right until a dot or the end is reached */
	sp = rfind(*begin,' ',*end-*begin);
	while(sp && sp[0]!='.' && sp<*end)
		sp++;

	if(!sp || sp[0]!='.')
		str_strip(begin,end," ",1);
	else
		str_replace(*begin,*end,' ','.');
}
|
673 |
+ |
|
674 |
+/* allocates memory */ |
|
675 |
+void cleanupURL(struct string* URL,int isReal) |
|
676 |
+{ |
|
677 |
+ char* begin = URL->data; |
|
678 |
+ const char* end; |
|
679 |
+ size_t len; |
|
680 |
+ clear_msb(begin); |
|
681 |
+/* if(!URL->data) |
|
682 |
+ return;*/ |
|
683 |
+ /*TODO: handle hex-encoded IPs*/ |
|
684 |
+ while(isspace(*begin)) begin++; |
|
685 |
+ len=strlen(begin); |
|
686 |
+ end = begin+len-1; |
|
687 |
+ /*cli_dbgmsg("%d\n",end-begin);*/ |
|
688 |
+ if(begin>=end) { |
|
689 |
+ string_assign_null(URL); |
|
690 |
+ return; |
|
691 |
+ } |
|
692 |
+ while(isspace(*end)) |
|
693 |
+ end--; |
|
694 |
+ /*TODO: convert \ to /, and stuff like that*/ |
|
695 |
+ /* From mailscanner, my comments enclosed in {} */ |
|
696 |
+ if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len)) |
|
697 |
+ string_assign_null(URL); |
|
698 |
+ else { |
|
699 |
+ size_t host_len; |
|
700 |
+ char* host_begin; |
|
701 |
+ str_replace(begin,end,'\\','/'); |
|
702 |
+ str_strip(&begin,&end,"\"",1); |
|
703 |
+ str_strip(&begin,&end,lt,lt_len); |
|
704 |
+ str_strip(&begin,&end,gt,gt_len); |
|
705 |
+ /* convert hostname to lowercase, but only hostname! */ |
|
706 |
+ host_begin = strchr(begin,':'); |
|
707 |
+ while(host_begin && host_begin[1]=='/') host_begin++; |
|
708 |
+ if(!host_begin) host_begin=begin; |
|
709 |
+ else host_begin++; |
|
710 |
+ host_len = strcspn(host_begin,"/?"); |
|
711 |
+ str_make_lowercase(host_begin,host_len); |
|
712 |
+ /* convert %xx to real value */ |
|
713 |
+ str_hex_to_char(&begin,&end); |
|
714 |
+ str_fixup_spaces(&begin,&end); |
|
715 |
+ string_assign_dup(URL,begin,end+1); |
|
716 |
+ /*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/ |
|
717 |
+ } |
|
718 |
+} |
|
719 |
+ |
|
720 |
+void get_redirected_URL(struct string* URL) |
|
721 |
+{ |
|
722 |
+ /*TODO: see if URL redirects sowhere, if so, then follow |
|
723 |
+ returns redirected URL*/ |
|
724 |
+} |
|
725 |
+ |
|
726 |
+static inline int is_phish_disabled(void) |
|
727 |
+{ |
|
728 |
+ if (phish_disabled) |
|
729 |
+ return 1; |
|
730 |
+ else if (!is_whitelist_ok()) { |
|
731 |
+ phish_disabled = 1; |
|
732 |
+ return 1; |
|
733 |
+ } |
|
734 |
+ else return 0; |
|
735 |
+} |
|
736 |
+ |
|
737 |
+static void init_hextable(void) |
|
738 |
+{ |
|
739 |
+ unsigned char c; |
|
740 |
+ memset(hextable,0,256); |
|
741 |
+ for(c='0';c<='9';c++) |
|
742 |
+ hextable[c] = c-'0'; |
|
743 |
+ for(c='a';c<='z';c++) |
|
744 |
+ hextable[c] = 10+c-'a'; |
|
745 |
+ for(c='A';c<='Z';c++) |
|
746 |
+ hextable[c] = 10+c-'A'; |
|
747 |
+ hexinited=1; |
|
748 |
+} |
|
749 |
+ |
|
750 |
+int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs) |
|
751 |
+{ |
|
752 |
+ const char src_text[]="src"; |
|
753 |
+ const char href_text[]="href"; |
|
754 |
+ const size_t href_text_len = sizeof(href_text); |
|
755 |
+ const size_t src_text_len = sizeof(src_text); |
|
756 |
+ int i; |
|
757 |
+ if(is_phish_disabled()) |
|
758 |
+ return 0; |
|
759 |
+ if(!hexinited) { |
|
760 |
+ init_hextable(); |
|
761 |
+ atexit(phishing_done);/*TODO: replace this with a proper phishing_done call from manager.c*/ |
|
762 |
+ } |
|
763 |
+ |
|
764 |
+ *ctx->virname=NULL; |
|
765 |
+ for(i=0;i<hrefs->count;i++) |
|
766 |
+ if(hrefs->contents[i]) { |
|
767 |
+ struct url_check urls; |
|
768 |
+ enum phish_status rc; |
|
769 |
+ urls.flags = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS; |
|
770 |
+ if (!(urls.flags&CHECK_IMG_URL) && !strncmp((char*)hrefs->tag[i],src_text,src_text_len)) |
|
771 |
+ continue; |
|
772 |
+ if (ctx->options&CL_PHISH_NO_DOMAINLIST) |
|
773 |
+ urls.flags &= ~DOMAINLIST_REQUIRED; |
|
774 |
+ string_init_c(&urls.realLink,(char*)hrefs->value[i]); |
|
775 |
+/* if(!hrefs->contents[i]->isClosed) { |
|
776 |
+ blobAddData(hrefs->contents[i],empty_string,1); |
|
777 |
+ blobClose(hrefs->contents[i]); |
|
778 |
+ }*/ |
|
779 |
+ string_init_c(&urls.displayLink,(char*)blobGetData(hrefs->contents[i])); |
|
780 |
+ assert(!urls.displayLink.data[blobGetDataSize(hrefs->contents[i])-1]); |
|
781 |
+/* assert(strlen(urls.displayLink.data) < blobGetDataSize(hrefs->contents[i]));*/ |
|
782 |
+ urls.realLink.refcount=-1; |
|
783 |
+ urls.displayLink.refcount=-1;/*don't free these, caller will free*/ |
|
784 |
+ if(strcmp((char*)hrefs->tag[i],"href")) { |
|
785 |
+ char *url; |
|
786 |
+ url = urls.realLink.data; |
|
787 |
+ urls.realLink.data = urls.displayLink.data; |
|
788 |
+ urls.displayLink.data = url; |
|
789 |
+ } |
|
790 |
+ |
|
791 |
+ rc = phishingCheck(&urls); |
|
792 |
+ if(phish_disabled) |
|
793 |
+ return 0; |
|
794 |
+ free_if_needed(&urls); |
|
795 |
+ cli_dbgmsg("Phishing scan result:%s\n",phishing_ret_toString(rc)); |
|
796 |
+ switch(rc)/*TODO: support flags from ctx->options,*/ |
|
797 |
+ { |
|
798 |
+ case CL_PHISH_CLEAN: |
|
799 |
+ case CL_PHISH_CLEANUP_OK: |
|
800 |
+ case CL_PHISH_HOST_OK: |
|
801 |
+ case CL_PHISH_DOMAIN_OK: |
|
802 |
+ case CL_PHISH_REDIR_OK: |
|
803 |
+ case CL_PHISH_HOST_REDIR_OK: |
|
804 |
+ case CL_PHISH_DOMAIN_REDIR_OK: |
|
805 |
+ case CL_PHISH_HOST_REVERSE_OK: |
|
806 |
+ case CL_PHISH_DOMAIN_REVERSE_OK: |
|
807 |
+ case CL_PHISH_WHITELISTED: |
|
808 |
+ case CL_PHISH_HOST_WHITELISTED: |
|
809 |
+ case CL_PHISH_MAILTO_OK: |
|
810 |
+ case CL_PHISH_TEXTURL: |
|
811 |
+ case CL_PHISH_HOST_NOT_LISTED: |
|
812 |
+ case CL_PHISH_CLEAN_CID: |
|
813 |
+ continue; |
|
814 |
+/* break;*/ |
|
815 |
+ case CL_PHISH_HEX_URL: |
|
816 |
+ *ctx->virname="Phishing.Email.HexURL"; |
|
817 |
+ return CL_VIRUS; |
|
818 |
+/* break;*/ |
|
819 |
+ case CL_PHISH_NUMERIC_IP: |
|
820 |
+ *ctx->virname="Phishing.Email.Cloaked.NumericIP"; |
|
821 |
+ return CL_VIRUS; |
|
822 |
+ case CL_PHISH_CLOAKED_NULL: |
|
823 |
+ *ctx->virname="Phishing.Email.Cloaked.Null";/*http://www.real.com%01%00@www.evil.com*/ |
|
824 |
+ return CL_VIRUS; |
|
825 |
+ case CL_PHISH_SSL_SPOOF: |
|
826 |
+ *ctx->virname="Phishing.Email.SSL-Spoof"; |
|
827 |
+ return CL_VIRUS; |
|
828 |
+ case CL_PHISH_CLOAKED_UIU: |
|
829 |
+ *ctx->virname="Phishing.Email.Cloaked.Username";/*http://www.ebay.com@www.evil.com*/ |
|
830 |
+ return CL_VIRUS; |
|
831 |
+ case CL_PHISH_NOMATCH: |
|
832 |
+ default: |
|
833 |
+ *ctx->virname="Phishing.Email"; |
|
834 |
+ return CL_VIRUS; |
|
835 |
+ } |
|
836 |
+ } |
|
837 |
+ else |
|
838 |
+ if(strcmp((char*)hrefs->tag[i],"href")) |
|
839 |
+ cli_dbgmsg("PH:href with no contents?\n"); |
|
840 |
+ return 0;/*texturlfound?CL_VIRUS:0;*/ |
|
841 |
+} |
|
842 |
+ |
|
843 |
/*
 * Concatenate the three NUL-terminated strings @a, @b and @c into a newly
 * allocated buffer.
 *
 * Returns the NUL-terminated result, which the caller must free(), or NULL
 * if the allocation fails.
 */
static char* str_compose(const char* a,const char* b,const char* c)
{
	const size_t a_len = strlen(a);
	const size_t b_len = strlen(b);
	const size_t c_len = strlen(c);
	char* concated = malloc(a_len+b_len+c_len+1);

	if(!concated)
		return NULL;
	memcpy(concated,a,a_len);
	memcpy(concated+a_len,b,b_len);
	/* +1 copies the terminating NUL of @c as well */
	memcpy(concated+a_len+b_len,c,c_len+1);
	return concated;
}
|
856 |
+ |
|
857 |
+/*static const char* url_regex="^ *([[:alnum:]%_-]+:(//)?)?([[:alnum:]%_-]@)*[[:alnum:]%_-]+\\.([[:alnum:]%_-]+\\.)*[[:alnum:]_%-]+(/[[:alnum:];:@$=?&/.,%_-]+) *$";*/ |
|
858 |
+/* for urls, including mailto: urls, and (broken) http:www... style urls*/ |
|
859 |
+/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html |
|
860 |
+ * Modifications: don't allow empty domains/subdomains, such as www..com <- that is no url |
|
861 |
+ * So the 'safe' char class has been split up |
|
862 |
+ * */ |
|
863 |
+/* character classes */ |
|
864 |
+#define URI_alpha "a-zA-Z" |
|
865 |
+#define URI_digit "0-9" |
|
866 |
+#define URI_safe_nodot "-$_@&" |
|
867 |
+#define URI_safe "-$_@.&" |
|
868 |
+#define URI_extra "!*\"'()," |
|
869 |
+#define URI_reserved "=;/#?: " |
|
870 |
+#define URI_national "{}|[]\\^~" |
|
871 |
+#define URI_punctuation "<>" |
|
872 |
+ |
|
873 |
/* one hexadecimal digit; the upper-case range must end at 'F' (a range ending
 * at lower-case 'f' would also accept G-Z and the punctuation between 'Z' and 'a') */
#define URI_hex "[0-9a-fA-F]"
|
874 |
+#define URI_escape "%"URI_hex"{2}" |
|
875 |
+#define URI_xalpha "([" URI_safe URI_alpha URI_digit URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */ |
|
876 |
+#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")" |
|
877 |
+ |
|
878 |
+#define URI_xalphas URI_xalpha"+" |
|
879 |
+#define URI_xalphas_nodot URI_xalpha_nodot"*" |
|
880 |
+ |
|
881 |
+#define URI_ialpha "["URI_alpha"]"URI_xalphas_nodot"" |
|
882 |
+#define URI_xpalpha URI_xalpha"|\\+" |
|
883 |
+#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+" |
|
884 |
+#define URI_xpalphas "("URI_xpalpha")+" |
|
885 |
+#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+" |
|
886 |
+ |
|
887 |
+#define URI_scheme URI_ialpha |
|
888 |
+#define URI_tld iana_tld |
|
889 |
+#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*" |
|
890 |
+#define URI_path2 URI_tld |
|
891 |
+#define URI_path3 "(/("URI_xpalphas"/?)*)?" |
|
892 |
+ |
|
893 |
+#define URI_search "("URI_xalphas"\\+)*" |
|
894 |
+#define URI_fragmentid URI_xalphas |
|
895 |
+ |
|
896 |
+#define URI_IP_digits "["URI_digit"]{1,3}" |
|
897 |
+#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?" |
|
898 |
+#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?" |
|
899 |
+#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?" |
|
900 |
+ |
|
901 |
+#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1 |
|
902 |
+#define URI_URI2 URI_path2 |
|
903 |
+#define URI_URI3 URI_path3"(\\?" URI_search")?" |
|
904 |
+ |
|
905 |
+#define URI_fragmentaddress1 URI_URI1 |
|
906 |
+#define URI_fragmentaddress2 URI_URI2 |
|
907 |
+#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?" |
|
908 |
+ |
|
909 |
+#define URI_CHECK_PROTOCOLS "(http|https|ftp)://.+" |
|
910 |
+ |
|
911 |
+/*Warning: take care when modifying this regex, it has been tweaked, and tuned, just don't break it please. |
|
912 |
+ * there is fragmentaddress1, and 2 to work around the ISO limitation of 509 bytes max length for string constants*/ |
|
913 |
+static char* url_regex = NULL; |
|
914 |
+static const char numeric_url_regex[] = "^ *"URI_numeric_fragmentaddress" *$"; |
|
915 |
+/* |
|
916 |
+ * Only those URLs are identified as URLs for which phishing detection can be performed. |
|
917 |
+ * This means that no attempt is made to properly recognize 'cid:' URLs |
|
918 |
+ */ |
|
919 |
+int isURL(const char* URL) |
|
920 |
+{ |
|
921 |
+ if(!preg) { |
|
922 |
+ url_regex = str_compose("^ *("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3"|"URI_CHECK_PROTOCOLS") *$"); |
|
923 |
+ if(build_regex(&preg,url_regex,1)) |
|
924 |
+ return -1; |
|
925 |
+ } |
|
926 |
+ return URL ? !regexec(preg,URL,0,NULL,0) : 0; |
|
927 |
+} |
|
928 |
+ |
|
929 |
+int isNumericURL(const char* URL) |
|
930 |
+{ |
|
931 |
+ if(!preg_numeric) { |
|
932 |
+ if(build_regex(&preg_numeric,numeric_url_regex,1)) |
|
933 |
+ return -1; |
|
934 |
+ } |
|
935 |
+ return URL ? !regexec(preg_numeric,URL,0,NULL,0) : 0; |
|
936 |
+} |
|
937 |
+ |
|
938 |
+/* Cleans up @urls |
|
939 |
+ * If URLs are identical after cleanup it will return CL_PHISH_CLEANUP_OK. |
|
940 |
+ * */ |
|
941 |
+enum phish_status cleanupURLs(struct url_check* urls) |
|
942 |
+{ |
|
943 |
+ if(urls->flags&CLEANUP_URL) { |
|
944 |
+ cleanupURL(&urls->realLink,1); |
|
945 |
+ cleanupURL(&urls->displayLink,0); |
|
946 |
+ if(!urls->displayLink.data || !urls->realLink.data) |
|
947 |
+ return CL_PHISH_NODECISION; |
|
948 |
+ if(!strcmp(urls->realLink.data,urls->displayLink.data)) |
|
949 |
+ return CL_PHISH_CLEANUP_OK; |
|
950 |
+ } |
|
951 |
+ return CL_PHISH_NODECISION; |
|
952 |
+} |
|
953 |
+ |
|
954 |
+ |
|
955 |
+enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy) |
|
956 |
+{ |
|
957 |
+ struct string* host = isReal ? &host_url->realLink : &host_url->displayLink; |
|
958 |
+ get_host(host,isReal ? url->realLink.data : url->displayLink.data, isReal,phishy); |
|
959 |
+ if(!host->data) |
|
960 |
+ return CL_PHISH_CLEANUP_OK; |
|
961 |
+ if(*phishy&REAL_IS_MAILTO) |
|
962 |
+ return CL_PHISH_MAILTO_OK; |
|
963 |
+ if(strchr(host->data,' ')) { |
|
964 |
+ string_free(host); |
|
965 |
+ return CL_PHISH_TEXTURL; |
|
966 |
+ } |
|
967 |
+ if(isReal && (!strncmp(host->data,"0x",2) || !strncmp(host->data,"0X",2))) { |
|
968 |
+ string_free(host); |
|
969 |
+ return CL_PHISH_HEX_URL; |
|
970 |
+ } |
|
971 |
+ if(isReal && host->data[0]=='\0') |
|
972 |
+ return CL_PHISH_CLEAN;/* link without domain, such as: href="/isapi.dll?... */ |
|
973 |
+ if(isNumeric(host->data)) { |
|
974 |
+ *phishy |= PHISHY_NUMERIC_IP; |
|
975 |
+/* if(url->flags&DO_REVERSE_LOOKUP) |
|
976 |
+ reverse_lookup(host_url,isReal);*/ |
|
977 |
+ } |
|
978 |
+ return CL_PHISH_NODECISION; |
|
979 |
+} |
|
980 |
+ |
|
981 |
+ |
|
982 |
+void url_get_domain(struct url_check* url,struct url_check* domains) |
|
983 |
+{ |
|
984 |
+ get_domain(&domains->realLink, &url->realLink); |
|
985 |
+ get_domain(&domains->displayLink, &url->displayLink); |
|
986 |
+ domains->flags = url->flags; |
|
987 |
+} |
|
988 |
+ |
|
989 |
+enum phish_status phishy_map(int phishy,enum phish_status fallback) |
|
990 |
+{ |
|
991 |
+ if(phishy&PHISHY_USERNAME_IN_URL) |
|
992 |
+ return CL_PHISH_CLOAKED_UIU; |
|
993 |
+ else if(phishy&PHISHY_NUMERIC_IP) |
|
994 |
+ return CL_PHISH_NUMERIC_IP; |
|
995 |
+ else |
|
996 |
+ return fallback; |
|
997 |
+} |
|
998 |
+ |
|
999 |
/*
 * Tell whether @url consists mostly of numeric character references.
 *
 * Every span starting at "&#" and ending at the next ';' is counted; the
 * function returns 1 when those spans cover more than 70% of the bytes of
 * @url, and 0 otherwise (including for the empty string).
 *
 * Counting entities instead of the bytes they cover could never exceed the
 * 70% threshold, because each entity is at least three bytes long.
 */
int isEncoded(const char* url)
{
	const char* cur = url;
	const size_t total = strlen(url);
	size_t covered = 0;

	for(;;) {
		const char* ent = strstr(cur,"&#");
		const char* semi;

		if(!ent)
			break;
		semi = strchr(ent+2,';');
		if(!semi)
			break;	/* unterminated entity: nothing more to count */
		covered += (size_t)(semi-ent)+1;
		cur = semi+1;
	}
	return covered > total*7/10;/*more than 70% made up of &#;*/
}
|
1012 |
+ |
|
1013 |
/*
 * Release the compiled regex stored in the slot @p points to (regfree()
 * followed by free()) and reset the slot to NULL.
 * Does nothing when @p is NULL or the slot is already NULL.
 */
static void free_regex(regex_t** p)
{
	regex_t* compiled;

	if(!p || !*p)
		return;
	compiled = *p;
	regfree(compiled);
	free(compiled);
	*p = NULL;
}
|
1023 |
+ |
|
1024 |
+void phishing_done(void) |
|
1025 |
+{ |
|
1026 |
+ free_regex(&preg); |
|
1027 |
+ free_regex(&preg_cctld); |
|
1028 |
+ free_regex(&preg_tld); |
|
1029 |
+ free_regex(&preg_numeric); |
|
1030 |
+ whitelist_done(); |
|
1031 |
+ domainlist_done(); |
|
1032 |
+ if(url_regex) |
|
1033 |
+ free(url_regex); |
|
1034 |
+} |
|
1035 |
+ |
|
1036 |
+int whitelist_check(struct url_check* urls,int hostOnly) |
|
1037 |
+{ |
|
1038 |
+ return whitelist_match(urls->realLink.data,urls->displayLink.data,hostOnly); |
|
1039 |
+} |
|
1040 |
+ |
|
1041 |
+/* urls can't contain null pointer, caller must ensure this */ |
|
1042 |
+enum phish_status phishingCheck(struct url_check* urls) |
|
1043 |
+{ |
|
1044 |
+ struct url_check host_url; |
|
1045 |
+ const char cid[] = "cid:"; |
|
1046 |
+ const size_t cid_len = sizeof(cid)-1; |
|
1047 |
+ enum phish_status rc=CL_PHISH_NODECISION; |
|
1048 |
+ int phishy=0; |
|
1049 |
+ if(!urls->realLink.data) |
|
1050 |
+ return CL_PHISH_CLEAN; |
|
1051 |
+ cli_dbgmsg("\nPH:Checking url %s->%s \n",urls->realLink.data,urls->displayLink.data); |
|
1052 |
+ |
|
1053 |
+ if(!strcmp(urls->realLink.data,urls->displayLink.data)) |
|
1054 |
+ return CL_PHISH_CLEAN;/* displayed and real URL are identical -> clean */ |
|
1055 |
+ |
|
1056 |
+ if((rc = cleanupURLs(urls))) { |
|
1057 |
+ assert(!isPhishing(rc));/* not allowed to decide this is phishing */ |
|
1058 |
+ return rc;/* URLs identical after cleanup */ |
|
1059 |
+ } |
|
1060 |
+ |
|
1061 |
+ if(whitelist_check(urls,0)) |
|
1062 |
+ return CL_PHISH_WHITELISTED;/* if url is whitelist don't perform further checks */ |
|
1063 |
+ |
|
1064 |
+ if(urls->flags&DOMAINLIST_REQUIRED && domainlist_match(urls->realLink.data,urls->displayLink.data,0,&urls->flags)) |
|
1065 |
+ phishy |= DOMAIN_LISTED; |
|
1066 |
+ else { |
|
1067 |
+ /* although entire url is not listed, the host might be, |
|
1068 |
+ * so defer phishing decisions till we know if host is listed*/ |
|
1069 |
+ } |
|
1070 |
+ |
|
1071 |
+ url_check_init(&host_url); |
|
1072 |
+ |
|
1073 |
+ if((rc = url_get_host(urls,&host_url,DOMAIN_DISPLAY,&phishy))) { |
|
1074 |
+ free_if_needed(&host_url); |
|
1075 |
+ assert(!isPhishing(rc)); |
|
1076 |
+ return rc; |
|
1077 |
+ } |
|
1078 |
+ |
|
1079 |
+ if(whitelist_check(&host_url,1)) { |
|
1080 |
+ free_if_needed(&host_url); |
|
1081 |
+ return CL_PHISH_HOST_WHITELISTED; |
|
1082 |
+ } |
|
1083 |
+ |
|
1084 |
+ if(urls->flags&DOMAINLIST_REQUIRED) { |
|
1085 |
+ if(!(phishy&DOMAIN_LISTED)) { |
|
1086 |
+ if(domainlist_match(urls->displayLink.data,urls->realLink.data,1,&urls->flags)) |
|
1087 |
+ phishy |= DOMAIN_LISTED; |
|
1088 |
+ else { |
|
1089 |
+ free_if_needed(&host_url); |
|
1090 |
+ return CL_PHISH_HOST_NOT_LISTED; |
|
1091 |
+ } |
|
1092 |
+ } |
|
1093 |
+ } |
|
1094 |
+ |
|
1095 |
+ if(urls->flags&CHECK_CLOAKING) { |
|
1096 |
+ /*Checks if URL is cloaked. |
|
1097 |
+ Should we check if it containts another http://, https://? |
|
1098 |
+ No because we might get false positives from redirect services.*/ |
|
1099 |
+ if(strstr(urls->realLink.data,"%00")) { |
|
1100 |
+ free_if_needed(&host_url); |
|
1101 |
+ return CL_PHISH_CLOAKED_NULL; |
|
1102 |
+ } |
|
1103 |
+ if(isEncoded(urls->displayLink.data)) { |
|
1104 |
+ free_if_needed(&host_url); |
|
1105 |
+ return CL_PHISH_HEX_URL; |
|
1106 |
+ } |
|
1107 |
+ } |
|
1108 |
+ |
|
1109 |
+ if(urls->displayLink.data[0]=='\0') { |
|
1110 |
+ free_if_needed(&host_url); |
|
1111 |
+ return CL_PHISH_CLEAN; |
|
1112 |
+ } |
|
1113 |
+ |
|
1114 |
+ if(urls->flags&CHECK_SSL && isSSL(urls->displayLink.data) && !isSSL(urls->realLink.data)) { |
|
1115 |
+ free_if_needed(&host_url); |
|
1116 |
+ return CL_PHISH_SSL_SPOOF; |
|
1117 |
+ } |
|
1118 |
+ |
|
1119 |
+ if((rc = url_get_host(urls,&host_url,DOMAIN_REAL,&phishy))) |
|
1120 |
+ { |
|
1121 |
+ free_if_needed(&host_url); |
|
1122 |
+ return rc; |
|
1123 |
+ } |
|
1124 |
+ |
|
1125 |
+ if(!strncmp(urls->displayLink.data,cid,cid_len))/* cid: image */{ |
|
1126 |
+ free_if_needed(&host_url); |
|
1127 |
+ return CL_PHISH_CLEAN_CID; |
|
1128 |
+ } |
|
1129 |
+ |
|
1130 |
+ if(!isURL(urls->displayLink.data) && |
|
1131 |
+ ( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(urls->displayLink.data)) || |
|
1132 |
+ !(phishy&PHISHY_NUMERIC_IP))) { |
|
1133 |
+ free_if_needed(&host_url); |
|
1134 |
+ return CL_PHISH_TEXTURL; |
|
1135 |
+ } |
|
1136 |
+ |
|
1137 |
+ if(urls->flags&HOST_SUFFICIENT) { |
|
1138 |
+ if(!strcmp(urls->realLink.data,urls->displayLink.data)) { |
|
1139 |
+ free_if_needed(&host_url); |
|
1140 |
+ return CL_PHISH_HOST_OK; |
|
1141 |
+ } |
|
1142 |
+ |
|
1143 |
+ |
|
1144 |
+ if(urls->flags&DOMAIN_SUFFICIENT) { |
|
1145 |
+ struct url_check domain_url; |
|
1146 |
+ url_check_init(&domain_url); |
|
1147 |
+ url_get_domain(&host_url,&domain_url); |
|
1148 |
+ if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) { |
|
1149 |
+ free_if_needed(&host_url); |
|
1150 |
+ free_if_needed(&domain_url); |
|
1151 |
+ return CL_PHISH_DOMAIN_OK; |
|
1152 |
+ } |
|
1153 |
+ free_if_needed(&domain_url); |
|
1154 |
+ } |
|
1155 |
+ |
|
1156 |
+ /*if(urls->flags&CHECK_REDIR) { |
|
1157 |
+ //see where the realLink redirects, and compare that with the displayed Link |
|
1158 |
+ const uchar* redirectedURL = getRedirectedURL(urls->realLink); |
|
1159 |
+ if(urls->needsfree) |
|
1160 |
+ free(urls->realLink); |
|
1161 |
+ urls->realLink = redirectedURL; |
|
1162 |
+ |
|
1163 |
+ if(!strcmp(urls->realLink,urls->displayLink)) |
|
1164 |
+ return CL_PHISH_REDIR_OK; |
|
1165 |
+ |
|
1166 |
+ if(urls->flags&HOST_SUFFICIENT) { |
|
1167 |
+ if(rc = url_get_host(urls,&host_url,DOMAIN_REAL)) |
|
1168 |
+ if(!strcmp(host_url.realLink,host_url.displayLink)) { |
|
1169 |
+ free_if_needed(&host_url); |
|
1170 |
+ return CL_PHISH_HOST_REDIR_OK; |
|
1171 |
+ } |
|
1172 |
+ if(urls->flags&DOMAIN_SUFFICIENT) { |
|
1173 |
+ struct url_check domain_url; |
|
1174 |
+ url_get_domain(&host_url,&domain_url); |
|
1175 |
+ if(!strcmp(domain_url.realLink,domain_url.displayLink)) { |
|
1176 |
+ free_if_needed(&host_url); |
|
1177 |
+ free_if_needed(&domain_url); |
|
1178 |
+ return CL_PHISH_DOMAIN_REDIR_OK; |
|
1179 |
+ } |
|
1180 |
+ } |
|
1181 |
+ }//HOST_SUFFICIENT&CHECK_REDIR |
|
1182 |
+ } |
|
1183 |
+ free_if_needed(&host_url);*/ |
|
1184 |
+ /* if(urls->flags&CHECK_DOMAIN_REVERSE) { |
|
1185 |
+ //do a DNS lookup of the domain, and see what IP it corresponds to |
|
1186 |
+ //then do a reverse lookup on the IP, and see what domain you get |
|
1187 |
+ //There are some corporate signatures that mix different domains belonging to same company |
|
1188 |
+ struct url_check domain_url; |
|
1189 |
+ url_check_init(&domain_url); |
|
1190 |
+ if(!dns_to_ip_and_reverse(&host_url,DOMAIN_DISPLAY)) { |
|
1191 |
+ if(!strcmp(host_url.realLink.data,host_url.displayLink.data)) { |
|
1192 |
+ free_if_needed(&host_url); |
|
1193 |
+ return CL_PHISH_HOST_REVERSE_OK; |
|
1194 |
+ } |
|
1195 |
+ if(urls->flags&DOMAIN_SUFFICIENT) { |
|
1196 |
+ url_get_domain(&host_url,&domain_url); |
|
1197 |
+ if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) { |
|
1198 |
+ free_if_needed(&host_url); |
|
1199 |
+ free_if_needed(&domain_url); |
|
1200 |
+ return CL_PHISH_DOMAIN_REVERSE_OK; |
|
1201 |
+ } |
|
1202 |
+ free_if_needed(&domain_url); |
|
1203 |
+ } |
|
1204 |
+ } |
|
1205 |
+ }*/ |
|
1206 |
+ free_if_needed(&host_url); |
|
1207 |
+ }/*HOST_SUFFICIENT*/ |
|
1208 |
+ /*we failed to find a reason why the 2 URLs are different, this is definetely phishing*/ |
|
1209 |
+ return phishy_map(phishy,CL_PHISH_NOMATCH); |
|
1210 |
+} |
|
1211 |
+ |
|
1212 |
+const char* phishing_ret_toString(enum phish_status rc) |
|
1213 |
+{ |
|
1214 |
+ switch(rc) { |
|
1215 |
+ case CL_PHISH_CLEAN: |
|
1216 |
+ return "Clean"; |
|
1217 |
+ case CL_PHISH_CLEANUP_OK: |
|
1218 |
+ return "URLs match after cleanup"; |
|
1219 |
+ case CL_PHISH_WHITELISTED: |
|
1220 |
+ return "URL is whitelisted"; |
|
1221 |
+ case CL_PHISH_HOST_WHITELISTED: |
|
1222 |
+ return "host part of URL is whitelist"; |
|
1223 |
+ case CL_PHISH_HOST_OK: |
|
1224 |
+ return "Hosts match"; |
|
1225 |
+ case CL_PHISH_DOMAIN_OK: |
|
1226 |
+ return "Domains match"; |
|
1227 |
+ case CL_PHISH_REDIR_OK: |
|
1228 |
+ return "After redirecting realURL, they match"; |
|
1229 |
+ case CL_PHISH_HOST_REDIR_OK: |
|
1230 |
+ return "After redirecting realURL, hosts match"; |
|
1231 |
+ case CL_PHISH_DOMAIN_REDIR_OK: |
|
1232 |
+ return "After redirecting the domains match"; |
|
1233 |
+ case CL_PHISH_MAILTO_OK: |
|
1234 |
+ return "URL is mailto"; |
|
1235 |
+ case CL_PHISH_NUMERIC_IP: |
|
1236 |
+ return "IP address encountered in hostname"; |
|
1237 |
+ case CL_PHISH_TEXTURL: |
|
1238 |
+ return "Displayed link is not an URL, can't check if phishing or not"; |
|
1239 |
+ case CL_PHISH_CLOAKED_NULL: |
|
1240 |
+ return "Link URL is cloaked (null byte %00)"; |
|
1241 |
+ case CL_PHISH_CLOAKED_UIU: |
|
1242 |
+ return "Link URL contains username, and real<->displayed hosts don't match."; |
|
1243 |
+ /*username is a legit domain, and after the @ comes the evil one*/ |
|
1244 |
+ case CL_PHISH_SSL_SPOOF: |
|
1245 |
+ return "Visible links is SSL, real link is not"; |
|
1246 |
+ case CL_PHISH_NOMATCH: |
|
1247 |
+ return "URLs are way too different"; |
|
1248 |
+ case CL_PHISH_HOST_NOT_LISTED: |
|
1249 |
+ return "Host not listed in .pdb -> not checked"; |
|
1250 |
+ case CL_PHISH_CLEAN_CID: |
|
1251 |
+ return "Embedded image in mail -> clean"; |
|
1252 |
+ default: |
|
1253 |
+ return "Unknown return code"; |
|
1254 |
+ } |
|
1255 |
+} |
|
1256 |
+ |
|
1257 |
+#endif |
0 | 1258 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,130 @@ |
0 |
+/* |
|
1 | ||
2 |
+ * |
|
3 |
+ * This program is free software; you can redistribute it and/or modify |
|
4 |
+ * it under the terms of the GNU General Public License as published by |
|
5 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ * (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * This program is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ * GNU General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU General Public License |
|
14 |
+ * along with this program; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
16 |
+ * MA 02110-1301, USA. |
|
17 |
+ */ |
|
18 |
+ |
|
19 |
+#ifdef CL_EXPERIMENTAL |
|
20 |
+ |
|
21 |
+#ifndef _PHISH_CHECK_H |
|
22 |
+#define _PHISH_CHECK_H |
|
23 |
+ |
|
24 |
+ |
|
25 |
+#define CL_PHISH_BASE 100 |
|
26 |
+enum phish_status {CL_PHISH_NODECISION=0,CL_PHISH_CLEAN=CL_PHISH_BASE, CL_PHISH_CLEANUP_OK,CL_PHISH_HOST_OK, CL_PHISH_DOMAIN_OK, |
|
27 |
+ CL_PHISH_HOST_NOT_LISTED, |
|
28 |
+ CL_PHISH_REDIR_OK, CL_PHISH_HOST_REDIR_OK, CL_PHISH_DOMAIN_REDIR_OK, |
|
29 |
+ CL_PHISH_HOST_REVERSE_OK,CL_PHISH_DOMAIN_REVERSE_OK, |
|
30 |
+ CL_PHISH_WHITELISTED,CL_PHISH_HOST_WHITELISTED, |
|
31 |
+ CL_PHISH_CLEAN_CID, |
|
32 |
+ CL_PHISH_TEXTURL, CL_PHISH_MAILTO_OK, |
|
33 |
+ CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP,CL_PHISH_HEX_URL,CL_PHISH_CLOAKED_NULL,CL_PHISH_SSL_SPOOF, CL_PHISH_NOMATCH}; |
|
34 |
+ |
|
35 |
+#define HOST_SUFFICIENT 1 |
|
36 |
+#define DOMAIN_SUFFICIENT (HOST_SUFFICIENT | 2) |
|
37 |
+#define DO_REVERSE_LOOKUP 4 |
|
38 |
+#define CHECK_REDIR 8 |
|
39 |
+#define CHECK_SSL 16 |
|
40 |
+#define CHECK_CLOAKING 32 |
|
41 |
+#define CLEANUP_URL 64 |
|
42 |
+#define CHECK_DOMAIN_REVERSE 128 |
|
43 |
+#define CHECK_IMG_URL 256 |
|
44 |
+#define DOMAINLIST_REQUIRED 512 |
|
45 |
+/* img checking disabled by default */ |
|
46 |
+ |
|
47 |
+ |
|
48 |
+#define CL_PHISH_ALL_CHECKS (CLEANUP_URL|DOMAIN_SUFFICIENT|CHECK_SSL|CHECK_CLOAKING|DOMAINLIST_REQUIRED|CHECK_IMG_URL) |
|
49 |
+ |
|
50 |
+struct string { |
|
51 |
+ int refcount; |
|
52 |
+ struct string* ref; |
|
53 |
+ char* data; |
|
54 |
+}; |
|
55 |
+ |
|
56 |
+struct url_check { |
|
57 |
+ struct string realLink; |
|
58 |
+ struct string displayLink; |
|
59 |
+ unsigned short flags; |
|
60 |
+}; |
|
61 |
+ |
|
62 |
+int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs); |
|
63 |
+enum phish_status phishingCheck(struct url_check* urls); |
|
64 |
+ |
|
65 |
+int whitelist_check(struct url_check* urls,int hostOnly); |
|
66 |
+void url_check_init(struct url_check* urls); |
|
67 |
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy); |
|
68 |
+void string_free(struct string* str); |
|
69 |
+void string_assign(struct string* dest,struct string* src); |
|
70 |
+void string_assign_c(struct string* dest,char* data); |
|
71 |
+void string_init_c(struct string* dest,char* data); |
|
72 |
+void string_assign_dup(struct string* dest,const char* start,const char* end); |
|
73 |
+void string_assign_null(struct string* dest); |
|
74 |
+void string_assign_ref(struct string* dest,struct string* ref,char* data); |
|
75 |
+void free_if_needed(struct url_check* url); |
|
76 |
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy); |
|
77 |
+int isCountryCode(const char* str); |
|
78 |
+int isTLD(const char* str,int len); |
|
79 |
+char* rfind(char* start,char c,size_t len); |
|
80 |
+void get_domain(struct string* dest,struct string* host); |
|
81 |
+int ip_reverse(struct url_check* urls,int isReal); |
|
82 |
+void reverse_lookup(struct url_check* url,int isReal); |
|
83 |
+int isNumeric(const char* host); |
|
84 |
+int isSSL(const char* URL); |
|
85 |
+void cleanupURL(struct string* URL,int isReal); |
|
86 |
+void get_redirected_URL(struct string* URL); |
|
87 |
+int isURL(const char* URL); |
|
88 |
+enum phish_status cleanupURLs(struct url_check* urls); |
|
89 |
+int isNumericURL(const char* URL); |
|
90 |
+enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy); |
|
91 |
+void url_get_domain(struct url_check* url,struct url_check* domains); |
|
92 |
+enum phish_status phishy_map(int phishy,enum phish_status fallback); |
|
93 |
+int isEncoded(const char* url); |
|
94 |
+void phishing_done(void); |
|
95 |
+ |
|
96 |
+static inline int isPhishing(enum phish_status rc) |
|
97 |
+{ |
|
98 |
+ switch(rc) { |
|
99 |
+ case CL_PHISH_CLEAN: |
|
100 |
+ case CL_PHISH_CLEANUP_OK: |
|
101 |
+ case CL_PHISH_WHITELISTED: |
|
102 |
+ case CL_PHISH_HOST_WHITELISTED: |
|
103 |
+ case CL_PHISH_HOST_OK: |
|
104 |
+ case CL_PHISH_DOMAIN_OK: |
|
105 |
+ case CL_PHISH_REDIR_OK: |
|
106 |
+ case CL_PHISH_HOST_REDIR_OK: |
|
107 |
+ case CL_PHISH_DOMAIN_REDIR_OK: |
|
108 |
+ case CL_PHISH_HOST_REVERSE_OK: |
|
109 |
+ case CL_PHISH_DOMAIN_REVERSE_OK: |
|
110 |
+ case CL_PHISH_MAILTO_OK: |
|
111 |
+ case CL_PHISH_TEXTURL: |
|
112 |
+ case CL_PHISH_HOST_NOT_LISTED: |
|
113 |
+ case CL_PHISH_CLEAN_CID: |
|
114 |
+ return 0; |
|
115 |
+ case CL_PHISH_HEX_URL: |
|
116 |
+ case CL_PHISH_CLOAKED_NULL: |
|
117 |
+ case CL_PHISH_SSL_SPOOF: |
|
118 |
+ case CL_PHISH_CLOAKED_UIU: |
|
119 |
+ case CL_PHISH_NUMERIC_IP: |
|
120 |
+ case CL_PHISH_NOMATCH: |
|
121 |
+ return 1; |
|
122 |
+ default: |
|
123 |
+ return 1; |
|
124 |
+ } |
|
125 |
+} |
|
126 |
+const char* phishing_ret_toString(enum phish_status rc); |
|
127 |
+#endif |
|
128 |
+ |
|
129 |
+#endif |
... | ... |
@@ -42,10 +42,8 @@ |
42 | 42 |
#include "defaults.h" |
43 | 43 |
|
44 | 44 |
#ifdef CL_EXPERIMENTAL |
45 |
-/* |
|
46 | 45 |
#include "phish_whitelist.h" |
47 | 46 |
#include "phish_domaincheck_db.h" |
48 |
-*/ |
|
49 | 47 |
#endif |
50 | 48 |
|
51 | 49 |
|
... | ... |
@@ -1094,7 +1092,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in |
1094 | 1094 |
#endif |
1095 | 1095 |
skipped = 1; |
1096 | 1096 |
#ifdef CL_EXPERIMENTAL |
1097 |
-/* |
|
1098 | 1097 |
} else if(cli_strbcasestr(filename, ".wdb")) { |
1099 | 1098 |
if(!(options & CL_SCAN_NOPHISHING)) |
1100 | 1099 |
ret = cli_loadwdb(fd, options); |
... | ... |
@@ -1105,7 +1102,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in |
1105 | 1105 |
ret = cli_loadpdb(fd, options); |
1106 | 1106 |
else |
1107 | 1107 |
skipped = 1; |
1108 |
-*/ |
|
1109 | 1108 |
#endif |
1110 | 1109 |
} else { |
1111 | 1110 |
cli_dbgmsg("cli_load: unknown extension - assuming old database format\n"); |
... | ... |
@@ -1172,10 +1168,8 @@ static int cli_loaddbdir(const char *dirname, struct cl_engine **engine, unsigne |
1172 | 1172 |
cli_strbcasestr(dent->d_name, ".zmd") || |
1173 | 1173 |
cli_strbcasestr(dent->d_name, ".rmd") || |
1174 | 1174 |
#ifdef CL_EXPERIMENTAL |
1175 |
-/* |
|
1176 | 1175 |
cli_strbcasestr(dent->d_name, ".pdb") || |
1177 | 1176 |
cli_strbcasestr(dent->d_name, ".wdb") || |
1178 |
-*/ |
|
1179 | 1177 |
#endif |
1180 | 1178 |
cli_strbcasestr(dent->d_name, ".hw") || |
1181 | 1179 |
cli_strbcasestr(dent->d_name, ".inc") || |
... | ... |
@@ -1294,10 +1288,8 @@ int cl_statinidir(const char *dirname, struct cl_stat *dbstat) |
1294 | 1294 |
cli_strbcasestr(dent->d_name, ".zmd") || |
1295 | 1295 |
cli_strbcasestr(dent->d_name, ".rmd") || |
1296 | 1296 |
#ifdef CL_EXPERIMENTAL |
1297 |
-/* |
|
1298 | 1297 |
cli_strbcasestr(dent->d_name, ".pdb") || |
1299 | 1298 |
cli_strbcasestr(dent->d_name, ".wdb") || |
1300 |
-*/ |
|
1301 | 1299 |
#endif |
1302 | 1300 |
cli_strbcasestr(dent->d_name, ".hw") || |
1303 | 1301 |
cli_strbcasestr(dent->d_name, ".inc") || |
... | ... |
@@ -1374,10 +1366,8 @@ int cl_statchkdir(const struct cl_stat *dbstat) |
1374 | 1374 |
cli_strbcasestr(dent->d_name, ".zmd") || |
1375 | 1375 |
cli_strbcasestr(dent->d_name, ".rmd") || |
1376 | 1376 |
#ifdef CL_EXPERIMENTAL |
1377 |
-/* |
|
1378 | 1377 |
cli_strbcasestr(dent->d_name, ".pdb") || |
1379 | 1378 |
cli_strbcasestr(dent->d_name, ".wdb") || |
1380 |
-*/ |
|
1381 | 1379 |
#endif |
1382 | 1380 |
cli_strbcasestr(dent->d_name, ".hw") || |
1383 | 1381 |
cli_strbcasestr(dent->d_name, ".inc") || |
1384 | 1382 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1521 @@ |
0 |
+/* |
|
1 |
+ * Match a string against a list of patterns/regexes. |
|
2 |
+ * |
|
3 | ||
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ * $Log: regex_list.c,v $ |
|
21 |
+ * Revision 1.1 2006/09/12 19:38:39 acab |
|
22 |
+ * Phishing module merge - libclamav |
|
23 |
+ * |
|
24 |
+ * Revision 1.13 2006/09/11 19:25:08 edwin |
|
25 |
+ * Non-printable characters in regex (although they are invalid inside an url, added some support for it). |
|
26 |
+ * |
|
27 |
+ * Revision 1.12 2006/08/28 08:43:06 edwin |
|
28 |
+ * Fixed a few minor leaks. |
|
29 |
+ * Valgrind now says:"All heap blocks were freed -- no leaks are possible" |
|
30 |
+ * |
|
31 |
+ * Revision 1.11 2006/08/20 21:18:11 edwin |
|
32 |
+ * Added the script used to generate iana_tld.sh |
|
33 |
+ * Added checks for phish_domaincheck_db |
|
34 |
+ * Added phishing module design document from wiki (as discussed with aCaB). |
|
35 |
+ * Updated .wdb/.pdb format documentation (in regex_list.c) |
|
36 |
+ * Fixed some memory leaks in regex_list.c |
|
37 |
+ * IOW: cleanups before the deadline. |
|
38 |
+ * I consider my module to be ready for evaluation now. |
|
39 |
+ * |
|
40 |
+ * Revision 1.10 2006/08/20 19:42:02 edwin |
|
41 |
+ * Fix custom character class, and generic regex handling. |
|
42 |
+ * |
|
43 |
+ * Revision 1.9 2006/08/19 21:08:47 edwin |
|
44 |
+ * Fixed:Forgot to add form tag handling when it contains images. |
|
45 |
+ * Various fixes to get rid of gcc warnings. |
|
46 |
+ * |
|
47 |
+ * Revision 1.8 2006/08/19 09:26:51 edwin |
|
48 |
+ * regex_list.c: Fixed regex alternatives handling (bug discovered with autotests). |
|
49 |
+ * And forgot to commit manager.c last time. |
|
50 |
+ * |
|
51 |
+ * Revision 1.7 2006/08/17 20:31:43 edwin |
|
52 |
+ * Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off. |
|
53 |
+ * Fix a still reachable leak. Remove unneeded build_regex_list export. |
|
54 |
+ * |
|
55 |
+ * Revision 1.6 2006/08/12 14:35:34 edwin |
|
56 |
+ * Fix some compiler warnings. |
|
57 |
+ * Fix an assertion failure in regex_list. |
|
58 |
+ * Interpret display links that start with http|https|ftp, always as an URL. |
|
59 |
+ * |
|
60 |
+ * Revision 1.5 2006/08/06 20:27:07 edwin |
|
61 |
+ * New option to enable phish scan for all domains (disabled by default). |
|
62 |
+ * You will now have to run clamscan --phish-scan-alldomains to have any phishes detected. |
|
63 |
+ * Updated phishcheck control flow to better incorporate the domainlist. |
|
64 |
+ * Updated manpage with new options. |
|
65 |
+ * |
|
66 |
+ * TODO:there is a still-reachable leak in regex_list.c |
|
67 |
+ * |
|
68 |
+ * Revision 1.4 2006/08/01 20:19:15 edwin |
|
69 |
+ * Integrate domainlist check into phishcheck. Warning: enabled by default. |
|
70 |
+ * Regex bracket handling update. |
|
71 |
+ * Better regex paranthesized & alternate expression handling. |
|
72 |
+ * |
|
73 |
+ * Revision 1.3 2006/07/31 20:12:30 edwin |
|
74 |
+ * Preliminary support for domain databases (domains to check by phishmodule) |
|
75 |
+ * Better memory allocation failure handling in regex_list |
|
76 |
+ * |
|
77 |
+ */ |
|
78 |
+ |
|
79 |
+#if HAVE_CONFIG_H |
|
80 |
+#include "clamav-config.h" |
|
81 |
+#endif |
|
82 |
+ |
|
83 |
+#ifdef CL_EXPERIMENTAL |
|
84 |
+ |
|
85 |
+#ifndef CL_DEBUG |
|
86 |
+#define NDEBUG |
|
87 |
+#endif |
|
88 |
+ |
|
89 |
+#ifdef CL_THREAD_SAFE |
|
90 |
+#ifndef _REENTRANT |
|
91 |
+#define _REENTRANT |
|
92 |
+#endif |
|
93 |
+#endif |
|
94 |
+ |
|
95 |
+#include <stdio.h> |
|
96 |
+#include <stdlib.h> |
|
97 |
+#include <errno.h> |
|
98 |
+#include <assert.h> |
|
99 |
+#include <string.h> |
|
100 |
+#include <strings.h> |
|
101 |
+#include <ctype.h> |
|
102 |
+ |
|
103 |
+#include <limits.h> |
|
104 |
+#include <sys/types.h> |
|
105 |
+ |
|
106 |
+/*#define USE_PCRE*/ |
|
107 |
+#include <regex.h> |
|
108 |
+ |
|
109 |
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2) |
|
110 |
+#include <stddef.h> |
|
111 |
+#endif |
|
112 |
+ |
|
113 |
+#include "clamav.h" |
|
114 |
+#include "others.h" |
|
115 |
+#include "defaults.h" |
|
116 |
+#include "str.h" |
|
117 |
+#include "filetypes.h" |
|
118 |
+#include "mbox.h" |
|
119 |
+#include "regex_list.h" |
|
120 |
+#include "matcher-ac.h" |
|
121 |
+ |
|
122 |
+ |
|
123 |
/* Pattern tree */

/* Kind of a tree node; selects how the rest of struct tree_node is read. */
enum token_op_t {
	OP_CHAR,
	OP_STDCLASS,
	OP_CUSTOMCLASS,
	OP_DOT,
	OP_LEAF,
	OP_ROOT,
	OP_PARCLOSE
};

/* Pointer to a per-character bitmap (one bit per possible byte value). */
typedef char* char_bitmap_p;

/*
 * Meaning of the node kinds:
 *
 * OP_CHAR:        exactly one character, stored in c
 * more complex kinds:
 * OP_STDCLASS:    standard character class; c holds the class as
 *                 1<<(index of the class name in std_class)
 * OP_CUSTOMCLASS: custom character class; the first pointer of the ptr array
 *                 points to the bitmap table of this class
 * OP_DOT:         a single '.', matching any character except \n
 * OP_LEAF:        leaf node, the structure has to be reinterpreted
 */
struct tree_node {
	enum token_op_t op;
	unsigned char c;
	/* number of (non-regex) children of this node, i.e. sizeof(children) */
	char alternatives;
	/* set when there are no more siblings: next then points to the parent */
	char listend;
	/* next regex/complex sibling, or the parent when there are no more
	 * siblings; can't be NULL except for the root node */
	struct tree_node* next;
	union {
		/* `alternatives` children, followed by a NULL-terminated list of
		 * regex leaf node pointers */
		struct tree_node** children;
		char_bitmap_p* bitmap;
		struct leaf_info* leaf;
	} u;
};

struct leaf_info {
	/* what it means that this leaf was reached */
	char* info;
	/* NULL when this is a leaf node for a non-regex pattern */
	regex_t* preg;
};
|
152 |
+ |
|
153 |
+/* Character classes */ |
|
154 |
+enum wctype_t {ALNUM,DIGIT,PUNCT,ALPHA,GRAPH,SPACE,BLANK,LOWER,UPPER,CNTRL,PRINT,XDIGIT}; |
|
155 |
+static struct std_classmap { |
|
156 |
+ const char* classname; |
|
157 |
+ const enum wctype_t type; |
|
158 |
+} std_class[] = { |
|
159 |
+ {"[:alnum:]",ALNUM}, |
|
160 |
+ {"[:digit:]",DIGIT}, |
|
161 |
+ {"[:punct:]",PUNCT}, |
|
162 |
+ {"[:alpha:]",ALPHA}, |
|
163 |
+ {"[:graph:]",GRAPH}, |
|
164 |
+ {"[:space:]",SPACE}, |
|
165 |
+ {"[:blank:]",BLANK}, |
|
166 |
+ {"[:lower:]",LOWER}, |
|
167 |
+ {"[:upper:]",UPPER}, |
|
168 |
+ {"[:cntrl:]",CNTRL}, |
|
169 |
+ {"[:print:]",PRINT}, |
|
170 |
+ {"[:xdigit:]",XDIGIT} |
|
171 |
+}; |
|
172 |
+ |
|
173 |
+static const size_t std_class_cnt = sizeof(std_class)/sizeof(std_class[0]); |
|
174 |
+#define STD_CLASS_CNT sizeof(std_class)/sizeof(std_class[0]) |
|
175 |
+typedef char char_bitmap_t[32]; |
|
176 |
+static char_bitmap_p char_class_bitmap[STD_CLASS_CNT]; |
|
177 |
+static unsigned short int char_class[256]; |
|
178 |
+ |
|
179 |
+/* Prototypes */ |
|
180 |
+static void setup_matcher_engine(void); |
|
181 |
+static void matcher_engine_done(void); |
|
182 |
+static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info); |
|
183 |
+static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info); |
|
184 |
+static void destroy_tree(struct regex_matcher* matcher); |
|
185 |
+ |
|
186 |
+ |
|
187 |
+#define MATCH_SUCCESS 0 |
|
188 |
+#define MATCH_FAILED -1 |
|
189 |
+ |
|
190 |
+ |
|
191 |
+/* |
|
192 |
+ * Call this function when an unrecoverable error has occured, (instead of exit). |
|
193 |
+ */ |
|
194 |
+static void fatal_error(struct regex_matcher* matcher) |
|
195 |
+{ |
|
196 |
+ regex_list_done(matcher); |
|
197 |
+ matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/ |
|
198 |
+} |
|
199 |
+ |
|
200 |
+ |
|
201 |
+/* |
|
202 |
+ * @matcher - matcher structure to use |
|
203 |
+ * @real_url - href target |
|
204 |
+ * @display_url - <a> tag contents |
|
205 |
+ * @hostOnly - if you want to match only the host part |
|
206 |
+ * |
|
207 |
+ * @return - CL_SUCCESS - url doesn't match |
|
208 |
+ * - CL_VIRUS - url matches list |
|
209 |
+ * |
|
210 |
+ * Do not send NULL pointers to this function!! |
|
211 |
+ * |
|
212 |
+ */ |
|
213 |
+int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info) |
|
214 |
+{ |
|
215 |
+ assert(matcher); |
|
216 |
+ assert(real_url); |
|
217 |
+ assert(display_url); |
|
218 |
+ assert(info); |
|
219 |
+ if(!matcher->list_inited) |
|
220 |
+ return 0; |
|
221 |
+ assert(matcher->list_built); |
|
222 |
+ { |
|
223 |
+ size_t real_len = strlen(real_url); |
|
224 |
+ size_t display_len = strlen(display_url); |
|
225 |
+ size_t buffer_len = real_len + display_len + 1; |
|
226 |
+ char* buffer = cli_malloc(buffer_len+1); |
|
227 |
+ int partcnt,rc; |
|
228 |
+ unsigned long int partoff; |
|
229 |
+ |
|
230 |
+ if(!buffer) |
|
231 |
+ return CL_EMEM; |
|
232 |
+ |
|
233 |
+ strncpy(buffer,real_url,real_len); |
|
234 |
+ buffer[real_len]=' '; |
|
235 |
+ strncpy(buffer+real_len+1,display_url,display_len); |
|
236 |
+ buffer[buffer_len]=0; |
|
237 |
+ cli_dbgmsg("Looking up in regex_list: %s\n"); |
|
238 |
+ |
|
239 |
+ rc = cli_ac_scanbuff(buffer,buffer_len,info,hostOnly ? matcher->root_hosts : matcher->root_urls,&partcnt,0,0,&partoff,0,-1,NULL); |
|
240 |
+ if(!rc && !hostOnly) |
|
241 |
+ rc = match_node(matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS; |
|
242 |
+ free(buffer); |
|
243 |
+ if(!rc) |
|
244 |
+ cli_dbgmsg("not in regex list\n"); |
|
245 |
+ return rc; |
|
246 |
+ } |
|
247 |
+} |
|
248 |
+ |
|
249 |
+static struct tree_node* tree_root_alloc(void); |
|
250 |
+ |
|
251 |
+ |
|
252 |
+/* node stack */ |
|
253 |
+#define NODE_STACK_INITIAL 1024 |
|
254 |
+#define NODE_STACK_GROW 4096 |
|
255 |
+/* Initialize @stack */ |
|
256 |
+static int stack_init(struct node_stack* stack) |
|
257 |
+{ |
|
258 |
+ assert(stack); |
|
259 |
+ |
|
260 |
+ stack->cnt = 0; |
|
261 |
+ stack->capacity = NODE_STACK_INITIAL; |
|
262 |
+ stack->data = cli_malloc(stack->capacity * sizeof(*stack->data)); |
|
263 |
+ if(!stack->data) |
|
264 |
+ return CL_EMEM; |
|
265 |
+ else |
|
266 |
+ return CL_SUCCESS; |
|
267 |
+} |
|
268 |
+ |
|
269 |
+/* Reset @stack pointer, but don't realloc */ |
|
270 |
+static void stack_reset(struct node_stack* stack) |
|
271 |
+{ |
|
272 |
+ assert(stack); |
|
273 |
+ |
|
274 |
+ stack->cnt = 0; |
|
275 |
+} |
|
276 |
+ |
|
277 |
+/* Push @node on @stack, growing it if necessarry */ |
|
278 |
+static inline int stack_push(struct node_stack* stack,struct tree_node* node) |
|
279 |
+{ |
|
280 |
+ assert(stack); |
|
281 |
+ assert(stack->data); |
|
282 |
+ |
|
283 |
+ if(stack->cnt == stack->capacity) { |
|
284 |
+ stack->capacity += NODE_STACK_GROW; |
|
285 |
+ stack->data = cli_realloc(stack->data,stack->capacity*sizeof(*stack->data)); |
|
286 |
+ if(!stack->data) |
|
287 |
+ return CL_EMEM; |
|
288 |
+ } |
|
289 |
+ stack->data[stack->cnt++] = node; |
|
290 |
+ return CL_SUCCESS; |
|
291 |
+} |
|
292 |
+ |
|
293 |
+/* Pops node from @stack, doesn't realloc */ |
|
294 |
+static inline struct tree_node* stack_pop(struct node_stack* stack) |
|
295 |
+{ |
|
296 |
+ assert(stack); |
|
297 |
+ assert(stack->data); |
|
298 |
+ assert(stack->cnt);/*don't pop from empty stack */ |
|
299 |
+ |
|
300 |
+ return stack->cnt ? stack->data[--stack->cnt] : NULL; |
|
301 |
+} |
|
302 |
+ |
|
303 |
+/* Initialization & loading */ |
|
304 |
+ |
|
305 |
+/* Initializes @matcher, allocating necesarry substructures */ |
|
306 |
+int init_regex_list(struct regex_matcher* matcher) |
|
307 |
+{ |
|
308 |
+ assert(matcher); |
|
309 |
+ |
|
310 |
+ setup_matcher_engine(); |
|
311 |
+ |
|
312 |
+ matcher->list_inited = 0; |
|
313 |
+ matcher->root_hosts = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_hosts)); |
|
314 |
+ if(!matcher->root_hosts) |
|
315 |
+ return CL_EMEM; |
|
316 |
+ |
|
317 |
+ matcher->root_hosts->ac_root = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node)); |
|
318 |
+ if(!matcher->root_hosts->ac_root) { |
|
319 |
+ free(matcher->root_hosts); |
|
320 |
+ return CL_EMEM; |
|
321 |
+ } |
|
322 |
+ |
|
323 |
+ matcher->root_urls = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_hosts)); |
|
324 |
+ if(!matcher->root_urls) { |
|
325 |
+ free(matcher->root_hosts->ac_root); |
|
326 |
+ free(matcher->root_hosts); |
|
327 |
+ return CL_EMEM; |
|
328 |
+ } |
|
329 |
+ |
|
330 |
+ matcher->root_urls->ac_root = (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node)); |
|
331 |
+ if(!matcher->root_urls->ac_root) { |
|
332 |
+ free(matcher->root_hosts->ac_root); |
|
333 |
+ free(matcher->root_hosts); |
|
334 |
+ free(matcher->root_urls); |
|
335 |
+ return CL_EMEM; |
|
336 |
+ } |
|
337 |
+ |
|
338 |
+ matcher->root_regex = tree_root_alloc(); |
|
339 |
+ if(!matcher->root_regex) { |
|
340 |
+ free(matcher->root_hosts->ac_root); |
|
341 |
+ free(matcher->root_hosts); |
|
342 |
+ free(matcher->root_urls->ac_root); |
|
343 |
+ free(matcher->root_urls); |
|
344 |
+ return CL_EMEM; |
|
345 |
+ } |
|
346 |
+ |
|
347 |
+ stack_init(&matcher->node_stack); |
|
348 |
+ stack_init(&matcher->node_stack_alt); |
|
349 |
+ |
|
350 |
+ matcher->list_inited=1; |
|
351 |
+ matcher->list_built=0; |
|
352 |
+ matcher->list_loaded=0; |
|
353 |
+ |
|
354 |
+ return CL_SUCCESS; |
|
355 |
+} |
|
356 |
+ |
|
357 |
+/* inserts @pattern into @root, using ac-matcher |
|
358 |
+ * although the name might be confusing, @pattern is not a regex!*/ |
|
359 |
+static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info) |
|
360 |
+{ |
|
361 |
+ int ret; |
|
362 |
+ struct cli_ac_patt *new = cli_calloc(1,sizeof(*new)); |
|
363 |
+ size_t len; |
|
364 |
+ |
|
365 |
+ if(!new) |
|
366 |
+ return CL_EMEM; |
|
367 |
+ assert(root); |
|
368 |
+ assert(pattern); |
|
369 |
+ |
|
370 |
+ len = strlen(pattern); |
|
371 |
+ new->type = 0; |
|
372 |
+ new->sigid = 0; |
|
373 |
+ new->parts = 0; |
|
374 |
+ new->partno = 0; |
|
375 |
+ new->mindist = 0; |
|
376 |
+ new->maxdist = 0; |
|
377 |
+ new->offset = 0; |
|
378 |
+ new->target = 0; |
|
379 |
+ new->length = len; |
|
380 |
+ if(new->length > root->maxpatlen) |
|
381 |
+ root->maxpatlen = new->length; |
|
382 |
+ |
|
383 |
+ new->pattern = cli_malloc(sizeof(new->pattern[0])*len); |
|
384 |
+ if(!new->pattern) { |
|
385 |
+ free(new); |
|
386 |
+ return CL_EMEM; |
|
387 |
+ } |
|
388 |
+ strncpy((char*)new->pattern,(const char*)pattern,len); |
|
389 |
+ |
|
390 |
+ new->virname = info; |
|
391 |
+ if((ret = cli_ac_addpatt(root,new))) { |
|
392 |
+ free(new->virname); |
|
393 |
+ free(new->pattern); |
|
394 |
+ free(new); |
|
395 |
+ return ret; |
|
396 |
+ } |
|
397 |
+ return CL_SUCCESS; |
|
398 |
+} |
|
399 |
+ |
|
400 |
+ |
|
401 |
+#ifndef NDEBUG |
|
402 |
+void dump_tree(struct tree_node* root); |
|
403 |
+#endif |
|
404 |
+static int matcher_engine_refcount=0; |
|
405 |
+ |
|
406 |
+static int build_regex_list(struct regex_matcher* matcher); |
|
407 |
+/* Load patterns/regexes from file */ |
|
408 |
+int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options) |
|
409 |
+{ |
|
410 |
+ int rc,line=0; |
|
411 |
+ char buffer[FILEBUFF]; |
|
412 |
+ |
|
413 |
+ assert(matcher); |
|
414 |
+ assert(fd); |
|
415 |
+ |
|
416 |
+ if(matcher->list_inited==-1) |
|
417 |
+ return -1; |
|
418 |
+ if(matcher->list_loaded) { |
|
419 |
+ cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n"); |
|
420 |
+ return -1;/*TODO: better return code*/ |
|
421 |
+ } |
|
422 |
+ if(!fd) { |
|
423 |
+ cli_errmsg("Unable to load regex list (null file)\n"); |
|
424 |
+ return -1;/*TODO: return appropiate return code*/ |
|
425 |
+ } |
|
426 |
+ |
|
427 |
+ cli_dbgmsg("Loading regex_list\n"); |
|
428 |
+ if(!matcher->list_inited) { |
|
429 |
+ init_regex_list(matcher); |
|
430 |
+ if (!matcher->list_inited) { |
|
431 |
+ cli_errmsg("Regex list failed to initialize!\n"); |
|
432 |
+ fatal_error(matcher); |
|
433 |
+ return -1; |
|
434 |
+ } |
|
435 |
+ /*atexit(regex_list_done); TODO: destroy this in manager.c */ |
|
436 |
+ } |
|
437 |
+ /* |
|
438 |
+ * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files: |
|
439 |
+ * Multiple lines of form, (empty lines are skipped): |
|
440 |
+ * Flags RealURL DisplayedURL |
|
441 |
+ * Where: |
|
442 |
+ * Flags: R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing |
|
443 |
+ * flags that should be filtered. |
|
444 |
+ * [i.e. phishcheck urls.flags that we don't want to be done for this particular host] |
|
445 |
+ * Note:Flag filtering only makes sense in .pdb files. |
|
446 |
+ * |
|
447 |
+ * If a line in the file doesn't conform to this format, loading fails |
|
448 |
+ * |
|
449 |
+ */ |
|
450 |
+ while(fgets(buffer,FILEBUFF,fd)) { |
|
451 |
+ char* pattern; |
|
452 |
+ char* flags; |
|
453 |
+ line++; |
|
454 |
+ cli_chomp(buffer); |
|
455 |
+ if(!*buffer) |
|
456 |
+ continue;/* skip empty lines */ |
|
457 |
+ pattern = strchr(buffer,' '); |
|
458 |
+ if(!pattern) { |
|
459 |
+ cli_errmsg("Malformed regex list line %d\n",line); |
|
460 |
+ fatal_error(matcher); |
|
461 |
+ return CL_EMALFDB; |
|
462 |
+ } |
|
463 |
+ pattern[0]='\0'; |
|
464 |
+ flags=buffer+1; |
|
465 |
+ pattern++; |
|
466 |
+ if(buffer[0] == 'R') { |
|
467 |
+ if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags) )) |
|
468 |
+ return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; |
|
469 |
+ } |
|
470 |
+ else if(buffer[0] == 'H') { |
|
471 |
+ if(( rc = add_regex_list_element(matcher->root_hosts,pattern,flags) )) |
|
472 |
+ return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; |
|
473 |
+ } |
|
474 |
+ else { |
|
475 |
+ if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) )) |
|
476 |
+ return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB; |
|
477 |
+ } |
|
478 |
+ } |
|
479 |
+ matcher->list_loaded = 1; |
|
480 |
+ build_regex_list(matcher); |
|
481 |
+ |
|
482 |
+#ifndef NDEBUG |
|
483 |
+/* dump_tree(matcher->root_regex);*/ |
|
484 |
+#endif |
|
485 |
+ if(!matcher->list_built) { |
|
486 |
+ cli_errmsg("Regex list not loaded: build failed!\n"); |
|
487 |
+ fatal_error(matcher); |
|
488 |
+ return CL_EMALFDB; |
|
489 |
+ } |
|
490 |
+ regex_list_cleanup(matcher); |
|
491 |
+ matcher_engine_refcount++; |
|
492 |
+ return CL_SUCCESS; |
|
493 |
+} |
|
494 |
+ |
|
495 |
+/* |
|
496 |
+static void tree_node_merge_nonbin(struct tree_node* into,const struct tree_node* node) |
|
497 |
+{ |
|
498 |
+ assert(into); |
|
499 |
+ assert(node); |
|
500 |
+ |
|
501 |
+ if(node->alternatives){ |
|
502 |
+ if(node->u.children[0]->next == node) { |
|
503 |
+ *no non-bin alternatives here* |
|
504 |
+ } |
|
505 |
+ else { |
|
506 |
+ struct tree_node* p; |
|
507 |
+ for(p = node->u.children[0]->next; p->next != node; p = p->next) |
|
508 |
+ tree_node_insert_nonbin(into,p); |
|
509 |
+ } |
|
510 |
+ } |
|
511 |
+ else |
|
512 |
+ tree_node_insert_nonbin(into,node->u.children[0]); |
|
513 |
+} |
|
514 |
+* |
|
515 |
+static void tree_node_merge_bin(struct tree_node* into,const struct tree_node* node) |
|
516 |
+{ |
|
517 |
+ if(node->u.children && node->alternatives) { |
|
518 |
+ if(!into->alternatives) { |
|
519 |
+ * into has no bin part, just copy+link the node there* |
|
520 |
+ int i; |
|
521 |
+ struct tree_node* next = into->u.children[0]; |
|
522 |
+ into->u.children = node->u.children; |
|
523 |
+ into->alternatives = node->alternatives; |
|
524 |
+ for(i=0;i < into->alternatives;i++) { |
|
525 |
+ if(into->u.children[i]->next == node) { |
|
526 |
+ into->u.children[i]->next = next; |
|
527 |
+ into->u.children[i]->listend = 0; |
|
528 |
+ } |
|
529 |
+ else { |
|
530 |
+ struct tree_node* p; |
|
531 |
+ for(p = into->u.children[0]->next; p->next != node; p = p->next); |
|
532 |
+ p->listend = 0; |
|
533 |
+ p->next = next; |
|
534 |
+ } |
|
535 |
+ } |
|
536 |
+ } |
|
537 |
+ const size_t new_size = tree_node_get_array_size(into) + tree_node_get_array_size(node); |
|
538 |
+ struct tree_node** new_children = cli_malloc(sizeof( |
|
539 |
+ } |
|
540 |
+ * else: no bin part to merge * |
|
541 |
+} |
|
542 |
+*/ |
|
543 |
+ |
|
544 |
+static struct tree_node ** tree_node_get_children(const struct tree_node* node) |
|
545 |
+{ |
|
546 |
+ return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children; |
|
547 |
+} |
|
548 |
+/* don't do this, it wastes too much memory, and has no benefit |
|
549 |
+static void regex_list_dobuild(struct tree_node* called_from,struct tree_node* node) |
|
550 |
+{ |
|
551 |
+ struct tree_node **children; |
|
552 |
+ assert(node); |
|
553 |
+ |
|
554 |
+ children = tree_node_get_children(node); |
|
555 |
+ if(node->op!=OP_ROOT) |
|
556 |
+ assert(called_from); |
|
557 |
+ if(node->op==OP_TMP_PARCLOSE) { |
|
558 |
+ const size_t array_size = (node->alternatives +(called_from->op==OP_CUSTOMCLASS ? 1:0))*sizeof(*called_from->u.children); |
|
559 |
+ if(node->c) |
|
560 |
+ return;* already processed this common node* |
|
561 |
+ else |
|
562 |
+ node->c = 1; |
|
563 |
+ * copy children to called_from from this node |
|
564 |
+ * called_from should have 0 alternatives, and a link to this node via ->u.children[0] |
|
565 |
+ * * |
|
566 |
+ assert(called_from->alternatives == 0); |
|
567 |
+ assert(called_from->u.children); |
|
568 |
+ assert(called_from->u.children[0] == node); |
|
569 |
+ called_from->u.children = cli_realloc(called_from->u.children,array_size); |
|
570 |
+ called_from->u.children = node->u.children; |
|
571 |
+ called_from->alternatives = node->alternatives; |
|
572 |
+ if(called_from->alternatives) { |
|
573 |
+ * fix parent pointers * |
|
574 |
+ int i;TODO: do a deep copy of children here |
|
575 |
+ struct tree_node **from_children = tree_node_get_children(called_from); |
|
576 |
+ assert(from_children); |
|
577 |
+ for(i=0;i < called_from->alternatives;i++) { |
|
578 |
+ struct tree_node* p; |
|
579 |
+ for(p=from_children[i];p->next != node; p = p->next); |
|
580 |
+ p->next = called_from; |
|
581 |
+ } |
|
582 |
+ } |
|
583 |
+ } |
|
584 |
+ |
|
585 |
+ if(node->op==OP_LEAF) |
|
586 |
+ return; |
|
587 |
+ else if (node->alternatives) { |
|
588 |
+ int i; |
|
589 |
+ struct tree_node* p; |
|
590 |
+ assert(children); |
|
591 |
+ p = children[0]->op==OP_LEAF ? NULL : children[0]->next; |
|
592 |
+ for(i=0;i<node->alternatives;i++) |
|
593 |
+ regex_list_dobuild(node,children[i]); |
|
594 |
+ if(p && p!=node) |
|
595 |
+ regex_list_dobuild(node,p); |
|
596 |
+ } else { |
|
597 |
+ if(children) |
|
598 |
+ if (children[0]) |
|
599 |
+ regex_list_dobuild(node,children[0]); |
|
600 |
+ } |
|
601 |
+ if(node->next && !node->listend) |
|
602 |
+ regex_list_dobuild(node,node->next); |
|
603 |
+ if(node->op==OP_TMP_PARCLOSE) |
|
604 |
+ node->c=0; |
|
605 |
+ *free(node);* |
|
606 |
+} |
|
607 |
+*/ |
|
608 |
+/* Build the matcher list */ |
|
609 |
+static int build_regex_list(struct regex_matcher* matcher) |
|
610 |
+{ |
|
611 |
+ if(!matcher->list_inited || !matcher->list_loaded) { |
|
612 |
+ cli_errmsg("Regex list not loaded!\n"); |
|
613 |
+ return -1;/*TODO: better error code */ |
|
614 |
+ } |
|
615 |
+ cli_dbgmsg("Building regex list\n"); |
|
616 |
+ cli_ac_buildtrie(matcher->root_hosts); |
|
617 |
+ cli_ac_buildtrie(matcher->root_urls); |
|
618 |
+ matcher->list_built=1; |
|
619 |
+ |
|
620 |
+ return CL_SUCCESS; |
|
621 |
+} |
|
622 |
+ |
|
623 |
+ |
|
624 |
+static void stack_destroy(struct node_stack* stack); |
|
625 |
+/* Done with this matcher, free resources */ |
|
626 |
+void regex_list_done(struct regex_matcher* matcher) |
|
627 |
+{ |
|
628 |
+ assert(matcher); |
|
629 |
+ |
|
630 |
+ regex_list_cleanup(matcher); |
|
631 |
+ if(matcher->list_loaded) { |
|
632 |
+ cli_ac_free(matcher->root_hosts); |
|
633 |
+ free(matcher->root_hosts); |
|
634 |
+ matcher->root_hosts=NULL; |
|
635 |
+ |
|
636 |
+ cli_ac_free(matcher->root_urls); |
|
637 |
+ free(matcher->root_urls); |
|
638 |
+ matcher->root_urls=NULL; |
|
639 |
+ |
|
640 |
+ matcher->list_built=0; |
|
641 |
+ destroy_tree(matcher); |
|
642 |
+ matcher->list_loaded=0; |
|
643 |
+ } |
|
644 |
+ if(matcher->list_inited) { |
|
645 |
+ matcher_engine_done(); |
|
646 |
+ matcher->list_inited=0; |
|
647 |
+ } |
|
648 |
+ stack_destroy(&matcher->node_stack); |
|
649 |
+ stack_destroy(&matcher->node_stack_alt); |
|
650 |
+} |
|
651 |
+ |
|
652 |
+/* Tree matcher algorithm */ |
|
653 |
+ |
|
654 |
+static int cli_iswctype(const char c,const enum wctype_t type) |
|
655 |
+{ |
|
656 |
+ switch(type) { |
|
657 |
+ case ALNUM: |
|
658 |
+ return isalnum(c); |
|
659 |
+ case DIGIT: |
|
660 |
+ return isdigit(c); |
|
661 |
+ case PUNCT: |
|
662 |
+ return ispunct(c); |
|
663 |
+ case ALPHA: |
|
664 |
+ return isalpha(c); |
|
665 |
+ case GRAPH: |
|
666 |
+ return isgraph(c); |
|
667 |
+ case SPACE: |
|
668 |
+ return isspace(c); |
|
669 |
+ case BLANK: |
|
670 |
+ return c=='\t' || c==' '; |
|
671 |
+ case LOWER: |
|
672 |
+ return islower(c); |
|
673 |
+ case UPPER: |
|
674 |
+ return isupper(c); |
|
675 |
+ case CNTRL: |
|
676 |
+ return iscntrl(c); |
|
677 |
+ case PRINT: |
|
678 |
+ return isprint(c); |
|
679 |
+ case XDIGIT: |
|
680 |
+ return isxdigit(c); |
|
681 |
+ default: { |
|
682 |
+ cli_warnmsg("Unknown char class in iswctype\n"); |
|
683 |
+ return 0; |
|
684 |
+ } |
|
685 |
+ } |
|
686 |
+} |
|
687 |
+ |
|
688 |
+static int engine_inited=0; |
|
689 |
+ |
|
690 |
+static void setup_matcher_engine(void) |
|
691 |
+{ |
|
692 |
+ /*Set up std character classes*/ |
|
693 |
+ size_t i; |
|
694 |
+ size_t j; |
|
695 |
+ if(engine_inited) |
|
696 |
+ return; |
|
697 |
+ memset(char_class,0,256); |
|
698 |
+ for(i=0;i<std_class_cnt;i++) { |
|
699 |
+ enum wctype_t type = std_class[i].type; |
|
700 |
+ char_class_bitmap[i]=cli_calloc(256>>3,1); |
|
701 |
+ for(j=0;j<256;j++) |
|
702 |
+ if(cli_iswctype(j,type)) { |
|
703 |
+ char_class[j] |= 1<<i; |
|
704 |
+ char_class_bitmap[i][j>>3] |= 1<<(j&0x07); |
|
705 |
+ } |
|
706 |
+ } |
|
707 |
+ engine_inited=1; |
|
708 |
+} |
|
709 |
+ |
|
710 |
+static void matcher_engine_done(void) |
|
711 |
+{ |
|
712 |
+ size_t i; |
|
713 |
+ matcher_engine_refcount--; |
|
714 |
+ if(!matcher_engine_refcount) { |
|
715 |
+ for(i=0;i<std_class_cnt;i++) |
|
716 |
+ free(char_class_bitmap[i]); |
|
717 |
+ } |
|
718 |
+ engine_inited=0; |
|
719 |
+} |
|
720 |
+ |
|
721 |
+struct token_t |
|
722 |
+{ |
|
723 |
+ size_t len; |
|
724 |
+ char type; |
|
725 |
+ union { |
|
726 |
+ const unsigned char* start; |
|
727 |
+ char_bitmap_p bitmap; |
|
728 |
+ } u; |
|
729 |
+}; |
|
730 |
+ |
|
731 |
+enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE}; |
|
732 |
+ |
|
733 |
+static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token) |
|
734 |
+{ |
|
735 |
+ assert(pat); |
|
736 |
+ assert(token); |
|
737 |
+ |
|
738 |
+ switch(*pat) { |
|
739 |
+ case '\\': |
|
740 |
+ token->type=TOKEN_CHAR; |
|
741 |
+ token->u.start = ++pat; |
|
742 |
+ if(islower(token->u.start)) { |
|
743 |
+ /* handle \n, \t, etc. */ |
|
744 |
+ char c; |
|
745 |
+ if(snprintf(&c,1,"\%c",token->u.start)!=1) |
|
746 |
+ token->type=TOKEN_REGEX; |
|
747 |
+ token->u.start=c; |
|
748 |
+ } |
|
749 |
+ token->len = 1; |
|
750 |
+ break; |
|
751 |
+ case '|': |
|
752 |
+ token->type=TOKEN_ALT; |
|
753 |
+ break; |
|
754 |
+ case '*': |
|
755 |
+ case '+': |
|
756 |
+ case '?': |
|
757 |
+ case '{': |
|
758 |
+ case '}': |
|
759 |
+ token->type=TOKEN_REGEX; |
|
760 |
+/* assert(0 && "find_regex_start should have forbidden us from finding regex special chars");*/ |
|
761 |
+ break; |
|
762 |
+ case '[': |
|
763 |
+ { |
|
764 |
+ /*TODO: implement*/ |
|
765 |
+ /*see if it is something simple like a list of characters, a range, or negated ...*/ |
|
766 |
+ const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/ |
|
767 |
+ unsigned char range_start=0; |
|
768 |
+ int hasprev = 0; |
|
769 |
+ char_bitmap_p bitmap = cli_malloc(32); |
|
770 |
+ if(!bitmap) |
|
771 |
+ return NULL; |
|
772 |
+ if (*pat=='^') { |
|
773 |
+ memset(bitmap,0xFF,32);/*match chars not in brackets*/ |
|
774 |
+ pat++; |
|
775 |
+ } |
|
776 |
+ else |
|
777 |
+ memset(bitmap,0x00,32); |
|
778 |
+ do { |
|
779 |
+ /* literal ] can be first character, so test for it at the end of the loop, for example: []] */ |
|
780 |
+ if (*pat=='-' && hasprev) { |
|
781 |
+ /* it is a range*/ |
|
782 |
+ unsigned char range_end; |
|
783 |
+ unsigned int c; |
|
784 |
+ assert(range_start); |
|
785 |
+ pat++; |
|
786 |
+ if (pat[0]=='[') |
|
787 |
+ if (pat[1]=='.') { |
|
788 |
+ if(pat[2]=='-' && pat[3]=='.' && pat[4]==']') |
|
789 |
+ range_end = '-'; |
|
790 |
+ else { |
|
791 |
+ /* this is getting complicated, bail out */ |
|
792 |
+ cli_warnmsg("confused about collating sequences in regex,bailing out"); |
|
793 |
+ pat=old; |
|
794 |
+ token->type=TOKEN_REGEX; |
|
795 |
+ break; |
|
796 |
+ } |
|
797 |
+ } |
|
798 |
+ else |
|
799 |
+ range_end = *pat; |
|
800 |
+ else |
|
801 |
+ range_end = *pat; |
|
802 |
+ for(c=range_start+1;c<=range_end;c++) |
|
803 |
+ bitmap[c>>3] ^= 1<<(c&0x7); |
|
804 |
+ hasprev = 0; |
|
805 |
+ } |
|
806 |
+ else if (pat[0]=='[' && pat[1]==':') { |
|
807 |
+ const unsigned char* end; |
|
808 |
+ int len,found=-1; |
|
809 |
+ size_t i; |
|
810 |
+ |
|
811 |
+ pat+=2; |
|
812 |
+ end=(unsigned char*)strstr((const char*)pat,":]"); |
|
813 |
+ if(!end) { |
|
814 |
+ cli_warnmsg("confused about std char class syntax regex,bailing out"); |
|
815 |
+ pat=old; |
|
816 |
+ token->type=TOKEN_REGEX; |
|
817 |
+ break; |
|
818 |
+ } |
|
819 |
+ |
|
820 |
+ len = end-pat; |
|
821 |
+ for(i=0;i<std_class_cnt;i++) |
|
822 |
+ if(!strncmp((const char*)pat,std_class[i].classname,len)) { |
|
823 |
+ found=i; |
|
824 |
+ break; |
|
825 |
+ } |
|
826 |
+ if(found!=-1) { |
|
827 |
+ for(i=0;i<256;i++) |
|
828 |
+ if(char_class[i]&(1<<found)) |
|
829 |
+ bitmap[i>>3] ^= 1<<(i&0x7); |
|
830 |
+ } |
|
831 |
+ else { |
|
832 |
+ /*unknown class*/ |
|
833 |
+ cli_warnmsg("confused about regex bracket expression, bailing out"); |
|
834 |
+ pat=old; |
|
835 |
+ token->type=TOKEN_REGEX; |
|
836 |
+ break; |
|
837 |
+ } |
|
838 |
+ } |
|
839 |
+ else { |
|
840 |
+ bitmap[*pat>>3] ^= 1<<(*pat&0x7); |
|
841 |
+ pat++; |
|
842 |
+ range_start = *pat; |
|
843 |
+ hasprev = 1; |
|
844 |
+ } |
|
845 |
+ } while(*pat!=']'); |
|
846 |
+ /*TODO: see if this bitmap already exists, then reuse*/ |
|
847 |
+ token->type = TOKEN_BRACKET; |
|
848 |
+ token->u.bitmap = bitmap; |
|
849 |
+ break; |
|
850 |
+ } |
|
851 |
+ case ']': |
|
852 |
+ assert(0 && "Encountered ] without matching ["); |
|
853 |
+ /* bad state */ |
|
854 |
+ break; |
|
855 |
+ case '.': |
|
856 |
+ token->type=TOKEN_DOT; |
|
857 |
+ break; |
|
858 |
+ case '(': |
|
859 |
+ token->type=TOKEN_PAR_OPEN; |
|
860 |
+ break; |
|
861 |
+ case ')': |
|
862 |
+ token->type=TOKEN_PAR_CLOSE; |
|
863 |
+ break; |
|
864 |
+ default: |
|
865 |
+ token->type=TOKEN_CHAR; |
|
866 |
+ token->u.start = pat; |
|
867 |
+ token->len=1; |
|
868 |
+ break; |
|
869 |
+ } |
|
870 |
+ return ++pat; |
|
871 |
+} |
|
872 |
+ |
|
873 |
+#define INITIAL_ALT_STACK 10 |
|
874 |
+#define ALT_STACK_GROW 20 |
|
875 |
+ |
|
876 |
+static const unsigned char* find_regex_start(const unsigned char* pat) |
|
877 |
+{ |
|
878 |
+ struct token_t token; |
|
879 |
+ /*TODO: find where the regex part begins, for ex: |
|
880 |
+ * abcd+, regex begins at 'd' |
|
881 |
+ * */ |
|
882 |
+ const unsigned char* last=NULL; |
|
883 |
+ const unsigned char* tmp=NULL; |
|
884 |
+ const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions)); |
|
885 |
+ size_t altpositions_capacity = INITIAL_ALT_STACK; |
|
886 |
+ size_t altpositions_cnt = 0; |
|
887 |
+ char lasttype = -1; |
|
888 |
+ if(!altpositions) |
|
889 |
+ return NULL; |
|
890 |
+ assert(pat); |
|
891 |
+ |
|
892 |
+ /* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}. |
|
893 |
+ * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to |
|
894 |
+ * back up to the last known good position |
|
895 |
+ * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (. |
|
896 |
+ * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz |
|
897 |
+ * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (? |
|
898 |
+ * */ |
|
899 |
+ do { |
|
900 |
+ tmp = pat; |
|
901 |
+ pat = getNextToken(pat,&token); |
|
902 |
+ if(token.type!=TOKEN_REGEX) { |
|
903 |
+ last = tmp; |
|
904 |
+ lasttype = token.type; |
|
905 |
+ if(token.type==TOKEN_BRACKET) |
|
906 |
+ free(token.u.bitmap); |
|
907 |
+ if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) { |
|
908 |
+ /* save this position on stack, succesfully parsed till here*/ |
|
909 |
+ if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|') |
|
910 |
+ /* encountered another alternate (|) operator, override previous | position stored */ |
|
911 |
+ altpositions[altpositions_cnt-1]=last; |
|
912 |
+ else { |
|
913 |
+ altpositions[altpositions_cnt++] = last; |
|
914 |
+ if(altpositions_cnt == altpositions_capacity) { |
|
915 |
+ altpositions_capacity += ALT_STACK_GROW; |
|
916 |
+ altpositions = cli_realloc(altpositions,altpositions_capacity*sizeof(*altpositions)); |
|
917 |
+ if(!altpositions) |
|
918 |
+ return NULL; |
|
919 |
+ } |
|
920 |
+ } |
|
921 |
+ } else if (lasttype==TOKEN_PAR_CLOSE) { |
|
922 |
+ /* remove last stored position from stack, succesfully this last group */ |
|
923 |
+ altpositions_cnt--; |
|
924 |
+ assert(altpositions_cnt>0); |
|
925 |
+ } |
|
926 |
+ } |
|
927 |
+ else { |
|
928 |
+ if(altpositions_cnt) |
|
929 |
+ last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */ |
|
930 |
+ /*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/ |
|
931 |
+ } |
|
932 |
+ } while(*pat && token.type!=TOKEN_REGEX); |
|
933 |
+ free(altpositions); |
|
934 |
+ return *pat ? last : last+1; |
|
935 |
+} |
|
936 |
+ |
|
937 |
+static struct tree_node* tree_node_alloc(struct tree_node* next,char listend) |
|
938 |
+{ |
|
939 |
+ struct tree_node* node = cli_malloc(sizeof(*node)); |
|
940 |
+ if(node) { |
|
941 |
+ node->alternatives=0; |
|
942 |
+ node->next=next; |
|
943 |
+ node->listend=listend; |
|
944 |
+ node->u.children=NULL; |
|
945 |
+ } |
|
946 |
+ return node; |
|
947 |
+} |
|
948 |
+ |
|
949 |
+static struct tree_node* tree_root_alloc(void) |
|
950 |
+{ |
|
951 |
+ struct tree_node* root=tree_node_alloc(NULL,1); |
|
952 |
+ if(root) { |
|
953 |
+ root->op=OP_ROOT; |
|
954 |
+ root->c=0; |
|
955 |
+ root->next=NULL; |
|
956 |
+ root->listend=1; |
|
957 |
+ } |
|
958 |
+ return root; |
|
959 |
+} |
|
960 |
+static inline struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left) |
|
961 |
+{ |
|
962 |
+ int right; |
|
963 |
+ struct tree_node **children; |
|
964 |
+ assert(node); |
|
965 |
+ assert(left); |
|
966 |
+ |
|
967 |
+ children = tree_node_get_children(node); |
|
968 |
+ right = node->alternatives-1; |
|
969 |
+ *left = 0; |
|
970 |
+ if(!node->alternatives) |
|
971 |
+ return NULL; |
|
972 |
+ assert(children); |
|
973 |
+ while(*left<=right) { |
|
974 |
+ int mid = *left+(right-*left)/2; |
|
975 |
+ if(children[mid]->c == csearch) |
|
976 |
+ return children[mid]; |
|
977 |
+ else if(children[mid]->c < csearch) |
|
978 |
+ *left=mid+1; |
|
979 |
+ else |
|
980 |
+ right=mid-1; |
|
981 |
+ } |
|
982 |
+ return NULL; |
|
983 |
+} |
|
984 |
+ |
|
985 |
+static inline struct tree_node* tree_get_next(struct tree_node* node) |
|
986 |
+{ |
|
987 |
+ struct tree_node** children; |
|
988 |
+ assert(node); |
|
989 |
+ children = tree_node_get_children(node); |
|
990 |
+ |
|
991 |
+ if(!node->alternatives && children && children[0]) |
|
992 |
+ return children[0]; |
|
993 |
+ else if(node->alternatives<=1) |
|
994 |
+ return node; |
|
995 |
+ else |
|
996 |
+ return children[0]->next; |
|
997 |
+} |
|
998 |
+ |
|
999 |
+static inline size_t tree_node_get_array_size(const struct tree_node* node) |
|
1000 |
+{ |
|
1001 |
+ assert(node); |
|
1002 |
+ /* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */ |
|
1003 |
+ return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]); |
|
1004 |
+} |
|
1005 |
+ |
|
1006 |
+static inline struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left) |
|
1007 |
+{ |
|
1008 |
+ struct tree_node* new, *alt = tree_get_next(node); |
|
1009 |
+ node->alternatives++; |
|
1010 |
+ node->u.children = cli_realloc(node->u.children,tree_node_get_array_size(node)); |
|
1011 |
+ if(!node->u.children) |
|
1012 |
+ return NULL; |
|
1013 |
+ |
|
1014 |
+ new = tree_node_alloc(alt , node == alt ); |
|
1015 |
+ if(new) { |
|
1016 |
+ new->op=OP_CHAR; |
|
1017 |
+ new->c=c; |
|
1018 |
+ } |
|
1019 |
+ |
|
1020 |
+ if(node->alternatives-left-1>0) |
|
1021 |
+ memmove(&node->u.children[left+1],&node->u.children[left],(node->alternatives-left-1)*sizeof(node->u.children[0])); |
|
1022 |
+ node->u.children[left] = new; |
|
1023 |
+ |
|
1024 |
+ return new; |
|
1025 |
+} |
|
1026 |
+ |
|
1027 |
+static inline void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new) |
|
1028 |
+{ |
|
1029 |
+ struct tree_node **children; |
|
1030 |
+ assert(node); |
|
1031 |
+ assert(new); |
|
1032 |
+ |
|
1033 |
+ children = tree_node_get_children(node); |
|
1034 |
+ if(node->alternatives) { |
|
1035 |
+ assert(children); |
|
1036 |
+ if(children[0]->next == node) { |
|
1037 |
+ int i; |
|
1038 |
+ new->listend = 1; |
|
1039 |
+ for(i=0;i<node->alternatives;i++) { |
|
1040 |
+ children[i]->next = new; |
|
1041 |
+ children[i]->listend = 0; |
|
1042 |
+ } |
|
1043 |
+ } |
|
1044 |
+ else { |
|
1045 |
+ struct tree_node* p; |
|
1046 |
+ for(p = children[0]->next ; p->next != node ; p = p->next) |
|
1047 |
+ assert(!p->listend); |
|
1048 |
+ new->listend = 1; |
|
1049 |
+ p->listend = 0; |
|
1050 |
+ p->next = new; |
|
1051 |
+ } |
|
1052 |
+ } |
|
1053 |
+ else { |
|
1054 |
+ node->u.children = cli_realloc(node->u.children,sizeof(node->u.children[0])*( node->op==OP_CUSTOMCLASS ? 2 : 1 )); |
|
1055 |
+ if(node->u.children) |
|
1056 |
+ node->u.children[ node->op==OP_CUSTOMCLASS ? 1 : 0 ] = new; |
|
1057 |
+ } |
|
1058 |
+} |
|
1059 |
+ |
|
1060 |
+static inline unsigned char char_getclass(const unsigned char* bitmap) |
|
1061 |
+{ |
|
1062 |
+ size_t i; |
|
1063 |
+ assert(bitmap); |
|
1064 |
+ |
|
1065 |
+ for(i=0;i<std_class_cnt;i++) |
|
1066 |
+ if(!memcmp(bitmap,char_class_bitmap[i],256>>3)) |
|
1067 |
+ return i; |
|
1068 |
+ return std_class_cnt; |
|
1069 |
+} |
|
1070 |
+ |
|
1071 |
+static void stack_destroy(struct node_stack* stack) |
|
1072 |
+{ |
|
1073 |
+ assert(stack); |
|
1074 |
+ if(stack->data) |
|
1075 |
+ free(stack->data); |
|
1076 |
+ stack->data = NULL; |
|
1077 |
+ stack->capacity = 0; |
|
1078 |
+} |
|
1079 |
+ |
|
1080 |
+ |
|
1081 |
+/* call this after whitelist load is complete, and the tree is no longer going to be modified */ |
|
1082 |
+void regex_list_cleanup(struct regex_matcher* matcher) |
|
1083 |
+{ |
|
1084 |
+ assert(matcher); |
|
1085 |
+ |
|
1086 |
+ stack_destroy(&matcher->node_stack); |
|
1087 |
+ stack_destroy(&matcher->node_stack_alt); |
|
1088 |
+ stack_init(&matcher->node_stack); |
|
1089 |
+ stack_init(&matcher->node_stack_alt); |
|
1090 |
+} |
|
1091 |
+ |
|
1092 |
+int is_regex_ok(struct regex_matcher* matcher) |
|
1093 |
+{ |
|
1094 |
+ assert(matcher); |
|
1095 |
+ return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */ |
|
1096 |
+} |
|
1097 |
+ |
|
1098 |
+/* returns 0 on success, regexec error code otherwise */ |
|
1099 |
+static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info) |
|
1100 |
+{ |
|
1101 |
+ int bol=1; |
|
1102 |
+ const unsigned char* pat_end = find_regex_start(pat); |
|
1103 |
+ struct token_t token; |
|
1104 |
+ struct tree_node* node; |
|
1105 |
+ |
|
1106 |
+ assert(matcher); |
|
1107 |
+ |
|
1108 |
+ node = matcher->root_regex; |
|
1109 |
+ |
|
1110 |
+ stack_reset(&matcher->node_stack); |
|
1111 |
+ stack_reset(&matcher->node_stack_alt); |
|
1112 |
+ stack_push(&matcher->node_stack,node); |
|
1113 |
+ |
|
1114 |
+ for(;node->op!=OP_LEAF;){ |
|
1115 |
+ if(pat<pat_end) |
|
1116 |
+ pat = getNextToken(pat,&token); |
|
1117 |
+ else if(*pat) { |
|
1118 |
+ token.type = TOKEN_REGEX; |
|
1119 |
+ token.u.start=pat; |
|
1120 |
+ } |
|
1121 |
+ else |
|
1122 |
+ token.type = TOKEN_DONE; |
|
1123 |
+ |
|
1124 |
+ switch(token.type) { |
|
1125 |
+ case TOKEN_CHAR: |
|
1126 |
+ { |
|
1127 |
+ /* search for char in tree */ |
|
1128 |
+ int left; |
|
1129 |
+ struct tree_node* newnode = tree_node_char_binsearch(node,*token.u.start,&left); |
|
1130 |
+ if(newnode) |
|
1131 |
+ node = newnode; |
|
1132 |
+ else { |
|
1133 |
+ /* not found, insert it */ |
|
1134 |
+ node = tree_node_char_insert(node,*token.u.start,left); |
|
1135 |
+ } |
|
1136 |
+ break; |
|
1137 |
+ } |
|
1138 |
+ |
|
1139 |
+ case TOKEN_PAR_OPEN: |
|
1140 |
+ stack_push(&matcher->node_stack_alt,NULL);/* marker */ |
|
1141 |
+ stack_push(&matcher->node_stack,node); |
|
1142 |
+ break; |
|
1143 |
+ |
|
1144 |
+ case TOKEN_PAR_CLOSE: { |
|
1145 |
+ /*TODO: test this!!!*/ |
|
1146 |
+ struct tree_node* node_alt = node; |
|
1147 |
+ node = tree_node_alloc(NULL,1); |
|
1148 |
+ node->op=OP_PARCLOSE; |
|
1149 |
+ node->c=0; |
|
1150 |
+ node->listend=1; |
|
1151 |
+ tree_node_insert_nonbin(node_alt,node); |
|
1152 |
+ while (( node_alt = stack_pop(&matcher->node_stack_alt) )) { |
|
1153 |
+ tree_node_insert_nonbin(node_alt,node); |
|
1154 |
+ } |
|
1155 |
+ stack_pop(&matcher->node_stack); |
|
1156 |
+ break; |
|
1157 |
+ } |
|
1158 |
+ |
|
1159 |
+ case TOKEN_ALT: |
|
1160 |
+ stack_push(&matcher->node_stack_alt,node); |
|
1161 |
+ node = stack_pop(&matcher->node_stack); |
|
1162 |
+ stack_push(&matcher->node_stack,node); |
|
1163 |
+ break; |
|
1164 |
+ |
|
1165 |
+ case TOKEN_BRACKET: |
|
1166 |
+ { |
|
1167 |
+ struct tree_node* new = tree_node_alloc(tree_get_next(node),1); |
|
1168 |
+ unsigned char charclass = char_getclass(token.u.start); |
|
1169 |
+ if(charclass == std_class_cnt) {/*not a std char class*/ |
|
1170 |
+ new->op = OP_CUSTOMCLASS; |
|
1171 |
+ new->u.children = cli_malloc(sizeof(new->u.children[0])*2); |
|
1172 |
+ new->u.bitmap[0] = token.u.bitmap; |
|
1173 |
+ new->u.bitmap[1] = NULL; |
|
1174 |
+ tree_node_insert_nonbin(node,new); |
|
1175 |
+ node = new; |
|
1176 |
+ } |
|
1177 |
+ else { |
|
1178 |
+ new->op = OP_STDCLASS; |
|
1179 |
+ new->c = charclass; |
|
1180 |
+ tree_node_insert_nonbin(node,new); |
|
1181 |
+ node=new; |
|
1182 |
+ } |
|
1183 |
+ break; |
|
1184 |
+ } |
|
1185 |
+ |
|
1186 |
+ case TOKEN_DOT: |
|
1187 |
+ { |
|
1188 |
+ struct tree_node* new = tree_node_alloc(tree_get_next(node),1); |
|
1189 |
+ new->op = OP_DOT; |
|
1190 |
+ tree_node_insert_nonbin(node,new); |
|
1191 |
+ node=new; |
|
1192 |
+ break; |
|
1193 |
+ } |
|
1194 |
+ |
|
1195 |
+ case TOKEN_REGEX: |
|
1196 |
+ case TOKEN_DONE: { |
|
1197 |
+ struct leaf_info* leaf=cli_malloc(sizeof(*leaf)); |
|
1198 |
+ leaf->info=strdup(info); |
|
1199 |
+ if(token.type==TOKEN_REGEX) { |
|
1200 |
+ int rc; |
|
1201 |
+ struct tree_node* new; |
|
1202 |
+ regex_t* preg; |
|
1203 |
+ preg=cli_malloc(sizeof(*preg)); |
|
1204 |
+ rc = regcomp(preg,(const char*)token.u.start,bol?0:REG_NOTBOL); |
|
1205 |
+ leaf->preg=preg; |
|
1206 |
+ if(rc) |
|
1207 |
+ return rc; |
|
1208 |
+ new=cli_malloc(sizeof(*new)); |
|
1209 |
+ new->op=OP_LEAF; |
|
1210 |
+ new->next=node; |
|
1211 |
+ new->alternatives=0; |
|
1212 |
+ new->u.leaf=leaf; |
|
1213 |
+ new->listend=1; |
|
1214 |
+ tree_node_insert_nonbin(node,new); |
|
1215 |
+ } |
|
1216 |
+ else { |
|
1217 |
+ leaf->preg=NULL; |
|
1218 |
+ node->alternatives=0; |
|
1219 |
+ node->u.leaf=leaf; |
|
1220 |
+ node->op=OP_LEAF; |
|
1221 |
+ } |
|
1222 |
+ return 0; |
|
1223 |
+ } |
|
1224 |
+ } |
|
1225 |
+ |
|
1226 |
+ bol=0; |
|
1227 |
+ } |
|
1228 |
+ return 0; |
|
1229 |
+} |
|
1230 |
+ |
|
1231 |
+/* c has to be unsigned char here!! */ |
|
1232 |
+static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info) |
|
1233 |
+{ |
|
1234 |
+ struct tree_node** children; |
|
1235 |
+ int rc; |
|
1236 |
+ |
|
1237 |
+ assert(node); |
|
1238 |
+ assert(c); |
|
1239 |
+ assert(info); |
|
1240 |
+ |
|
1241 |
+ *info = NULL; |
|
1242 |
+ len++; |
|
1243 |
+ c--; |
|
1244 |
+ for(;;) { |
|
1245 |
+ assert(node); |
|
1246 |
+ children = node->u.children; |
|
1247 |
+ switch(node->op) { |
|
1248 |
+ case OP_ROOT: |
|
1249 |
+ rc=1; |
|
1250 |
+ break; |
|
1251 |
+ case OP_PARCLOSE: |
|
1252 |
+ /*this isn't a real character, so don't move*/ |
|
1253 |
+ c--; |
|
1254 |
+ len++; |
|
1255 |
+ rc=1; |
|
1256 |
+ break; |
|
1257 |
+ case OP_CHAR: |
|
1258 |
+ assert(*c==node->c && "We know this has to match"); |
|
1259 |
+ rc = 1;/* *c==node->c;- we know it has matched */ |
|
1260 |
+ break; |
|
1261 |
+ case OP_DOT: |
|
1262 |
+ rc = *c!='\n'; |
|
1263 |
+ break; |
|
1264 |
+ case OP_STDCLASS: |
|
1265 |
+ rc = char_class[*c]&(node->c); |
|
1266 |
+ break; |
|
1267 |
+ case OP_CUSTOMCLASS: |
|
1268 |
+ { |
|
1269 |
+ char_bitmap_p bitmap; |
|
1270 |
+ assert(children); |
|
1271 |
+ bitmap = (char_bitmap_p)node->u.bitmap[0]; |
|
1272 |
+ children++; |
|
1273 |
+ rc = bitmap[*c>>3]&(1<<(*c&0x7)); |
|
1274 |
+ break; |
|
1275 |
+ } |
|
1276 |
+ case OP_LEAF: |
|
1277 |
+ { |
|
1278 |
+ const struct leaf_info* leaf = node->u.leaf; |
|
1279 |
+ /*isleaf = 1;*/ |
|
1280 |
+ if(leaf->preg) { |
|
1281 |
+ rc = !regexec(leaf->preg,(const char*)c,0,NULL,0); |
|
1282 |
+ } |
|
1283 |
+ else { |
|
1284 |
+ assert(*c==node->c && "We know this has to match[2]"); |
|
1285 |
+ rc = 1; |
|
1286 |
+ } |
|
1287 |
+ if(rc) { |
|
1288 |
+ *info = leaf->info; |
|
1289 |
+ return MATCH_SUCCESS; |
|
1290 |
+ } |
|
1291 |
+ break; |
|
1292 |
+ } |
|
1293 |
+ default: |
|
1294 |
+ /* impossible */ |
|
1295 |
+ cli_errmsg("Encountered invalid operator in tree:%d\n",node->op); |
|
1296 |
+ exit(1); |
|
1297 |
+ } |
|
1298 |
+ len--; |
|
1299 |
+ if(!len) rc=0; |
|
1300 |
+ c++; |
|
1301 |
+ if(rc) { |
|
1302 |
+ const char csearch = *c; |
|
1303 |
+ int left = 0,right = node->alternatives-1; |
|
1304 |
+ int mid; |
|
1305 |
+ /*matched so far, go deeper*/ |
|
1306 |
+ /*do a binary search between children */ |
|
1307 |
+ assert(children); |
|
1308 |
+ while(left<=right) { |
|
1309 |
+ mid = left+(right-left)/2; |
|
1310 |
+ if (children[mid]->c == csearch) |
|
1311 |
+ break; |
|
1312 |
+ else if(children[mid]->c < csearch) |
|
1313 |
+ left=mid+1; |
|
1314 |
+ else |
|
1315 |
+ right=mid-1; |
|
1316 |
+ } |
|
1317 |
+ if(left<=right) { |
|
1318 |
+ node = children[mid]; |
|
1319 |
+ assert(node); |
|
1320 |
+ } |
|
1321 |
+ else { |
|
1322 |
+ if(node->alternatives) { |
|
1323 |
+ if(!children[0]->listend) { |
|
1324 |
+ node = children[0]; |
|
1325 |
+ c++; |
|
1326 |
+ len--; |
|
1327 |
+ } |
|
1328 |
+ while(node && node->listend) { |
|
1329 |
+ node = node->next;/* climb up */ |
|
1330 |
+ c--; |
|
1331 |
+ len++; |
|
1332 |
+ } |
|
1333 |
+ if(!node || !node->next) |
|
1334 |
+ return MATCH_FAILED;/* reached root node */ |
|
1335 |
+ node=node->next; |
|
1336 |
+ c--; |
|
1337 |
+ len++; |
|
1338 |
+ } |
|
1339 |
+ else if(node->u.children) { |
|
1340 |
+ struct tree_node* rewrite_next = NULL; |
|
1341 |
+ if(node->op==OP_PARCLOSE) |
|
1342 |
+ rewrite_next = node; |
|
1343 |
+ node = children[0]; |
|
1344 |
+ assert(node); |
|
1345 |
+ assert(node->op!=OP_CHAR); |
|
1346 |
+ if(rewrite_next) |
|
1347 |
+ node->next = rewrite_next;/* this node is pointed to by several parent nodes, |
|
1348 |
+ we need to know |
|
1349 |
+ from which one we came, so we can find out way back |
|
1350 |
+ should we fail to match somewhere deeper*/ |
|
1351 |
+ } |
|
1352 |
+ } |
|
1353 |
+ } |
|
1354 |
+ else { |
|
1355 |
+ /* this node didn't match, try sibling, or parent (if no more siblings) */ |
|
1356 |
+ while(node && node->listend) { |
|
1357 |
+ node = node->next;/* sibling of parent */ |
|
1358 |
+ c--; |
|
1359 |
+ len++; |
|
1360 |
+ } |
|
1361 |
+ if(!node || !node->next) /* reached root node, it has no next */ |
|
1362 |
+ return MATCH_FAILED; |
|
1363 |
+ else node=node->next; |
|
1364 |
+ } |
|
1365 |
+ } |
|
1366 |
+ return MATCH_FAILED; |
|
1367 |
+} |
|
1368 |
+ |
|
1369 |
+/* push node on stack, only if it isn't there already */ |
|
1370 |
+static inline void stack_push_once(struct node_stack* stack,struct tree_node* node) |
|
1371 |
+{ |
|
1372 |
+ size_t i; |
|
1373 |
+ assert(stack); |
|
1374 |
+ assert(node); |
|
1375 |
+ |
|
1376 |
+ for(i=0;i < stack->cnt;i++) |
|
1377 |
+ if(stack->data[i]==node) |
|
1378 |
+ return; |
|
1379 |
+ stack_push(stack,node); |
|
1380 |
+} |
|
1381 |
+ |
|
1382 |
+static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node) |
|
1383 |
+{ |
|
1384 |
+ struct tree_node **children; |
|
1385 |
+ assert(matcher); |
|
1386 |
+ assert(node); |
|
1387 |
+ |
|
1388 |
+ children = tree_node_get_children(node); |
|
1389 |
+ if(node->op==OP_LEAF) { |
|
1390 |
+ struct leaf_info* leaf = node->u.leaf; |
|
1391 |
+ if(node->next && !node->listend) |
|
1392 |
+ destroy_tree_internal(matcher,node->next); |
|
1393 |
+ stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */ |
|
1394 |
+ stack_push_once(&matcher->node_stack,node); |
|
1395 |
+ if(leaf->preg) { |
|
1396 |
+ regfree(leaf->preg); |
|
1397 |
+ free(leaf->preg); |
|
1398 |
+ leaf->preg=NULL; |
|
1399 |
+ } |
|
1400 |
+ if(leaf->info) { |
|
1401 |
+ free(leaf->info); |
|
1402 |
+ leaf->info=NULL; |
|
1403 |
+ } |
|
1404 |
+ /* return;*/ |
|
1405 |
+ } |
|
1406 |
+ if(node->alternatives) { |
|
1407 |
+ int i; |
|
1408 |
+ struct tree_node* p; |
|
1409 |
+ assert(children); |
|
1410 |
+ p = children[0]->op==OP_LEAF ? NULL : children[0]->next; |
|
1411 |
+ for(i=0;i<node->alternatives;i++) |
|
1412 |
+ destroy_tree_internal(matcher,children[i]); |
|
1413 |
+ if(p && p!=node) |
|
1414 |
+ destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/ |
|
1415 |
+ } |
|
1416 |
+ else { |
|
1417 |
+ if(children) { |
|
1418 |
+ if(children[0]) |
|
1419 |
+ destroy_tree_internal(matcher,children[0]); |
|
1420 |
+ } |
|
1421 |
+ } |
|
1422 |
+ if(node->next && !node->listend) |
|
1423 |
+ destroy_tree_internal(matcher,node->next); |
|
1424 |
+ if(node->u.children) |
|
1425 |
+ stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */ |
|
1426 |
+ if(node->op==OP_CUSTOMCLASS && node->u.children[0]) { |
|
1427 |
+ free(node->u.children[0]); |
|
1428 |
+ node->u.children[0]=NULL; |
|
1429 |
+ } |
|
1430 |
+ stack_push_once(&matcher->node_stack,node); |
|
1431 |
+} |
|
1432 |
+ |
|
1433 |
+static void destroy_tree(struct regex_matcher* matcher) |
|
1434 |
+{ |
|
1435 |
+ /* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations, |
|
1436 |
+ * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack, |
|
1437 |
+ * (and push to stack only if it doesn't contain it already*/ |
|
1438 |
+ assert(matcher); |
|
1439 |
+ |
|
1440 |
+ stack_reset(&matcher->node_stack); |
|
1441 |
+ destroy_tree_internal(matcher,matcher->root_regex); |
|
1442 |
+ while (matcher->node_stack.cnt) { |
|
1443 |
+ struct tree_node* node = stack_pop(&matcher->node_stack); |
|
1444 |
+ free(node); |
|
1445 |
+ } |
|
1446 |
+} |
|
1447 |
+#ifndef NDEBUG |
|
1448 |
+static void dump_node(struct tree_node* node) |
|
1449 |
+{ |
|
1450 |
+ int i; |
|
1451 |
+ struct tree_node* p,**children; |
|
1452 |
+ assert(node); |
|
1453 |
+ if(node->op==OP_LEAF) { |
|
1454 |
+ if(node->u.leaf->preg) |
|
1455 |
+ printf("n%p [label=\"regex\\nleaf\"]",(void*)node); |
|
1456 |
+ else |
|
1457 |
+ printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c); |
|
1458 |
+ if(node->next && !node->listend) { |
|
1459 |
+ printf("n%p -> n%p;\n",(void*)node,(void*)node->next); |
|
1460 |
+ dump_node(node->next); |
|
1461 |
+ } |
|
1462 |
+ return; |
|
1463 |
+ } |
|
1464 |
+ printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend); |
|
1465 |
+ if(node->next) |
|
1466 |
+ printf("n%p -> n%p;\n",(void*)node,(void*)node->next); |
|
1467 |
+ printf("n%p -> {",(void*)node);/*using address of node as id*/ |
|
1468 |
+ children = tree_node_get_children(node); |
|
1469 |
+ if(node->alternatives) |
|
1470 |
+ assert(children); |
|
1471 |
+ for(i=0;i<node->alternatives;i++) |
|
1472 |
+ printf("n%p ",(void*)children[i]); |
|
1473 |
+ if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1474 |
+ for(p=children[0]->next;p!=node;p=p->next) |
|
1475 |
+ { |
|
1476 |
+ assert(p); |
|
1477 |
+ printf("n%p ",(void*)p); |
|
1478 |
+ if(p->op==OP_LEAF || p->listend) |
|
1479 |
+ break; |
|
1480 |
+ } |
|
1481 |
+ if(!node->alternatives && children && children[0]) |
|
1482 |
+ printf("n%p ",(void*)children[0]); |
|
1483 |
+ printf("};\n"); |
|
1484 |
+ printf("{rank=same;"); |
|
1485 |
+ for(i=0;i<node->alternatives;i++) |
|
1486 |
+ printf("n%p ",(void*)node->u.children[i]); |
|
1487 |
+ if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1488 |
+ for(p=children[0]->next;p!=node;p=p->next) |
|
1489 |
+ { |
|
1490 |
+ printf("n%p ",(void*)p); |
|
1491 |
+ if(p->op==OP_LEAF || p->listend) |
|
1492 |
+ break; |
|
1493 |
+ } |
|
1494 |
+ if(!node->alternatives && children && children[0]) |
|
1495 |
+ printf("n%p ",(void*)children[0]); |
|
1496 |
+ printf("};\n"); |
|
1497 |
+ for(i=0;i<node->alternatives;i++) |
|
1498 |
+ dump_node(children[i]); |
|
1499 |
+ if(node->alternatives && children[0]->op!=OP_LEAF) |
|
1500 |
+ for(p=children[0]->next;p!=node;p=p->next) |
|
1501 |
+ { |
|
1502 |
+ dump_node(p); |
|
1503 |
+ if(p->op==OP_LEAF || p->listend) |
|
1504 |
+ break; |
|
1505 |
+ } |
|
1506 |
+ if(!node->alternatives && children && children[0]) |
|
1507 |
+ dump_node(children[0]); |
|
1508 |
+} |
|
1509 |
+ |
|
1510 |
/*
 * Print the tree below `root` to stdout as a graphviz digraph;
 * use dot/dotty from graphviz to view it.
 */
void dump_tree(struct tree_node* root)
{
	assert(root);

	fputs("digraph tree {\n",stdout);
	dump_node(root);
	fputs("}\n",stdout);
}
|
1518 |
+#endif |
|
1519 |
+ |
|
1520 |
+#endif |
0 | 1521 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,53 @@ |
0 |
+/* |
|
1 |
+ * Match a string against a list of patterns/regexes. |
|
2 |
+ * |
|
3 | ||
4 |
+ * |
|
5 |
+ * This program is free software; you can redistribute it and/or modify |
|
6 |
+ * it under the terms of the GNU General Public License as published by |
|
7 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
8 |
+ * (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * This program is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
+ * GNU General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU General Public License |
|
16 |
+ * along with this program; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
|
18 |
+ * MA 02110-1301, USA. |
|
19 |
+ * |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#ifdef CL_EXPERIMENTAL |
|
23 |
+ |
|
24 |
+#ifndef _REGEX_LIST_H |
|
25 |
+#define _REGEX_LIST_H |
|
26 |
+ |
|
27 |
/* A growable stack of tree node pointers. */
struct node_stack {
	struct tree_node **data;	/* storage for the stacked pointers */
	size_t capacity;		/* presumably the number of slots allocated in data - TODO confirm */
	size_t cnt;			/* number of entries currently on the stack */
};
|
32 |
+ |
|
33 |
+struct regex_matcher { |
|
34 |
+ struct cli_matcher* root_hosts; |
|
35 |
+ struct cli_matcher* root_urls; |
|
36 |
+ struct tree_node* root_regex; |
|
37 |
+ int list_inited; |
|
38 |
+ int list_loaded; |
|
39 |
+ int list_built; |
|
40 |
+ struct node_stack node_stack; |
|
41 |
+ struct node_stack node_stack_alt; |
|
42 |
+}; |
|
43 |
+ |
|
44 |
/* Public interface of the regex list matcher. */

/* On a match, *info presumably receives the info string of the matching entry - TODO confirm */
int regex_list_match(struct regex_matcher *matcher,
		     const char *real_url,
		     const char *display_url,
		     int hostOnly,
		     const char **info);

int init_regex_list(struct regex_matcher *matcher);

int load_regex_matcher(struct regex_matcher *matcher,
		       FILE *fd,
		       unsigned int options);

/* Call after loading is complete and the tree is no longer going to be modified. */
void regex_list_cleanup(struct regex_matcher *matcher);

void regex_list_done(struct regex_matcher *matcher);

/* Non-zero unless the list failed to initialize (list_inited == -1). */
int is_regex_ok(struct regex_matcher *matcher);
|
50 |
+#endif |
|
51 |
+ |
|
52 |
+#endif |