GitList

Browse code

Phishing module merge - libclamav

git-svn: trunk@2244

aCaB authored on 2006/09/13 04:38:39
Showing 15 changed files

clamav-devel/ChangeLog index e0bee19..10f7185 100644
clamav-devel/configure index 4fb7f20..9f41c81 100755
clamav-devel/libclamav/Makefile.am index 77697b2..c5e4c8a 100644
clamav-devel/libclamav/Makefile.in index 96937d3..e0cc30c 100644
clamav-devel/libclamav/htmlnorm.h index 25e793c..bf358db 100644
clamav-devel/libclamav/iana_tld.h index 0000000..95730ea
clamav-devel/libclamav/phish_domaincheck_db.c index 0000000..b24a292
clamav-devel/libclamav/phish_domaincheck_db.h index 5d30c77..667d851 100644
clamav-devel/libclamav/phish_whitelist.c index 0000000..b295924
clamav-devel/libclamav/phish_whitelist.h index 26e42e4..02bc8df 100644
clamav-devel/libclamav/phishcheck.c index 0000000..51e60d2
clamav-devel/libclamav/phishcheck.h index 0000000..2d64157
clamav-devel/libclamav/readdb.c index 53efb14..58e5e21 100644
clamav-devel/libclamav/regex_list.c index 0000000..f74e695
clamav-devel/libclamav/regex_list.h index 0000000..e9ba4aa

@@ -1,3 +1,9 @@
                     +Tue Sep 12 21:59:17 CEST 2006 (acab)
                     +------------------------------------
                     +  * libclamav: Merge of the related part of the phishing module from
                     +               Torok Edvin <edwintorok*gmail.com>
                     +	       Part of the Google Summer of Code program
                     +
                      Tue Sep 12 20:42:04 CEST 2006 (acab)
                      ------------------------------------
                        * sigtool: Merge of the related part of the phishing module from

clamav-devel/configure

History View file @ bd912dd

@@ -1,6 +1,6 @@
                      #! /bin/sh
                      # Guess values for system-dependent variables and create Makefiles.
                     -# Generated by GNU Autoconf 2.60.
                     +# Generated by GNU Autoconf 2.60a.
                      #
                      # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
                      # 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
@@ -724,36 +724,36 @@ ac_unique_file="clamscan/clamscan.c"
                      # Factoring default headers for most tests.
                      ac_includes_default="\
                      #include <stdio.h>
                     -#if HAVE_SYS_TYPES_H
                     +#ifdef HAVE_SYS_TYPES_H
                      # include <sys/types.h>
                      #endif
                     -#if HAVE_SYS_STAT_H
                     +#ifdef HAVE_SYS_STAT_H
                      # include <sys/stat.h>
                      #endif
                     -#if STDC_HEADERS
                     +#ifdef STDC_HEADERS
                      # include <stdlib.h>
                      # include <stddef.h>
                      #else
                     -# if HAVE_STDLIB_H
                     +# ifdef HAVE_STDLIB_H
                      #  include <stdlib.h>
                      # endif
                      #endif
                     -#if HAVE_STRING_H
                     -# if !STDC_HEADERS && HAVE_MEMORY_H
                     +#ifdef HAVE_STRING_H
                     +# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
                      #  include <memory.h>
                      # endif
                      # include <string.h>
                      #endif
                     -#if HAVE_STRINGS_H
                     +#ifdef HAVE_STRINGS_H
                      # include <strings.h>
                      #endif
                     -#if HAVE_INTTYPES_H
                     +#ifdef HAVE_INTTYPES_H
                      # include <inttypes.h>
                      #endif
                     -#if HAVE_STDINT_H
                     +#ifdef HAVE_STDINT_H
                      # include <stdint.h>
                      #endif
                     -#if HAVE_UNISTD_H
                     +#ifdef HAVE_UNISTD_H
                      # include <unistd.h>
                      #endif"
@@ -1576,7 +1576,7 @@ test -n "$ac_init_help" && exit $ac_status
                      if $ac_init_version; then
                        cat <<\_ACEOF
                      configure
                     -generated by GNU Autoconf 2.60
                     +generated by GNU Autoconf 2.60a
                      Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
                      2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
@@ -1590,7 +1590,7 @@ This file contains any messages produced by compilers while
                      running configure, to aid debugging if configure makes a mistake.
                      It was created by $as_me, which was
                     -generated by GNU Autoconf 2.60.  Invocation command line was
                     +generated by GNU Autoconf 2.60a.  Invocation command line was
                        $ $0 $@
@@ -3164,7 +3164,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
                      # in a Makefile.  We should not override ac_cv_exeext if it was cached,
                      # so that the user can short-circuit this test for compilers unknown to
                      # Autoconf.
                     -for ac_file in $ac_files
                     +for ac_file in $ac_files ''
                      do
                        test -f "$ac_file" || continue
                        case $ac_file in
@@ -3192,6 +3192,12 @@ done
                      test "$ac_cv_exeext" = no && ac_cv_exeext=
                      else
                     +  ac_file=''
                     +fi
                     +
                     +{ echo "$as_me:$LINENO: result: $ac_file" >&5
                     +echo "${ECHO_T}$ac_file" >&6; }
                     +if test -z "$ac_file"; then
                        echo "$as_me: failed program was:" >&5
                      sed 's/^/| /' conftest.$ac_ext >&5
@@ -3203,8 +3209,6 @@ See \`config.log' for more details." >&2;}
                      fi
                      ac_exeext=$ac_cv_exeext
                     -{ echo "$as_me:$LINENO: result: $ac_file" >&5
                     -echo "${ECHO_T}$ac_file" >&6; }
                      # Check that the compiler produces executables we can run.  If not, either
                      # the compiler is broken, or we cross compile.
@@ -5882,7 +5886,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
                      case $host in
                      *-*-irix6*)
                        # Find out which ABI we are using.
                     -  echo '#line 5885 "configure"' > conftest.$ac_ext
                     +  echo '#line 5889 "configure"' > conftest.$ac_ext
                        if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
                        (eval $ac_compile) 2>&5
                        ac_status=$?
@@ -5994,7 +5998,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	lt_cv_cc_needs_belf=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                           ac_ext=c
                      ac_cpp='$CPP $CPPFLAGS'
@@ -6470,7 +6474,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                        LDFLAGS="$save_LDFLAGS"
@@ -6515,7 +6519,7 @@ chmod -w .
                      save_CFLAGS="$CFLAGS"
                      CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
                      compiler_c_o=no
                     -if { (eval echo configure:6518: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
                     +if { (eval echo configure:6522: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
                        # The compiler can only warn and ignore the option if not recognized
                        # So say no if there are warnings
                        if test -s out/conftest.err; then
@@ -8076,7 +8080,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_func_shl_load=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      { echo "$as_me:$LINENO: result: $ac_cv_func_shl_load" >&5
@@ -8155,7 +8159,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_dld_shl_load=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -8256,7 +8260,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_func_dlopen=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      { echo "$as_me:$LINENO: result: $ac_cv_func_dlopen" >&5
@@ -8335,7 +8339,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_dl_dlopen=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -8415,7 +8419,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_svld_dlopen=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -8495,7 +8499,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_dld_dld_link=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -8551,7 +8555,7 @@ else
                          lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
                        lt_status=$lt_dlunknown
                        cat > conftest.$ac_ext <<EOF
                     -#line 8554 "configure"
                     +#line 8558 "configure"
                      #include "confdefs.h"
                      #if HAVE_DLFCN_H
@@ -8649,7 +8653,7 @@ else
                          lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
                        lt_status=$lt_dlunknown
                        cat > conftest.$ac_ext <<EOF
                     -#line 8652 "configure"
                     +#line 8656 "configure"
                      #include "confdefs.h"
                      #if HAVE_DLFCN_H
@@ -10423,7 +10427,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_socket_bind=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -10504,7 +10508,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_nsl_gethostent=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -10622,7 +10626,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	eval "$as_ac_var=no"
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      ac_res=`eval echo '${'$as_ac_var'}'`
@@ -10897,7 +10901,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	eval "$as_ac_var=no"
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      ac_res=`eval echo '${'$as_ac_var'}'`
@@ -10954,21 +10958,21 @@ $ac_includes_default
                      #include <fcntl.h>
                      #include <sys/mman.h>
                     -#if !STDC_HEADERS && !HAVE_STDLIB_H
                     +#if !defined STDC_HEADERS && !defined HAVE_STDLIB_H
                      char *malloc ();
                      #endif
                      /* This mess was copied from the GNU getpagesize.h.  */
                     -#if !HAVE_GETPAGESIZE
                     +#ifndef HAVE_GETPAGESIZE
                      /* Assume that all systems that can run configure have sys/param.h.  */
                     -# if !HAVE_SYS_PARAM_H
                     +# ifndef HAVE_SYS_PARAM_H
                      #  define HAVE_SYS_PARAM_H 1
                      # endif
                      # ifdef _SC_PAGESIZE
                      #  define getpagesize() sysconf(_SC_PAGESIZE)
                      # else /* no _SC_PAGESIZE */
                     -#  if HAVE_SYS_PARAM_H
                     +#  ifdef HAVE_SYS_PARAM_H
                      #   include <sys/param.h>
                      #   ifdef EXEC_PAGESIZE
                      #    define getpagesize() EXEC_PAGESIZE
@@ -11300,7 +11304,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_func_fseeko=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      { echo "$as_me:$LINENO: result: $ac_cv_func_fseeko" >&5
@@ -11696,7 +11700,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_z_inflateEnd=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -11786,7 +11790,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_z_inflateEnd=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -11888,7 +11892,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_bz2_bzReadOpen=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -12146,7 +12150,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_sn_sigscan_sn_sigscan_initdb=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -12402,7 +12406,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_resolv___dn_expand=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -12484,7 +12488,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_resolv_dn_expand=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -12915,7 +12919,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_gmp___gmpz_init=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -12999,7 +13003,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_gmp_mpz_init=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -13906,7 +13910,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_milter_mi_stop=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -13993,7 +13997,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext
                        if test "${ac_cv_search_strlcpy+set}" = set; then
                        break
@@ -14088,7 +14092,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	ac_cv_lib_milter_mi_stop=no
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      LIBS=$ac_check_lib_save_LIBS
                      fi
@@ -14407,7 +14411,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
                      	eval "$as_ac_var=no"
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                      ac_res=`eval echo '${'$as_ac_var'}'`
@@ -14898,11 +14902,11 @@ echo "${ECHO_T}no" >&6; }
                                      LIBS=$save_LIBS
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      fi
                     -rm -f core conftest.err conftest.$ac_objext \
                     +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
                            conftest$ac_exeext conftest.$ac_ext
                      else
                        have_wrappers=no
@@ -15717,10 +15721,10 @@ main ()
                      #ifndef __cplusplus
                        /* Ultrix mips cc rejects this.  */
                        typedef int charset[2];
                     -  const charset x;
                     +  const charset cs;
                        /* SunOS 4.1.1 cc rejects this.  */
                     -  char const *const *ccp;
                     -  char **p;
                     +  char const *const *pcpcc;
                     +  char **ppc;
                        /* NEC SVR4.0.2 mips cc rejects this.  */
                        struct point {int x, y;};
                        static struct point const zero = {0,0};
@@ -15729,11 +15733,11 @@ main ()
                           an arm of an if-expression whose if-part is not a constant
                           expression */
                        const char *g = "string";
                     -  ccp = &g + (g ? g-g : 0);
                     +  pcpcc = &g + (g ? g-g : 0);
                        /* HPUX 7.0 cc rejects these. */
                     -  ++ccp;
                     -  p = (char**) ccp;
                     -  ccp = (char const *const *) p;
                     +  ++pcpcc;
                     +  ppc = (char**) pcpcc;
                     +  pcpcc = (char const *const *) ppc;
                        { /* SCO 3.2v4 cc rejects this.  */
                          char *t;
                          char const *s = 0 ? (char *) 0 : (char const *) 0;
@@ -15760,7 +15764,7 @@ main ()
                          const int foo = 10;
                          if (!foo) return 0;
                        }
                     -  return !x[0] && !zero.x;
                     +  return !cs[0] && !zero.x;
                      #endif
                        ;
@@ -15925,7 +15929,8 @@ cat >>conftest.$ac_ext <<_ACEOF
                      int
                      main ()
                      {
                     -#if !BYTE_ORDER || !BIG_ENDIAN || !LITTLE_ENDIAN
                     +#if  ! (defined BYTE_ORDER && defined BIG_ENDIAN && defined LITTLE_ENDIAN \
                     +	&& BYTE_ORDER && BIG_ENDIAN && LITTLE_ENDIAN)
                       bogus endian macros
                      #endif
@@ -17023,7 +17028,7 @@ exec 6>&1
                      # values after options handling.
                      ac_log="
                      This file was extended by $as_me, which was
                     -generated by GNU Autoconf 2.60.  Invocation command line was
                     +generated by GNU Autoconf 2.60a.  Invocation command line was
                        CONFIG_FILES    = $CONFIG_FILES
                        CONFIG_HEADERS  = $CONFIG_HEADERS
@@ -17052,7 +17057,7 @@ current configuration.
                      Usage: $0 [OPTIONS] [FILE]...
                        -h, --help       print this help, then exit
                     -  -V, --version    print version number, then exit
                     +  -V, --version    print version number and configuration settings, then exit
                        -q, --quiet      do not print progress messages
                        -d, --debug      don't remove temporary files
                            --recheck    update $as_me by reconfiguring in the same conditions
@@ -17076,7 +17081,7 @@ _ACEOF
                      cat >>$CONFIG_STATUS <<_ACEOF
                      ac_cs_version="\\
                      config.status
                     -configured by $0, generated by GNU Autoconf 2.60,
                     +configured by $0, generated by GNU Autoconf 2.60a,
                        with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
                      Copyright (C) 2006 Free Software Foundation, Inc.

clamav-devel/libclamav/Makefile.am

History View file @ bd912dd

@@ -142,6 +142,14 @@ libclamav_la_SOURCES = \
                      	uuencode.c \
                      	uuencode.h \
                      	pst.c \
                     -	pst.h
                     +	pst.h \
                     +	phishcheck.c \
                     +	phishcheck.h \
                     +	phish_domaincheck_db.c \
                     +	phish_domaincheck_db.h \
                     +	phish_whitelist.c \
                     +	phish_whitelist.h \
                     +	regex_list.c \
                     +	regex_list.h
                      lib_LTLIBRARIES = libclamav.la

clamav-devel/libclamav/Makefile.in

History View file @ bd912dd

@@ -86,7 +86,8 @@ am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \
                      	packlibs.lo fsg.lo line.lo untar.lo unzip.lo special.lo \
                      	binhex.lo is_tar.lo tnef.lo unrar15.lo unrarvm.lo unrar.lo \
                      	unrarfilter.lo unrarppm.lo unrar20.lo unrarcmd.lo pdf.lo \
                     -	spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo
                     +	spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo phishcheck.lo \
                     +	phish_domaincheck_db.lo phish_whitelist.lo regex_list.lo
                      libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
                      DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
                      depcomp = $(SHELL) $(top_srcdir)/depcomp
@@ -341,7 +342,15 @@ libclamav_la_SOURCES = \
                      	uuencode.c \
                      	uuencode.h \
                      	pst.c \
                     -	pst.h
                     +	pst.h \
                     +	phishcheck.c \
                     +	phishcheck.h \
                     +	phish_domaincheck_db.c \
                     +	phish_domaincheck_db.h \
                     +	phish_whitelist.c \
                     +	phish_whitelist.h \
                     +	regex_list.c \
                     +	regex_list.h
                      lib_LTLIBRARIES = libclamav.la
                      all: all-am
@@ -440,10 +449,14 @@ distclean-compile:
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pdf.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pe.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/petite.Plo@am__quote@
                     +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_domaincheck_db.Plo@am__quote@
                     +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_whitelist.Plo@am__quote@
                     +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phishcheck.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pst.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/qtmd.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/readdb.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rebuildpe.Plo@am__quote@
                     +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex_list.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scanners.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sis.Plo@am__quote@
                      @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/snprintf.Plo@am__quote@

clamav-devel/libclamav/htmlnorm.h

History View file @ bd912dd

@@ -23,8 +23,10 @@
                      typedef struct tag_arguments_tag
                      {
                              int count;
                     +	int scanContents;
                              unsigned char **tag;
                              unsigned char **value;
                     +	struct blob   **contents;
                      } tag_arguments_t;
                      int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs);

clamav-devel/libclamav/iana_tld.h

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,28 @@
                     +/*
                     + *  Phishing module: iana tld list.
                     + *
                     + *  Copyright (C) 2006 Torok Edvin <edwintorok@gmail.com>
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + */
                     +
                     +#ifndef IANA_TLD_H
                     +#define IANA_TLD_H
                     +#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNRWYZ]|L[ABCIKRSTUVY]|M[ACDGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|AERO|ARPA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)"
                     +#define iana_cctld "(a[dfilmoqrtuwxz]|b[bdeghijmorstwyz]|c[ahlmnosuy]|d[ejkmz]|e[cegrstu]|f[ijr]|g[abdeghilmnprtuwy]|h[nrtu]|i[delnqst]|j[emop]|k[eghimwz]|l[birstuv]|m[acglmnoqrstuvwxyz]|n[aegilopru]|om|p[aehkltwy]|qa|r[ow]|s[cdeginorz]|t[dghjklmnorvwz]|u[agyz]|v[enu]|ws|y[etu])"
                     +#endif
                     +

clamav-devel/libclamav/phish_domaincheck_db.c

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,129 @@
                     +/*
                     + *  Phishing module: domain list implementation.
                     + *
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + *  $Log: phish_domaincheck_db.c,v $
                     + *  Revision 1.1  2006/09/12 19:38:39  acab
                     + *  Phishing module merge - libclamav
                     + *
                     + *  Revision 1.3  2006/08/20 21:18:11  edwin
                     + *  Added the script used to generate iana_tld.sh
                     + *  Added checks for phish_domaincheck_db
                     + *  Added phishing module design document from wiki (as discussed with aCaB).
                     + *  Updated .wdb/.pdb format documentation (in regex_list.c)
                     + *  Fixed some memory leaks in regex_list.c
                     + *  IOW: cleanups before the deadline.
                     + *  I consider my module to be ready for evaluation now.
                     + *
                     + *  Revision 1.2  2006/08/09 16:26:44  edwin
                     + *  Forgot to add these files
                     + *
                     + */
+                    +
+                    +
                     +#if HAVE_CONFIG_H
                     +#include "clamav-config.h"
                     +#endif
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef CL_DEBUG
                     +#define NDEBUG
                     +#endif
+                    +
                     +#ifdef CL_THREAD_SAFE
                     +#ifndef _REENTRANT
                     +#define _REENTRANT
                     +#endif
                     +#endif
+                    +
                     +#include <stdio.h>
                     +#include <stdlib.h>
                     +#include <errno.h>
                     +#include <assert.h>
                     +#include <string.h>
                     +#include <strings.h>
                     +#include <ctype.h>
+                    +
                     +#include <limits.h>
                     +#include "clamav.h"
                     +#include <sys/types.h>
+                    +
                     +/*#define USE_PCRE*/
                     +#include <regex.h>
+                    +
                     +#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
                     +#include <stddef.h>
                     +#endif
+                    +
                     +#include "others.h"
                     +#include "defaults.h"
                     +#include "str.h"
                     +#include "filetypes.h"
                     +#include "mbox.h"
                     +#include "phish_domaincheck_db.h"
                     +#include "regex_list.h"
                     +#include "matcher-ac.h"
+                    +
+                    +
                     +static struct regex_matcher domainlist_matcher;
+                    +
                     +/*
                     + * Match a (real URL, displayed URL) pair against the domain list.
                     + *
                     + * real_url, display_url, hostOnly are handed to regex_list_match() unchanged.
                     + * flags: in/out; when the matching entry carries exactly three hex digits,
                     + *        the bits named by that value are cleared from *flags.
                     + *
                     + * Returns whatever regex_list_match() returned.
                     + */
                     +int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags)
                     +{
                     +	/* start from NULL so that a matcher which leaves info untouched is never dereferenced */
                     +	const char* info = NULL;
                     +	int rc = regex_list_match(&domainlist_matcher,real_url,display_url,hostOnly,&info);
                     +	if(rc && info && info[0]) {/* match successful, and has custom flags */
                     +		unsigned short notwantedflags=0;
                     +		/* the ctype functions require a value representable as unsigned char */
                     +		if(strlen(info)==3 &&
                     +		   isxdigit((unsigned char)info[0]) &&
                     +		   isxdigit((unsigned char)info[1]) &&
                     +		   isxdigit((unsigned char)info[2]) &&
                     +		   sscanf(info,"%hx",&notwantedflags)==1) {
                     +			*flags &= ~notwantedflags;/* filter unwanted phishcheck flags */
                     +		}
                     +		else {
                     +			cli_warnmsg("Phishcheck:Unknown flag format in domainlist, 3 hex digits expected\n");
                     +		}
                     +	}
                     +	return rc;
                     +}
+                    +
                     +/* Initialise the module-wide domain list matcher; the status of init_regex_list() is passed through. */
                     +int init_domainlist(void)
                     +{
                     +	const int status = init_regex_list(&domainlist_matcher);
                     +	return status;
                     +}
+                    +
                     +/* Report the state of the domain list matcher, exactly as is_regex_ok() reports it. */
                     +int is_domainlist_ok(void)
                     +{
                     +	const int ok = is_regex_ok(&domainlist_matcher);
                     +	return ok;
                     +}
+                    +
                     +/* Load domain list entries from the open stream fd into the domain list matcher.
                     + * options is forwarded untouched; the result of load_regex_matcher() is returned. */
                     +int cli_loadpdb(FILE* fd,unsigned int options)
                     +{
                     +	struct regex_matcher* const matcher = &domainlist_matcher;
                     +	return load_regex_matcher(matcher,fd,options);
                     +}
+                    +
                     +/* Run regex_list_cleanup() on the domain list matcher. */
                     +void domainlist_cleanup(void)
                     +{
                     +	struct regex_matcher* const matcher = &domainlist_matcher;
                     +	regex_list_cleanup(matcher);
                     +}
+                    +
                     +/* Run regex_list_done() on the domain list matcher. */
                     +void domainlist_done(void)
                     +{
                     +	struct regex_matcher* const matcher = &domainlist_matcher;
                     +	regex_list_done(matcher);
                     +}
+                    +
                     +#endif

clamav-devel/libclamav/phish_domaincheck_db.h

History View file @ bd912dd

@@ -19,6 +19,9 @@
                       *  MA 02110-1301, USA.
+                      *
                       */
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                      #ifndef _PHISH_DOMAINCHECK_DB_H
                      #define _PHISH_DOMAINCHECK_DB_H
@@ -31,3 +34,5 @@ int is_domainlist_ok(void);
                      int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags);
                      #endif
+                    +
                     +#endif

clamav-devel/libclamav/phish_whitelist.c

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,157 @@
                     +/*
                     + *  Phishing module: whitelist implementation.
                     + *
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + *  $Log: phish_whitelist.c,v $
                     + *  Revision 1.1  2006/09/12 19:38:39  acab
                     + *  Phishing module merge - libclamav
                     + *
                     + *  Revision 1.16  2006/08/06 20:27:07  edwin
                     + *  New option to enable phish scan for all domains (disabled by default).
                     + *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
                     + *  Updated phishcheck control flow to better incorporate the domainlist.
                     + *  Updated manpage with new options.
                     + *
                     + *  TODO:there is a still-reachable leak in regex_list.c
                     + *
                     + *  Revision 1.15  2006/07/31 20:12:30  edwin
                     + *  Preliminary support for domain databases (domains to check by phishmodule)
                     + *  Better memory allocation failure handling in regex_list
                     + *
                     + */
+                    +
                     +#if HAVE_CONFIG_H
                     +#include "clamav-config.h"
                     +#endif
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef CL_DEBUG
                     +#define NDEBUG
                     +#endif
+                    +
                     +#ifdef CL_THREAD_SAFE
                     +#ifndef _REENTRANT
                     +#define _REENTRANT
                     +#endif
                     +#endif
+                    +
                     +#include <stdio.h>
                     +#include <stdlib.h>
                     +#include <errno.h>
                     +#include <assert.h>
                     +#include <string.h>
                     +#include <strings.h>
                     +#include <ctype.h>
+                    +
                     +#include <limits.h>
                     +#include "clamav.h"
                     +#include <sys/types.h>
+                    +
                     +/*#define USE_PCRE*/
                     +#include <regex.h>
+                    +
                     +#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
                     +#include <stddef.h>
                     +#endif
+                    +
                     +#include "others.h"
                     +#include "defaults.h"
                     +#include "str.h"
                     +#include "filetypes.h"
                     +#include "mbox.h"
                     +#include "phish_whitelist.h"
                     +#include "regex_list.h"
                     +#include "matcher-ac.h"
+                    +
+                    +
                     +static struct regex_matcher whitelist_matcher;
+                    +
                     +/* Match a (real URL, displayed URL) pair against the whitelist.
                     + * The per-entry info string produced by regex_list_match() is ignored;
                     + * only its return value is handed back. */
                     +int whitelist_match(const char* real_url,const char* display_url,int hostOnly)
                     +{
                     +	const char* ignored_info;/* required by the callee, never read here */
                     +	const int rc = regex_list_match(&whitelist_matcher,real_url,display_url,hostOnly,&ignored_info);
                     +	return rc;
                     +}
+                    +
                     +/* Initialise the module-wide whitelist matcher; the status of init_regex_list() is passed through. */
                     +int init_whitelist(void)
                     +{
                     +	const int status = init_regex_list(&whitelist_matcher);
                     +	return status;
                     +}
+                    +
                     +/* Report the state of the whitelist matcher, exactly as is_regex_ok() reports it. */
                     +int is_whitelist_ok(void)
                     +{
                     +	const int ok = is_regex_ok(&whitelist_matcher);
                     +	return ok;
                     +}
+                    +
                     +/* Load whitelist entries from the open stream fd into the whitelist matcher.
                     + * options is forwarded untouched; the result of load_regex_matcher() is returned. */
                     +int cli_loadwdb(FILE* fd,unsigned int options)
                     +{
                     +	struct regex_matcher* const matcher = &whitelist_matcher;
                     +	return load_regex_matcher(matcher,fd,options);
                     +}
+                    +
                     +/* Run regex_list_cleanup() on the whitelist matcher. */
                     +void whitelist_cleanup(void)
                     +{
                     +	struct regex_matcher* const matcher = &whitelist_matcher;
                     +	regex_list_cleanup(matcher);
                     +}
+                    +
                     +/* Run regex_list_done() on the whitelist matcher. */
                     +void whitelist_done(void)
                     +{
                     +	struct regex_matcher* const matcher = &whitelist_matcher;
                     +	regex_list_done(matcher);
                     +}
+                    +
                     +/*
                     + * Stand-alone test driver.
                     + * It is compiled only when WHITELIST_TEST is defined on the compiler command
                     + * line (e.g. -DWHITELIST_TEST), so that a regular library build does not
                     + * define a main() symbol of its own.
                     + * The experiments below are kept commented out for reference.
                     + */
                     +#ifdef WHITELIST_TEST
                     +int main(int argc,char* argv[])
                     +{
                     +	(void)argc;/* unused */
                     +	(void)argv;/* unused */
                     +/*	struct tree_node* root=tree_node_alloc(NULL,1);
                     +	const  char* info;
                     +	const  unsigned char test[]="tesxt";
                     +	setup_matcher();
                     +	root->op=OP_ROOT;
                     +	root->c=0;
                     +	root->next=NULL;
                     +	root->listend=1;
                     +	dump_tree(root);
                     +	add_pattern(&root,"test","1");
                     +	dump_tree(root);
                     +	add_pattern(&root,"tesv","2");
                     +	dump_tree(root);
                     +	add_pattern(&root,"tert","3");
                     +	dump_tree(root);
                     +	add_pattern(&root,"terr+","4");
                     +	dump_tree(root);
                     +	add_pattern(&root,"tes[xy]t","5");
                     +	dump_tree(root);
                     +	match_node(root,test,sizeof(test),&info);
                     +	destroy_tree(root);
                     +	if(info)
                     +		printf("%s\n",info);
                     +	else printf("not found\n");*/
                     +	/*FILE* f=fopen("w.wdb","r");
                     +	init_whitelist();
                     +	load_whitelist(f);
                     +	fclose(f);
                     +	dump_tree(root_regex);
                     +	build_whitelist();
                     +	printf("%d\n",whitelist_match("http://www.google.ro","http://www.google.me.ro",0));
                     +	whitelist_done();*/
                     +	return 0;
                     +}
                     +#endif
+                    +
                     +#endif

clamav-devel/libclamav/phish_whitelist.h

History View file @ bd912dd

@@ -20,8 +20,10 @@
+                      *
                       */
                     -#ifndef _WHITELIST_H
                     -#define _WHITELIST_H
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef _PHISH_WHITELIST_H
                     +#define _PHISH_WHITELIST_H
                      int cli_loadwdb(FILE* fd, unsigned int options);
                      int build_whitelist(void);
@@ -32,3 +34,5 @@ int is_whitelist_ok(void);
                      int whitelist_match(const char* real_url,const char* display_url,int hostOnly);
                      #endif
+                    +
                     +#endif

clamav-devel/libclamav/phishcheck.c

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,1258 @@
                     +/*
                     + *  Detect phishing, based on URL spoofing detection.
                     + *
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + *  $Log: phishcheck.c,v $
                     + *  Revision 1.1  2006/09/12 19:38:39  acab
                     + *  Phishing module merge - libclamav
                     + *
                     + *  Revision 1.28  2006/09/09 09:49:27  edwin
                     + *  Fix Solaris compilation problem
                     + *
                     + *  Revision 1.27  2006/08/28 08:43:06  edwin
                     + *  Fixed a few minor leaks.
                     + *  Valgrind now says:"All heap blocks were freed -- no leaks are possible"
                     + *
                     + *  Revision 1.26  2006/08/20 21:18:11  edwin
                     + *  Added the script used to generate iana_tld.sh
                     + *  Added checks for phish_domaincheck_db
                     + *  Added phishing module design document from wiki (as discussed with aCaB).
                     + *  Updated .wdb/.pdb format documentation (in regex_list.c)
                     + *  Fixed some memory leaks in regex_list.c
                     + *  IOW: cleanups before the deadline.
                     + *  I consider my module to be ready for evaluation now.
                     + *
                     + *  Revision 1.25  2006/08/19 21:08:47  edwin
                     + *  Fixed:Forgot to add form tag handling when it contains images.
                     + *  Various fixes to get rid of gcc warnings.
                     + *
                     + *  Revision 1.24  2006/08/19 13:30:34  edwin
                     + *  iana_tld.h was missing from the list of header files.
                     + *  commentedout network code (unused currently)
                     + *
                     + *  Revision 1.23  2006/08/17 20:31:43  edwin
                     + *  Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off.
                     + *  Fix a still reachable leak. Remove unneeded build_regex_list export.
                     + *
                     + *  Revision 1.22  2006/08/12 14:35:34  edwin
                     + *  Fix some compiler warnings.
                     + *  Fix an assertion failure in regex_list.
                     + *  Interpret display links that start with http|https|ftp, always as an URL.
                     + *
                     + *  Revision 1.21  2006/08/06 20:27:07  edwin
                     + *  New option to enable phish scan for all domains (disabled by default).
                     + *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
                     + *  Updated phishcheck control flow to better incorporate the domainlist.
                     + *  Updated manpage with new options.
                     + *
                     + *  TODO:there is a still-reachable leak in regex_list.c
                     + *
                     + *  Revision 1.20  2006/08/01 20:19:14  edwin
                     + *  Integrate domainlist check into phishcheck. Warning: enabled by default.
                     + *  Regex bracket handling update.
                     + *  Better regex paranthesized & alternate expression handling.
                     + *
                     + *
                     + *  Revision 1.19  2006/07/31 20:12:30  edwin
                     + *  Preliminary support for domain databases (domains to check by phishmodule)
                     + *  Better memory allocation failure handling in regex_list
                     + *
                     + */
+                    +
                     +#if HAVE_CONFIG_H
                     +#include "clamav-config.h"
                     +#endif
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef CL_DEBUG
                     +#define NDEBUG
                     +#endif
+                    +
                     +#ifdef CL_THREAD_SAFE
                     +#ifndef _REENTRANT
                     +#define _REENTRANT
                     +#endif
                     +#endif
+                    +
                     +#include <stdio.h>
                     +#include <stdlib.h>
                     +#include <errno.h>
                     +#include <assert.h>
                     +#include <string.h>
                     +#include <strings.h>
                     +#include <ctype.h>
                     +#include <limits.h>
                     +#include <clamav.h>
                     +#include <netdb.h>
                     +#include <netinet/in.h>
+                    +
                     +#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
                     +#include <stddef.h>
                     +#endif
+                    +
                     +#include <sys/types.h>
                     +#include <sys/socket.h>
                     +#include <regex.h>
+                    +
                     +#include "others.h"
                     +#include "defaults.h"
                     +#include "str.h"
                     +#include "filetypes.h"
                     +#include "mbox.h"
                     +#include "htmlnorm.h"
                     +#include "phishcheck.h"
                     +#include "phish_whitelist.h"
                     +#include "phish_domaincheck_db.h"
                     +#include "iana_tld.h"
+                    +
                     +#define DOMAIN_REAL 1
                     +#define DOMAIN_DISPLAY 0
+                    +
                     +#define PHISHY_USERNAME_IN_URL 1
                     +#define PHISHY_NUMERIC_IP      2
                     +#define REAL_IS_MAILTO         4
                     +/* this is just a flag, so that the displayed url will be parsed as mailto too, for example
                     + * <a href='mailto:somebody@yahoo.com'>to:somebody@yahoo.com</a>*/
                     +#define DOMAIN_LISTED          8
                     +#define PHISHY_CLOAKED_NULL    16
                     +#define PHISHY_HEX_URL         32
+                    +
+                    +
                     +/*
                     +* Phishing design documentation,
                     +(initially written at http://wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
+                    +
                     +*Warning*: if flag *--phish-scan-alldomains* (or equivalent clamd/clamav-milter config option) isn't given, then phishing scanning is done only for domains listed in daily.pdb.
                     +If your daily.pdb is empty, then by default NO PHISHING is DONE, UNLESS you give the *--phish-scan-alldomains*
                     +This is just a side-effect: daily.pdb is empty because it isn't yet officially in daily.cvd.
+                    +
                     +phishingCheck() determines if @displayedLink is  a legit representation of @realLink.
+                    +
                     +Steps:
+                    +
                     +1. if _realLink_ *==* _displayLink_ => *CLEAN*
+                    +
                     +2. url cleanup (normalization)
                     +- whitespace elimination
                     +- html entity conversion
                     +- convert hostname to lowercase
                     +- normalize \ to /
                     +If there is a dot after the last space, then all spaces are replaced with dots,
                     +otherwise spaces are stripped.
                     +So both: 'Go to yahoo.com', and 'Go to e b a y . c o m', and 'Go to ebay. com' will work.
+                    +
+                    +
                     +3. Match the URLs against a _whitelist_:
                     +a _realLink_, _displayedLink_ pair is matched against the _whitelist_.
                     +the _whitelist_ is a list of pairs of realLink, displayedLink. Any of the elements of those pairs can be a _regex_.
                     + if url *is found* in _whitelist_ --> *CLEAN*
+                    +
                     +4. URL is looked up in the _domainlist_, unless disabled via flags (_--phish-scan-alldomains_).
                     +The _domainlist_ is a list of pairs of realLink, displayedLink (any of which can be regex).
                     +This is the list of domains we do phishing detection for (such as ebay,paypal,chase,....)
                     +We can't decide to stop processing here or not, so we just set a flag.
+                    +
                     +Note(*!*): the flags are modified by the domainlist checker. If domain is found, then the flags associated with it filter the default compile-time flags.
+                    +
                     +5. _Hostname_ is extracted from the _displayed URL_.
                     +It is checked against the _whitelist_, and _domainlist_.
+                    +
                     +6. Now we know if we want to stop processing.
                     +If we are only scanning domains in the _domainlist_ (default behaviour), and the url/domain
                     +isn't found in it, we return (and mark url as not_list/clean).
                     +If we scan all domains, then the domainlist isn't even checked.
+                    +
                     +7. URL cloak check.
                     +check for %00, and hex-encoded IPs in URL.
+                    +
                     +8. Skip empty displayedURLs
+                    +
                     +9. SSL mismatch detection.
                     +Checks if realLink is http, but displayedLink is https or viceversa.
                     +(by default the SSL detection is done for hrefs only, not for imgs)
+                    +
                     +10. Hostname of real URL is extracted.
+                    +
                     +11. Skip cid: displayedLink urls (images embedded in mails).
+                    +
                     +12. Numeric IP detection.
                     +If url is a numeric IP, then -> phish.
                     +Maybe we should do DNS lookup?
                     +Maybe we should disable numericIP checks for --phish-scan-alldomains?
+                    +
                     +13. isURL(displayedLink).
                     +Checks if displayedLink is really a url.
                     +if not -> clean
+                    +
                     +14. Hostnames of real, displayedLink are compared. If equal -> clean
+                    +
                     +15. Extract domain names, and compare. If equal -> clean
+                    +
                     +16. Do DNS lookups/reverse lookups. Disabled now (too much load/too many lookups). *
+                    +
                     +For the Whitelist(.wdb)/Domainlist(.pdb) format see regex_list.c (search for Flags)
                     + *
                     + */
                     +static char empty_string[]="";
+                    +
                     +/* Put both links of a url_check into their initial state:
                     + * data points at the shared empty string, no parent reference, reference count 0. */
                     +void url_check_init(struct url_check* urls)
                     +{
                     +	struct string* const real  = &urls->realLink;
                     +	struct string* const shown = &urls->displayLink;
                     +	real->data     = shown->data     = empty_string;
                     +	real->ref      = shown->ref      = NULL;
                     +	real->refcount = shown->refcount = 0;
                     +}
+                    +
                     +/* string reference counting implementation,
                     + * so that we don't have to keep in mind who allocated what and when it needs to be freed,
                     + * and thus we won't leak memory */
+                    +
                     +/*
                     + * Drop one reference from str.
                     + * When the count reaches zero and str borrows its data from another string
                     + * (str->ref), the reference held on that string is dropped in turn; otherwise
                     + * the data buffer is released.
                     + * The shared static empty_string is never passed to free(), whatever the
                     + * reference count says.
                     + */
                     +inline void string_free(struct string* str)
                     +{
                     +	for(;;){
                     +		str->refcount--;
                     +		if(str->refcount)
                     +			break;/* still referenced, or a count that never reaches zero */
                     +		if(str->ref) {
                     +			/* don't free, this is a portion of another string: release that one instead */
                     +			str=str->ref;
                     +			continue;
                     +		}
                     +		if(str->data != empty_string)
                     +			free(str->data);
                     +		break;
                     +	}
                     +}
+                    +
                     +/*
                     + * Make dest share the data of src.
                     + * Always use this when assigning to a string: it releases dest's previous
                     + * contents and records a reference on src.
                     + * Assigning a string to itself is a no-op.
                     + */
                     +void string_assign(struct string* dest,struct string* src)
                     +{
                     +	if(dest==src)
                     +		return;/* releasing and then re-referencing the same object would leave it pointing at itself */
                     +	/* take the new reference before dropping the old one, so that src survives
                     +	 * even when dest currently holds the last reference to it */
                     +	src->refcount++;
                     +	string_free(dest);
                     +	dest->data=src->data;
                     +	dest->refcount=1;
                     +	dest->ref=src;
                     +}
+                    +
                     +/* Release dest's previous contents, then let dest own the C buffer data:
                     + * no parent string, reference count 1, so data is freed together with dest. */
                     +void string_assign_c(struct string* dest,char* data)
                     +{
                     +	string_free(dest);
                     +	dest->ref      = NULL;
                     +	dest->refcount = 1;
                     +	dest->data     = data;
                     +}
+                    +
                     +/*
                     + * Initialise dest to own the C buffer data, without releasing any previous
                     + * contents: use only for initialization.
                     + * A NULL data pointer is replaced by a pointer to the shared empty string;
                     + * in that case the reference count is set to -1 (as string_assign_null does)
                     + * so that string_free never reaches zero and never frees the static buffer.
                     + */
                     +inline void string_init_c(struct string* dest,char* data)
                     +{
                     +	if(data) {
                     +		dest->refcount = 1;
                     +		dest->data = data;
                     +	} else {
                     +		dest->refcount = -1;/* don't free it! */
                     +		dest->data = empty_string;
                     +	}
                     +	dest->ref = NULL;
                     +}
+                    +
                     +/*
                     + * Replace dest with a freshly allocated, NUL-terminated copy of the
                     + * characters in [start, end).
                     + * If the allocation fails, dest is reset to the shared empty string
                     + * instead of writing through a NULL pointer.
                     + */
                     +inline void string_assign_dup(struct string* dest,const char* start,const char* end)
                     +{
                     +	const size_t len = end-start;
                     +	char*	     ret = cli_malloc(len+1);
                     +	if(ret) {
                     +		/* copy before releasing dest: start/end may point into dest's own data */
                     +		strncpy(ret,start,len);
                     +		ret[len]='\0';
                     +	}
                     +	string_free(dest);
                     +	if(!ret) {
                     +		dest->data=empty_string;
                     +		dest->refcount=-1;/* don't free it! */
                     +		dest->ref=NULL;
                     +		return;
                     +	}
                     +	dest->data=ret;
                     +	dest->refcount=1;
                     +	dest->ref=NULL;
                     +}
+                    +
                     +/* Release dest's previous contents and point it at the shared empty string.
                     + * The reference count is set to -1 so that the static buffer is not freed. */
                     +inline void string_assign_null(struct string* dest)
                     +{
                     +	string_free(dest);
                     +	dest->ref      = NULL;
                     +	dest->refcount = -1;/* don't free it! */
                     +	dest->data     = empty_string;
                     +}
+                    +
                     +/*
                     + * Make dest a view onto part of another string: dest->data is set to data and
                     + * a reference is recorded on ref, so that releasing dest later releases ref
                     + * rather than freeing data itself.
                     + */
                     +void string_assign_ref(struct string* dest,struct string* ref,char* data)
                     +{
                     +	/* take the new reference before dropping the old one, so that ref survives
                     +	 * even when dest currently holds the last reference to it */
                     +	ref->refcount++;
                     +	string_free(dest);
                     +	dest->data=data;
                     +	dest->refcount=1;
                     +	dest->ref=ref;
                     +}
+                    +
                     +/* Drop the references held by both links of url: first realLink, then displayLink. */
                     +inline void free_if_needed(struct url_check* url)
                     +{
                     +	struct string* const real  = &url->realLink;
                     +	struct string* const shown = &url->displayLink;
                     +	string_free(real);
                     +	string_free(shown);
                     +}
+                    +
                     +static int phish_disabled = 0;/* disabled due to fatal startup error */
                     +/*
                     + * Allocate and compile an extended, case-insensitive POSIX regex.
                     + *
                     + * preg:  receives the newly allocated regex_t, or NULL on failure.
                     + * regex: pattern text.
                     + * nosub: non-zero adds REG_NOSUB to the compile flags.
                     + *
                     + * Returns 0 on success. On any failure (allocation or compilation) it logs an
                     + * error, sets phish_disabled and returns 1.
                     + */
                     +static int build_regex(regex_t** preg,const char* regex,int nosub)
                     +{
                     +	int rc;
                     +	*preg = cli_malloc(sizeof(**preg));
                     +	if(!*preg) {
                     +		/* regcomp must not be handed a NULL regex_t */
                     +		cli_errmsg("Out of memory while compiling regex\nDisabling phishing checks\n");
                     +		phish_disabled=1;
                     +		return 1;
                     +	}
                     +	cli_dbgmsg("Compiling regex:%s\n",regex);
                     +	rc = regcomp(*preg,regex,REG_EXTENDED|REG_ICASE|(nosub ? REG_NOSUB :0));
                     +	if(rc) {
                     +		/* first call only asks how large the message buffer has to be */
                     +		size_t buflen =	regerror(rc,*preg,NULL,0);
                     +		char*  errbuf = cli_malloc(buflen);
                     +		if(errbuf) {
                     +			regerror(rc,*preg,errbuf,buflen);
                     +			cli_errmsg("Error in compiling regex:%s\nDisabling phishing checks\n",errbuf);
                     +			free(errbuf);
                     +		}
                     +		else {
                     +			cli_errmsg("Error in compiling regex (error code %d)\nDisabling phishing checks\n",rc);
                     +		}
                     +		free(*preg);
                     +		*preg=NULL;
                     +		phish_disabled=1;
                     +		return 1;
                     +	}
                     +	return 0;
                     +}
+                    +
+                    +
                     +/*static regex_t* host_preg = NULL;
                     +static const char* host_regex="cid:.+|mailto:(.+)|([[:alpha:]]+://)?(([^:/?]+@)+([^:/?]+)([:/?].+)?|([^@:/?]+)([:/?].+)?)"; <- this is slower than the function below
                     +*/
                     +/*
                     + * Extracts the host part of @URL into @dest (allocates memory).
                     + * @isReal is non-zero for the real link and zero for the displayed one.
                     + * @phishy is both read and updated: REAL_IS_MAILTO is set when the real link
                     + * is a mailto: URL and consulted when the displayed link is parsed, and
                     + * PHISHY_USERNAME_IN_URL may be set while skipping "user@" parts.
                     + * A NULL @URL yields the empty string.
                     + */
                     +void get_host(struct string* dest,const char* URL,int isReal,int* phishy)
                     +{
                     +	const char mailto[] = "mailto:";
                     +	int ismailto = 0;
                     +	const char* start;
                     +	const char* end=NULL;
                     +	if(!URL) {
                     +		string_assign_null(dest);
                     +		return;
                     +	}
                     +	start = strstr(URL,"://");
                     +	if(!start) {
                     +		if(!strncmp(URL,mailto,sizeof(mailto)-1)) {
                     +			start = URL + sizeof(mailto)-1;
                     +			ismailto = 1;
                     +		}
                     +		else if (!isReal && (*phishy&REAL_IS_MAILTO)) {
                     +			/* it is not required to use mailto: in the displayed url, they might use to:, or whatever */
                     +			const char* past_nul = URL+strlen(URL)+1;
                     +			start = URL + strcspn(URL,": ")+1;
                     +			if (start==past_nul)
                     +				start = URL;/* neither ':' nor ' ' found: use the whole text */
                     +			ismailto = 1;
                     +		}
                     +		else {
                     +			/* cid: URLs are not special-cased here, they are handled in phishcheck */
                     +			start=URL;/*URL without protocol*/
                     +			if(isReal)
                     +				cli_dbgmsg("PH:Real URL without protocol:%s\n",URL);
                     +			else ismailto=2;/*no-protocol, might be mailto, @ is no problem*/
                     +		}
                     +	}
                     +	else start += 3;/* :// */
+                    +
                     +	if(!ismailto || !isReal) {
                     +		const char* realhost;
                     +		do {
                     +			end	 = start+strcspn(start,":/?");
                     +			realhost = strchr(start,'@');
                     +			if(start!=end && realhost>end) realhost = NULL;/*don't check beyond end of hostname*/
                     +			if(realhost) {
                     +				/* NOTE(review): tld points at the last '.' AFTER the '@' and the length
                     +				 * passed to isTLD() is the distance from the '@' to that dot - this looks
                     +				 * unlikely to ever match a TLD; confirm the intended check */
                     +				const char* tld = strrchr(realhost,'.');
                     +				if(tld && isTLD(tld,tld-realhost-1))
                     +					*phishy |= PHISHY_USERNAME_IN_URL;/* if the url contains a username that is there just to fool people,
                     +					like http://www.ebay.com@somevilplace.someevildomain.com/ */
                     +				start=realhost+1;/*skip the username*/
                     +			}
                     +		} while(realhost);/*skip over multiple @ characters, text following last @ character is the real host*/
                     +	}
                     +	else {
                     +		/* only reached when ismailto && isReal */
                     +		*phishy |= REAL_IS_MAILTO;
                     +	}
+                    +
                     +	if(!end)
                     +		end  = start+strcspn(start,":/?");/*especially important for mailto:somebody@yahoo.com?subject=...*/
+                    +
                     +	string_assign_dup(dest,start,end);
                     +}
+                    +
                     +static regex_t* preg = NULL;/* URL regex, compiled on first use by isURL() */
                     +static regex_t* preg_tld = NULL;/* compiled on first use by isTLD() */
                     +static regex_t* preg_cctld = NULL;/* compiled on first use by isCountryCode() */
                     +static regex_t* preg_numeric = NULL;/* compiled on first use by isNumericURL() */
+                    +
                     +static const char tld_regex[] = "^"iana_tld"$";/* anchored at both ends; iana_tld is presumably provided by iana_tld.h - TODO confirm */
                     +static const char cctld_regex[] = "^"iana_cctld"$";/* anchored at both ends; same assumption for iana_cctld */
+                    +
                     +/*
                     + * Matches @str against cctld_regex, compiling the regex on first use.
                     + * Returns -1 when the regex cannot be built, 0 for a NULL @str,
                     + * otherwise 1 on a match and 0 on no match.
                     + */
                     +int isCountryCode(const char* str)
                     +{
                     +	if(!preg_cctld && build_regex(&preg_cctld,cctld_regex,1))
                     +		return -1;
                     +	if(!str)
                     +		return 0;
                     +	return regexec(preg_cctld,str,0,NULL,0)==0;
                     +}
+                    +
                     +/*
                     + * Matches the first @len characters of @str against tld_regex, compiling the
                     + * regex on first use.
                     + * Returns 1 on a match, 0 on no match (also for a NULL @str, a negative @len
                     + * or when the temporary copy cannot be allocated) and -1 when the regex
                     + * cannot be built.
                     + */
                     +int isTLD(const char* str,int len)
                     +{
                     +	char* s;
                     +	int rc;
                     +	if (!str || len<0)
                     +		return 0;
                     +	/* build the regex before allocating, so a build failure cannot leak the copy */
                     +	if(!preg_tld && build_regex(&preg_tld,tld_regex,1))
                     +		return -1;
                     +	s = cli_malloc(len+1);
                     +	if(!s)
                     +		return 0;
                     +	strncpy(s,str,len);
                     +	s[len]='\0';
                     +	rc = !regexec(preg_tld,s,0,NULL,0);
                     +	free(s);
                     +	return rc;
                     +}
+                    +
                     +/*
                     + * memrchr isn't standard, so I use this.
                     + * Searches backwards for @c in start[0..len] - note that index @len itself is
                     + * examined too - and returns a pointer to the last occurrence, or NULL.
                     + * A NULL @start, or a @len of (size_t)-1 (what callers get when they compute
                     + * "offset-1" for offset 0), yields NULL.
                     + */
                     +char* rfind(char* start,char c,size_t len)
                     +{
                     +	size_t i;
                     +	if(!start || len==(size_t)-1)
                     +		return NULL;
                     +	/* count with an index, so no pointer before start is ever formed */
                     +	i = len+1;
                     +	while(i--) {
                     +		if(start[i]==c)
                     +			return start+i;
                     +	}
                     +	return NULL;
                     +}
+                    +
                     +/*
                     + * Stores the registrable domain of @host in @dest.
                     + * - no dot at all: @dest becomes @host itself
                     + * - the last label is a country code and the label before it is not a TLD
                     + *   (subdomain.domain.uk): the last two labels are used (domain.uk)
                     + * - otherwise one more level is kept (www.ebay.co.uk -> ebay.co.uk,
                     + *   www.sourceforge.net -> sourceforge.net)
                     + * Results that are a suffix of @host share its buffer via string_assign_ref().
                     + */
                     +void get_domain(struct string* dest,struct string* host)
                     +{
                     +	char* domain;
                     +	char* tld = strrchr(host->data,'.');
                     +	if(!tld) {
                     +		cli_dbgmsg("PH:What? A host without a tld? (%s)\n",host->data);
                     +		string_assign(dest,host);
                     +		return;
                     +	}
                     +	if(isCountryCode(tld+1)) {
                     +		const char* countrycode=tld+1;
                     +		/* with a leading dot there is nothing in front of it to search;
                     +		 * tld-host->data-1 would wrap around as a size_t */
                     +		tld = tld>host->data ? rfind(host->data,'.',tld-host->data-1) : NULL;
                     +		if(!tld) {
                     +			cli_dbgmsg("PH:Weird, a name with only 2 levels (%s)\n",host->data);
                     +			string_assign(dest,host);
                     +			return;
                     +		}
                     +		/* the label between the two dots starts at tld+1 and ends right before the
                     +		 * dot at countrycode-1, so it is countrycode-tld-2 characters long */
                     +		if(!isTLD(tld+1,countrycode-tld-2)) {
                     +			string_assign_ref(dest,host,tld+1);
                     +			return;/*it was a name like: subdomain.domain.uk, return domain.uk*/
                     +		}
                     +	}
                     +	/*we need to strip one more level, this is the actual domain*/
                     +	domain = tld>host->data ? rfind(host->data,'.',tld-host->data-1) : NULL;
                     +	if(!domain) {
                     +		string_assign(dest,host);
                     +		return;/* it was like sourceforge.net?*/
                     +	}
                     +	string_assign_ref(dest,host,domain+1);
                     +}
+                    +
+                    +
                     +/*
                     +int ip_reverse(struct url_check* urls,int isReal)
                     +{
                     +	const char* host = isReal ? urls->realLink.data : urls->displayLink.data;
                     +	struct hostent *he = gethostbyname (host);
                     +	if (he)
                     +	{
                     +		char *addr = 0;
                     +		switch (he->h_addrtype)
                     +		{
                     +			case AF_INET:
                     +			  addr = inet_ntoa (*(struct in_addr *) he->h_addr);
                     +			  break;
                     +		}
                     +		if (addr && strcmp (he->h_name, addr) == 0)
                     +		{
                     +			char *h_addr_copy = strdup (he->h_addr);
                     +			if (h_addr_copy == NULL)
                     +			    he = NULL;
                     +			else
                     +			{
                     +			      he = gethostbyaddr (h_addr_copy, he->h_length, he->h_addrtype);
                     +			      free (h_addr_copy);
                     +			}
                     +		}
                     +	     if (he)
                     +		string_assign_dup(isReal ? &urls->realLink : &urls->displayLink,he->h_name,he->h_name+strlen(he->h_name));
                     +    }
                     +    return 0;
                     +}
                     +* frees its argument, and allocates memory*
                     +void reverse_lookup(struct url_check* url,int isReal)
                     +{
                     +	ip_reverse(url,isReal);
                     +}
                     +*/
                     +/*
                     + * Returns 1 when @host is a dotted-quad IPv4 address (four decimal numbers in
                     + * the range 0-255 separated by dots and nothing else), 0 otherwise.
                     + * A NULL @host is not numeric.
                     + */
                     +int isNumeric(const char* host)
                     +{
                     +	size_t len;
                     +	int a,b,c,d,n=0;
                     +	if(!host)
                     +		return 0;
                     +	len = strlen(host);
                     +	/* 1.2.3.4 -> 7*/
                     +	/* 127.127.127.127 -> 15*/
                     +	if(len<7 || len>15)
                     +		return 0;
                     +	/* %d on its own would also accept signs and leading whitespace */
                     +	if(strspn(host,"0123456789.")!=len)
                     +		return 0;
                     +	/* %n is not counted in the return value; it reports how much was consumed */
                     +	if(sscanf(host,"%d.%d.%d.%d%n",&a,&b,&c,&d,&n)!=4 || (size_t)n!=len)
                     +		return 0;
                     +	return a<=255 && b<=255 && c<=255 && d<=255;
                     +}
+                    +
                     +/* Returns 1 when @URL starts with "https://" (case-sensitive), 0 otherwise or for NULL. */
                     +int isSSL(const char* URL)
                     +{
                     +	static const char scheme[]="https://";
                     +	if(!URL)
                     +		return 0;
                     +	return strncmp(scheme,URL,sizeof(scheme)-1)==0;
                     +}
+                    +
                     +static int hexinited=0;
                     +static short int hextable[256];
                     +/*
                     + * Combines the two characters src[0] and src[1] into one byte by looking both
                     + * up in hextable; init_hextable() must have run first (asserted).
                     + */
                     +static inline char hex2int(const unsigned char* src)
                     +{
                     +	int high,low;
                     +	assert(hexinited);
                     +	high = hextable[src[0]];
                     +	low  = hextable[src[1]];
                     +	return (high<<4) | low;
                     +}
+                    +
+                    +
                     +/*
                     + * Replaces %xx escapes by the character they encode, in place.
                     + * @begin: in/out, first character of the string; it is advanced by two when
                     + *         the string starts with an escape.
                     + * @end:   in/out, moved back by two for every escape decoded in the middle.
                     + *         cleanupURL() passes a pointer to the LAST character here.
                     + * NOTE(review): the loop condition below stops a few characters before @end,
                     + * so an escape at the very end of the string is left undecoded - confirm
                     + * whether that is intended before changing it.
                     + */
                     +static void str_hex_to_char(char** begin,const char** end)
                     +{
                     +	char* sbegin = *begin;
                     +	const char* str_end = *end;
                     +	assert(str_end>sbegin);
                     +	/* convert leading %xx, but only when both digits lie inside the string:
                     +	 * for a two character "%x" sbegin[2] would be the byte after the last one */
                     +	if (sbegin[0] == '%' && sbegin+2 <= str_end) {
                     +		sbegin[2] = hex2int((unsigned char*)sbegin+1);
                     +		sbegin += 2;
                     +	}
                     +	*begin = sbegin++;
                     +	while(sbegin+3 < str_end) {
                     +		/* inner loop: the decoded character is examined again, so "%2541" ends up as "A" */
                     +		while(sbegin+3<str_end && sbegin[0]=='%') {
                     +			const char* src = sbegin+3;
                     +			*sbegin = hex2int((unsigned char*)sbegin+1);
                     +			/* move string */
                     +			memmove(sbegin+1,src,str_end-src+1);
                     +			str_end -= 2;
                     +		}
                     +		sbegin++;
                     +	}
                     +	*end = str_end;
                     +}
                     +/* deletes @what from the string @begin.
                     + * @begin, @end: in/out pointers delimiting the string; both are updated.
                     + * @what_len: length of @what, excluding the terminating \0
                     + * Does nothing when fewer than @what_len characters lie between *begin and *end.
                     + * NOTE(review): the trailing check compares the @what_len characters that end
                     + * right before *end, i.e. it treats *end as one past the last character, while
                     + * cleanupURL() passes a pointer to the last character - confirm which is meant.
                     + */
                     +static void str_strip(char** begin,const char** end,const char* what,size_t what_len)
                     +{
                     +	char* sbegin = *begin;
                     +	const char* str_end = *end;
                     +	const char* str_end_what;
                     +	size_t cmp_len = what_len;
                     +	assert(str_end>sbegin);
                     +	if(str_end < sbegin + what_len)
                     +		return;
                     +	/* strip leading @what; cmp_len drops to 0 after the first match (it never
                     +	 * exceeds what_len), so at most one leading occurrence is removed */
                     +	while(cmp_len && !strncmp(sbegin,what,cmp_len)) {
                     +		sbegin += what_len;
                     +		if(cmp_len > what_len)
                     +			cmp_len -= what_len;
                     +		else cmp_len = 0;
                     +	}
                     +	/* strip trailing @what, repeatedly, by moving the end pointer back */
                     +	str_end_what = str_end - what_len;
                     +	while(str_end_what>sbegin && !strncmp(str_end_what,what,what_len)) {
                     +		str_end -= what_len;
                     +		str_end_what -= what_len;
                     +	}
                     +	*begin = sbegin++;/* the first remaining character is not examined below */
                     +	while(sbegin+what_len < str_end) {
                     +		/* remove every occurrence found at this position before moving on */
                     +		while(sbegin+what_len<str_end && !strncmp(sbegin,what,what_len)) {
                     +			const char* src = sbegin+what_len;
                     +			/* move string */
                     +			memmove(sbegin,src,str_end-src+1);
                     +			str_end -= what_len;
                     +		}
                     +		sbegin++;
                     +	}
                     +	*end = str_end;
                     +}
+                    +
                     +static const char dotnet[] = ".net";/* cleanupURL() empties URLs that start with one of these three */
                     +static const char adonet[] = "ado.net";
                     +static const char aspnet[] = "asp.net";
                     +static const char lt[]="&lt;";/* HTML entities that cleanupURL() strips */
                     +static const char gt[]="&gt;";
                     +static const size_t dotnet_len = sizeof(dotnet)-1;/* lengths exclude the terminating \0 */
                     +static const size_t adonet_len = sizeof(adonet)-1;
                     +static const size_t aspnet_len = sizeof(aspnet)-1;
                     +static const size_t lt_len = sizeof(lt)-1;
                     +static const size_t gt_len = sizeof(gt)-1;
+                    +
                     +/* overwrites every @c found in [str,end) with @r; the character at @end is not touched */
                     +static inline void str_replace(char* str,const char* end,char c,char r)
                     +{
                     +	char* cur = str;
                     +	while(cur<end) {
                     +		if(*cur==c)
                     +			*cur=r;
                     +		cur++;
                     +	}
                     +}
                     +/* converts the first @len characters of @str to lowercase, in place */
                     +static inline void str_make_lowercase(char* str,size_t len)
                     +{
                     +	for(;len;str++,len--) {
                     +		/* tolower() is only defined for values representable as unsigned char (and EOF),
                     +		 * so a plain char with the high bit set must not be passed directly */
                     +		*str = (char)tolower((unsigned char)*str);
                     +	}
                     +}
+                    +
                     +#define fix32(x) ((x)<32 ? 32 : (x))
                     +/*
                     + * Clears bit 7 of every character up to the terminating \0 and turns every
                     + * result below 32 into a space (32).
                     + */
                     +static inline void clear_msb(char* begin)
                     +{
                     +	char* p = begin;
                     +	while(*p) {
                     +		*p = fix32((*p)&0x7f);
                     +		p++;
                     +	}
                     +}
+                    +
                     +/*
                     + * Particularly yahoo puts links like this in mails:
                     + * http:/ /mail.yahoo.com
                     + * So first step: delete space between / /
                     + *
                     + * Next there could be possible links like this:
                     + * <a href="phishlink">w  w w . e b a y . c o m</a>
                     + * Here we need to strip spaces to get this picked up.
                     + *
                     + * Next there are links like:
                     + * <a href="www.yahoo.com">Check out yahoo.com</a>
                     + * Here we add a ., so we get: check.out.yahoo.com (it won't trigger)
                     + *
                     + * Rule for adding .: if substring from right contains dot, then add dot, otherwise strip space
                     + *
                     + * @begin and @end delimit the string and are updated in place.
                     + */
                     +static inline void str_fixup_spaces(char **begin,const char** end)
                     +{
                     +	char* space = strchr(*begin,' ');
                     +	/* strip any number of spaces after /
                     +	 * (strchr returns NULL when there is no space at all, which must be
                     +	 * tested before the pointer is compared or dereferenced) */
                     +	while(space && space>*begin && space[-1]=='/' && space[0]==' ' && space<*end) {
                     +		memmove(space,space+1,*end-space+1);
                     +		(*end)--;
                     +	}
+                    +
                     +	/* from the last space, walk right looking for a dot */
                     +	for(space = rfind(*begin,' ',*end-*begin);space && space[0]!='.' && space<*end;space++) {}
                     +	if(space && space[0]=='.')
                     +		str_replace(*begin,*end,' ','.');
                     +	else
                     +		str_strip(begin,end," ",1);
                     +}
+                    +
                     +/*
                     + * Normalizes @URL in place and then replaces it by a cleaned-up copy
                     + * (allocates memory): high bits and control characters are cleared,
                     + * surrounding whitespace is dropped, '\' becomes '/', quotes and the
                     + * &lt; / &gt; entities are stripped, the host part is lowercased, %xx escapes
                     + * are decoded and spaces are fixed up.
                     + * URLs that are too short, or that start with ".net", "ado.net" or "asp.net",
                     + * become the empty string. A string without data is left untouched.
                     + * @isReal is currently unused.
                     + */
                     +void cleanupURL(struct string* URL,int isReal)
                     +{
                     +	char* begin = URL->data;
                     +	const char* end;
                     +	size_t len;
                     +	if(!begin)
                     +		return;/* nothing to clean; callers test for NULL data afterwards */
                     +	clear_msb(begin);
                     +	/*TODO: handle hex-encoded IPs*/
                     +	while(isspace((unsigned char)*begin)) begin++;
                     +	len=strlen(begin);
                     +	end = begin+len-1;/* end points AT the last character */
                     +	/*cli_dbgmsg("%d\n",end-begin);*/
                     +	if(begin>=end) {
                     +		string_assign_null(URL);
                     +		return;
                     +	}
                     +	while(isspace((unsigned char)*end))
                     +		end--;
                     +	/*TODO: convert \ to /, and stuff like that*/
                     +	/* From mailscanner, my comments enclosed in {} */
                     +	if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len))
                     +		string_assign_null(URL);
                     +	else {
                     +		size_t host_len;
                     +		char* host_begin;
                     +		/* str_replace() works on [begin,end), so pass end+1 to include the last character */
                     +		str_replace(begin,end+1,'\\','/');
                     +		str_strip(&begin,&end,"\"",1);
                     +		str_strip(&begin,&end,lt,lt_len);
                     +		str_strip(&begin,&end,gt,gt_len);
                     +		/* convert hostname to lowercase, but only hostname! */
                     +		host_begin = strchr(begin,':');
                     +		while(host_begin && host_begin[1]=='/') host_begin++;
                     +		if(!host_begin) host_begin=begin;
                     +		else host_begin++;
                     +		host_len = strcspn(host_begin,"/?");
                     +		str_make_lowercase(host_begin,host_len);
                     +		/* convert %xx to real value */
                     +		str_hex_to_char(&begin,&end);
                     +		str_fixup_spaces(&begin,&end);
                     +		if(end<begin)
                     +			string_assign_null(URL);/* everything was stripped away */
                     +		else
                     +			string_assign_dup(URL,begin,end+1);
                     +		/*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/
                     +	}
                     +}
+                    +
                     +void get_redirected_URL(struct string* URL)
                     +{
                     +	/*TODO: see if URL redirects somewhere, if so, then follow it and
                     +	return the redirected URL. Not implemented: the function does nothing yet.*/
                     +}
+                    +
                     +/*
                     + * Returns 1 when phishing checks are switched off, 0 otherwise.
                     + * While they are still enabled the whitelist is consulted; if it is not ok
                     + * the checks are disabled for good.
                     + */
                     +static inline int is_phish_disabled(void)
                     +{
                     +	if (!phish_disabled && !is_whitelist_ok())
                     +		phish_disabled = 1;
                     +	return phish_disabled ? 1 : 0;
                     +}
+                    +
                     +/*
                     + * Fills hextable so that hex digits ('0'-'9', 'a'-'f', 'A'-'F') map to their
                     + * value and every other character maps to 0, then marks the table as ready.
                     + */
                     +static void init_hextable(void)
                     +{
                     +	unsigned char c;
                     +	/* hextable holds short ints: clear the whole array, not just 256 bytes */
                     +	memset(hextable,0,sizeof(hextable));
                     +	for(c='0';c<='9';c++)
                     +		hextable[c] = c-'0';
                     +	/* only a-f are hex digits; g-z used to get values 16..35 */
                     +	for(c='a';c<='f';c++)
                     +		hextable[c] = 10+c-'a';
                     +	for(c='A';c<='F';c++)
                     +		hextable[c] = 10+c-'A';
                     +	hexinited=1;
                     +}
+                    +
                     +/*
                     + * Runs the phishing checks on every tag in @hrefs that has contents.
                     + * For "href" tags the tag value is the real link and the contents are the
                     + * displayed link; for every other tag the two are swapped.
                     + * Returns CL_VIRUS, with *ctx->virname set to the detection name, for the
                     + * first link that is judged to be phishing; returns 0 when nothing is found
                     + * or when phishing checks are disabled.
                     + * @m and @dir are not used here.
                     + */
                     +int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs)
                     +{
                     +	const char src_text[]="src";
                     +	const char href_text[]="href";
                     +	/* sizeof counts the terminating \0 too, so the strncmp calls below only
                     +	 * succeed on an exact match of the whole tag name */
                     +	const size_t href_text_len = sizeof(href_text);
                     +	const size_t src_text_len = sizeof(src_text);
                     +	int i;
                     +	if(is_phish_disabled())
                     +		return 0;
                     +	if(!hexinited) {
                     +		init_hextable();
                     +		atexit(phishing_done);/*TODO: replace this with a proper phishing_done call from manager.c*/
                     +	}
+                    +
                     +	*ctx->virname=NULL;
                     +	for(i=0;i<hrefs->count;i++)
                     +		if(hrefs->contents[i]) {
                     +			struct url_check urls;
                     +			enum phish_status rc;
                     +			/* SSL checks only make sense for href links */
                     +			urls.flags	 = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS;
                     +			if (!(urls.flags&CHECK_IMG_URL) && !strncmp((char*)hrefs->tag[i],src_text,src_text_len))
                     +				continue;
                     +			if (ctx->options&CL_PHISH_NO_DOMAINLIST)
                     +				urls.flags &= ~DOMAINLIST_REQUIRED;
                     +			string_init_c(&urls.realLink,(char*)hrefs->value[i]);
                     +/*			if(!hrefs->contents[i]->isClosed) {
                     +				blobAddData(hrefs->contents[i],empty_string,1);
                     +				blobClose(hrefs->contents[i]);
                     +			}*/
                     +			string_init_c(&urls.displayLink,(char*)blobGetData(hrefs->contents[i]));
                     +			assert(!urls.displayLink.data[blobGetDataSize(hrefs->contents[i])-1]);
                     +/*			assert(strlen(urls.displayLink.data) < blobGetDataSize(hrefs->contents[i]));*/
                     +			urls.realLink.refcount=-1;
                     +			urls.displayLink.refcount=-1;/*don't free these, caller will free*/
                     +			if(strcmp((char*)hrefs->tag[i],"href")) {
                     +				char *url;
                     +				url = urls.realLink.data;
                     +				urls.realLink.data = urls.displayLink.data;
                     +				urls.displayLink.data = url;
                     +			}
+                    +
                     +			rc = phishingCheck(&urls);
                     +			/* release the strings first: returning early because checks got disabled
                     +			 * during phishingCheck() used to leak whatever it had allocated in urls */
                     +			free_if_needed(&urls);
                     +			if(phish_disabled)
                     +				return 0;
                     +			cli_dbgmsg("Phishing scan result:%s\n",phishing_ret_toString(rc));
                     +			switch(rc)/*TODO: support flags from ctx->options,*/
                     +				{
                     +					case CL_PHISH_CLEAN:
                     +					case CL_PHISH_CLEANUP_OK:
                     +					case CL_PHISH_HOST_OK:
                     +					case CL_PHISH_DOMAIN_OK:
                     +					case CL_PHISH_REDIR_OK:
                     +					case CL_PHISH_HOST_REDIR_OK:
                     +					case CL_PHISH_DOMAIN_REDIR_OK:
                     +					case CL_PHISH_HOST_REVERSE_OK:
                     +					case CL_PHISH_DOMAIN_REVERSE_OK:
                     +					case CL_PHISH_WHITELISTED:
                     +					case CL_PHISH_HOST_WHITELISTED:
                     +					case CL_PHISH_MAILTO_OK:
                     +					case CL_PHISH_TEXTURL:
                     +					case CL_PHISH_HOST_NOT_LISTED:
                     +					case CL_PHISH_CLEAN_CID:
                     +						continue;
                     +					case CL_PHISH_HEX_URL:
                     +						*ctx->virname="Phishing.Email.HexURL";
                     +						return CL_VIRUS;
                     +					case CL_PHISH_NUMERIC_IP:
                     +						*ctx->virname="Phishing.Email.Cloaked.NumericIP";
                     +						return CL_VIRUS;
                     +					case CL_PHISH_CLOAKED_NULL:
                     +						*ctx->virname="Phishing.Email.Cloaked.Null";/*http://www.real.com%01%00@www.evil.com*/
                     +						return CL_VIRUS;
                     +					case CL_PHISH_SSL_SPOOF:
                     +						*ctx->virname="Phishing.Email.SSL-Spoof";
                     +						return CL_VIRUS;
                     +					case CL_PHISH_CLOAKED_UIU:
                     +						*ctx->virname="Phishing.Email.Cloaked.Username";/*http://www.ebay.com@www.evil.com*/
                     +						return CL_VIRUS;
                     +					case CL_PHISH_NOMATCH:
                     +					default:
                     +						*ctx->virname="Phishing.Email";
                     +						return CL_VIRUS;
                     +				}
                     +		}
                     +		else
                     +			/* the message is about href tags, so print it for them (strcmp returns 0 on a match) */
                     +			if(!strcmp((char*)hrefs->tag[i],"href"))
                     +					cli_dbgmsg("PH:href with no contents?\n");
                     +	return 0;/*texturlfound?CL_VIRUS:0;*/
                     +}
+                    +
                     +/*
                     + * Returns a newly malloc()ed string holding @a, @b and @c concatenated, or
                     + * NULL when out of memory. The caller owns the result.
                     + */
                     +static char* str_compose(const char* a,const char* b,const char* c)
                     +{
                     +	const size_t a_len = strlen(a);
                     +	const size_t b_len = strlen(b);
                     +	const size_t c_len = strlen(c);
                     +	const size_t r_len = a_len+b_len+c_len+1;
                     +	char* concated = malloc(r_len);
                     +	if(!concated)
                     +		return NULL;
                     +	memcpy(concated,a,a_len);
                     +	memcpy(concated+a_len,b,b_len);
                     +	memcpy(concated+a_len+b_len,c,c_len+1);/* +1: copies the terminating \0 as well */
                     +	return concated;
                     +}
+                    +
                     +/*static const char* url_regex="^ *([[:alnum:]%_-]+:(//)?)?([[:alnum:]%_-]@)*[[:alnum:]%_-]+\\.([[:alnum:]%_-]+\\.)*[[:alnum:]_%-]+(/[[:alnum:];:@$=?&/.,%_-]+) *$";*/
                     +/* for urls, including mailto: urls, and (broken) http:www... style urls*/
                     +/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html
                     + * Modifications: don't allow empty domains/subdomains, such as www..com <- that is no url
                     + * So the 'safe' char class has been split up
                     + * */
                     +/* character classes: each macro is the INSIDE of a bracket expression, the
                     + * enclosing [ ] are added where the classes are combined (see URI_xalpha) */
                     +#define URI_alpha       "a-zA-Z"
                     +#define URI_digit       "0-9"
                     +#define URI_safe_nodot  "-$_@&"
                     +#define URI_safe        "-$_@.&"
                     +#define URI_extra       "!*\"'(),"
                     +#define URI_reserved    "=;/#?: "
                     +#define URI_national    "{}|[]\\^~"
                     +#define URI_punctuation "<>"
+                    +
                     +/* one hex digit; the upper-case range is A-F ("A-f" also matched G-Z and [ \ ] ^ _ `) */
                     +#define URI_hex         "[0-9a-fA-F]"
                     +/* a %xx escape: '%' followed by exactly two hex digits */
                     +#define URI_escape      "%"URI_hex"{2}"
                     +#define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
                     +#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
+                    +
                     +#define URI_xalphas URI_xalpha"+" /* one or more xalpha */
                     +#define URI_xalphas_nodot URI_xalpha_nodot"*" /* zero or more: note the '*', unlike URI_xalphas */
+                    +
                     +#define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot"" /* a letter followed by any number of non-dot xalphas */
                     +#define URI_xpalpha URI_xalpha"|\\+" /* xalpha or a literal '+' */
                     +#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
                     +#define URI_xpalphas "("URI_xpalpha")+"
                     +#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
+                    +
                     +#define URI_scheme URI_ialpha
                     +#define URI_tld iana_tld /* iana_tld is not defined in this part of the file; presumably from iana_tld.h - TODO confirm */
                     +#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*" /* one or more dot-terminated labels */
                     +#define URI_path2 URI_tld
                     +#define URI_path3 "(/("URI_xpalphas"/?)*)?" /* optional path */
+                    +
                     +#define URI_search "("URI_xalphas"\\+)*"
                     +#define URI_fragmentid URI_xalphas
+                    +
                     +#define URI_IP_digits "["URI_digit"]{1,3}" /* one to three digits; the value range is not checked by the regex */
                     +#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
                     +#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
                     +#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
+                    +
                     +#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
                     +#define URI_URI2 URI_path2
                     +#define URI_URI3 URI_path3"(\\?" URI_search")?"
+                    +
                     +#define URI_fragmentaddress1 URI_URI1 /* the three fragmentaddress pieces are joined by str_compose() in isURL() */
                     +#define URI_fragmentaddress2 URI_URI2
                     +#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
+                    +
                     +#define URI_CHECK_PROTOCOLS "(http|https|ftp)://.+" /* alternative accepted by isURL(): anything after one of these schemes */
+                    +
                     +/* Warning: take care when modifying this regex; it has been tweaked and tuned, so please don't break it.
                     + * It is split into URI_fragmentaddress1, 2 and 3, which isURL() joins at run time with str_compose(),
                     + * to work around the ISO limitation of 509 bytes max length for string constants. */
                     +static char* url_regex = NULL;/* composed on first use by isURL() */
                     +static const char numeric_url_regex[] = "^ *"URI_numeric_fragmentaddress" *$";/* used by isNumericURL(); allows surrounding spaces */
                     +/*
                     + * Only those URLs are identified as URLs for which phishing detection can be performed.
                     + * This means that no attempt is made to properly recognize 'cid:' URLs
                     + *
                     + * The regex is composed and compiled on first use.
                     + * Returns 1 when @URL matches, 0 when it does not (or is NULL) and -1 when
                     + * the regex cannot be composed or compiled; phishing checks are disabled then.
                     + */
                     +int isURL(const char* URL)
                     +{
                     +	if(!preg) {
                     +		url_regex = str_compose("^ *("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3"|"URI_CHECK_PROTOCOLS") *$");
                     +		if(!url_regex) {
                     +			cli_errmsg("Out of memory while composing the URL regex\nDisabling phishing checks\n");
                     +			phish_disabled=1;
                     +			return -1;
                     +		}
                     +		if(build_regex(&preg,url_regex,1)) {
                     +			/* preg stays NULL on failure, so a later call would compose the
                     +			 * string again: release this copy instead of leaking it */
                     +			free(url_regex);
                     +			url_regex=NULL;
                     +			return -1;
                     +		}
                     +	}
                     +	return URL ? !regexec(preg,URL,0,NULL,0) : 0;
                     +}
+                    +
                     +/*
                     + * Matches @URL against numeric_url_regex, compiling the regex on first use.
                     + * Returns -1 when the regex cannot be built, 0 for a NULL @URL,
                     + * otherwise 1 on a match and 0 on no match.
                     + */
                     +int isNumericURL(const char* URL)
                     +{
                     +	if(!preg_numeric && build_regex(&preg_numeric,numeric_url_regex,1))
                     +		return -1;
                     +	if(!URL)
                     +		return 0;
                     +	return regexec(preg_numeric,URL,0,NULL,0)==0;
                     +}
+                    +
                     +/* Cleans up both links of @urls when the CLEANUP_URL flag is set.
                     + * Returns CL_PHISH_CLEANUP_OK if both links have data and are identical after
                     + * the cleanup, CL_PHISH_NODECISION in every other case.
                     + * */
                     +enum phish_status cleanupURLs(struct url_check* urls)
                     +{
                     +	if(!(urls->flags&CLEANUP_URL))
                     +		return CL_PHISH_NODECISION;
                     +	cleanupURL(&urls->realLink,1);
                     +	cleanupURL(&urls->displayLink,0);
                     +	if(urls->displayLink.data && urls->realLink.data &&
                     +	   !strcmp(urls->realLink.data,urls->displayLink.data))
                     +		return CL_PHISH_CLEANUP_OK;
                     +	return CL_PHISH_NODECISION;
                     +}
+                    +
+                    +
                     +/*
                     + * Extracts the host of one link of @url into the matching slot of @host_url.
                     + * isReal selects the link: non-zero -> realLink, zero -> displayLink.
                     + * get_host() receives @phishy as well; PHISHY_NUMERIC_IP is added here when
                     + * isNumeric() accepts the host.
                     + *
                     + * Returns CL_PHISH_NODECISION when checking should continue, otherwise:
                     + *   CL_PHISH_CLEANUP_OK  get_host() produced no data
                     + *   CL_PHISH_MAILTO_OK   REAL_IS_MAILTO is set in *phishy
                     + *   CL_PHISH_TEXTURL     host contains a space (host is freed)
                     + *   CL_PHISH_HEX_URL     real host starts with 0x / 0X (host is freed)
                     + *   CL_PHISH_CLEAN       real host is empty, e.g. href="/isapi.dll?...
                     + */
                     +enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy)
                     +{
                     +	struct string* host;
                     +	const char* link;
+                    +
                     +	if(isReal) {
                     +		host = &host_url->realLink;
                     +		link = url->realLink.data;
                     +	} else {
                     +		host = &host_url->displayLink;
                     +		link = url->displayLink.data;
                     +	}
                     +	get_host(host,link,isReal,phishy);
+                    +
                     +	if(!host->data)
                     +		return CL_PHISH_CLEANUP_OK;
                     +	if(*phishy&REAL_IS_MAILTO)
                     +		return CL_PHISH_MAILTO_OK;
                     +	if(strchr(host->data,' ')) {
                     +		string_free(host);
                     +		return CL_PHISH_TEXTURL;
                     +	}
                     +	if(isReal) {
                     +		if(!strncmp(host->data,"0x",2) || !strncmp(host->data,"0X",2)) {
                     +			string_free(host);
                     +			return CL_PHISH_HEX_URL;
                     +		}
                     +		if(host->data[0]=='\0')
                     +			return CL_PHISH_CLEAN;/* link without domain, such as: href="/isapi.dll?... */
                     +	}
                     +	if(isNumeric(host->data)) {
                     +		*phishy |= PHISHY_NUMERIC_IP;
                     +		/* reverse lookup is disabled:
                     +		if(url->flags&DO_REVERSE_LOOKUP)
                     +			reverse_lookup(host_url,isReal);*/
                     +	}
                     +	return CL_PHISH_NODECISION;
                     +}
+                    +
+                    +
                     +/*
                     + * Fills @domains from @url: get_domain() is applied to the real and to the
                     + * displayed link, and the flags are copied over unchanged.
                     + */
                     +void url_get_domain(struct url_check* url,struct url_check* domains)
                     +{
                     +	struct string* real_src    = &url->realLink;
                     +	struct string* display_src = &url->displayLink;
+                    +
                     +	get_domain(&domains->realLink, real_src);
                     +	get_domain(&domains->displayLink, display_src);
                     +	domains->flags = url->flags;
                     +}
+                    +
                     +/*
                     + * Translates the accumulated @phishy bits into a verdict.
                     + * PHISHY_USERNAME_IN_URL takes precedence over PHISHY_NUMERIC_IP;
                     + * when neither bit is set @fallback is returned.
                     + */
                     +enum phish_status phishy_map(int phishy,enum phish_status fallback)
                     +{
                     +	if(phishy&PHISHY_USERNAME_IN_URL)
                     +		return CL_PHISH_CLOAKED_UIU;
                     +	return (phishy&PHISHY_NUMERIC_IP) ? CL_PHISH_NUMERIC_IP : fallback;
                     +}
+                    +
                     +/*
                     + * Tells whether @url consists mostly of "&#...;" sequences.
                     + *
                     + * Every span from "&#" up to and including the next ';' is counted by its
                     + * length in bytes; the URL is considered encoded when those spans cover
                     + * more than 70% of the string.
                     + * (Comparing the *number* of sequences with 70% of the byte length can never
                     + * succeed, because each sequence is at least 3 bytes long.)
                     + *
                     + * Returns 1 if encoded, 0 otherwise (including a NULL or empty @url).
                     + */
                     +int isEncoded(const char* url)
                     +{
                     +	const char* pos;
                     +	size_t encoded=0;/* bytes covered by &#...; sequences */
                     +	size_t len;
+                    +
                     +	if(!url)
                     +		return 0;
                     +	len = strlen(url);
                     +	pos = url;
                     +	while((pos = strstr(pos,"&#")) != NULL) {
                     +		const char* end = strchr(pos+2,';');
                     +		if(!end)
                     +			break;/* unterminated sequence: not counted */
                     +		encoded += (size_t)(end-pos)+1;
                     +		pos = end+1;
                     +	}
                     +	return encoded*10 > len*7;/*more than 70% made up of &#;*/
                     +}
+                    +
                     +/*
                     + * Releases the compiled regex *p points to (regfree + free) and resets the
                     + * pointer to NULL. A NULL @p or an already empty slot is ignored.
                     + */
                     +static void free_regex(regex_t** p)
                     +{
                     +	regex_t* compiled;
+                    +
                     +	if(!p || !*p)
                     +		return;
                     +	compiled = *p;
                     +	*p = NULL;
                     +	regfree(compiled);
                     +	free(compiled);
                     +}
+                    +
                     +/*
                     + * Releases everything the phishing checks allocated lazily: the four cached
                     + * regexes, the whitelist and domain list (via their *_done() functions) and
                     + * the composed URL pattern text.
                     + * Safe to call more than once: every pointer is reset after being freed.
                     + */
                     +void phishing_done(void)
                     +{
                     +	free_regex(&preg);
                     +	free_regex(&preg_cctld);
                     +	free_regex(&preg_tld);
                     +	free_regex(&preg_numeric);
                     +	whitelist_done();
                     +	domainlist_done();
                     +	/* free(NULL) is a no-op; clearing the pointer prevents a double free
                     +	 * if this function runs again */
                     +	free(url_regex);
                     +	url_regex = NULL;
                     +}
+                    +
                     +/*
                     + * Passes the real and the displayed link text of @urls, together with
                     + * @hostOnly, to whitelist_match() and returns its result.
                     + */
                     +int whitelist_check(struct url_check* urls,int hostOnly)
                     +{
                     +	const char* real_url    = urls->realLink.data;
                     +	const char* display_url = urls->displayLink.data;
+                    +
                     +	return whitelist_match(real_url,display_url,hostOnly);
                     +}
+                    +
                     +/*
                     + * Decides whether the link pair in @urls (real target vs. displayed text)
                     + * looks like phishing. @urls itself must not be NULL, the caller must ensure this.
                     + *
                     + * The steps run in this order and the first one that reaches a verdict wins:
                     + *   - a link without data, or identical links          -> CL_PHISH_CLEAN
                     + *   - cleanupURLs() makes them identical               -> CL_PHISH_CLEANUP_OK
                     + *   - whole URL whitelisted                            -> CL_PHISH_WHITELISTED
                     + *   - displayed host extraction (verdicts of url_get_host)
                     + *   - host whitelisted                                 -> CL_PHISH_HOST_WHITELISTED
                     + *   - DOMAINLIST_REQUIRED and not on the domain list   -> CL_PHISH_HOST_NOT_LISTED
                     + *   - CHECK_CLOAKING: "%00" in real link               -> CL_PHISH_CLOAKED_NULL
                     + *                     displayed link isEncoded()       -> CL_PHISH_HEX_URL
                     + *   - empty displayed link                             -> CL_PHISH_CLEAN
                     + *   - CHECK_SSL: displayed is SSL, real is not         -> CL_PHISH_SSL_SPOOF
                     + *   - real host extraction (verdicts of url_get_host)
                     + *   - displayed link starts with "cid:"                -> CL_PHISH_CLEAN_CID
                     + *   - displayed link is not an URL                     -> CL_PHISH_TEXTURL
                     + *   - HOST_SUFFICIENT and hosts are equal              -> CL_PHISH_HOST_OK
                     + *   - DOMAIN_SUFFICIENT and domains are equal          -> CL_PHISH_DOMAIN_OK
                     + *   - otherwise phishy_map(phishy,CL_PHISH_NOMATCH)
                     + */
                     +enum phish_status phishingCheck(struct url_check* urls)
                     +{
                     +	struct url_check host_url;
                     +	const char cid[] = "cid:";
                     +	const size_t cid_len = sizeof(cid)-1;
                     +	enum phish_status rc=CL_PHISH_NODECISION;
                     +	int phishy=0;
                     +	/* both links are dereferenced below; without one of them there is
                     +	 * nothing to compare (an empty displayed link is treated as clean too) */
                     +	if(!urls->realLink.data || !urls->displayLink.data)
                     +		return CL_PHISH_CLEAN;
                     +	cli_dbgmsg("\nPH:Checking url %s->%s \n",urls->realLink.data,urls->displayLink.data);
+                    +
                     +	if(!strcmp(urls->realLink.data,urls->displayLink.data))
                     +		return CL_PHISH_CLEAN;/* displayed and real URL are identical -> clean */
+                    +
                     +	if((rc = cleanupURLs(urls))) {
                     +		assert(!isPhishing(rc));/* not allowed to decide this is phishing */
                     +		return rc;/* URLs identical after cleanup */
                     +	}
                     +	/* cleanupURLs() answers CL_PHISH_NODECISION when cleanup left a link
                     +	 * without data, so that case has to be caught here */
                     +	if(!urls->realLink.data || !urls->displayLink.data)
                     +		return CL_PHISH_CLEAN;
+                    +
                     +	if(whitelist_check(urls,0))
                     +		return CL_PHISH_WHITELISTED;/* if url is whitelist don't perform further checks */
+                    +
                     +	/* although the entire url may not be listed, the host might be,
                     +	 * so phishing decisions are deferred till we know if host is listed */
                     +	if(urls->flags&DOMAINLIST_REQUIRED && domainlist_match(urls->realLink.data,urls->displayLink.data,0,&urls->flags))
                     +		phishy |= DOMAIN_LISTED;
+                    +
                     +	url_check_init(&host_url);
+                    +
                     +	if((rc = url_get_host(urls,&host_url,DOMAIN_DISPLAY,&phishy))) {
                     +		free_if_needed(&host_url);
                     +		assert(!isPhishing(rc));
                     +		return rc;
                     +	}
+                    +
                     +	if(whitelist_check(&host_url,1)) {
                     +		free_if_needed(&host_url);
                     +		return CL_PHISH_HOST_WHITELISTED;
                     +	}
+                    +
                     +	if(urls->flags&DOMAINLIST_REQUIRED) {
                     +		if(!(phishy&DOMAIN_LISTED)) {
                     +			if(domainlist_match(urls->displayLink.data,urls->realLink.data,1,&urls->flags))
                     +				phishy |= DOMAIN_LISTED;
                     +			else {
                     +				free_if_needed(&host_url);
                     +				return CL_PHISH_HOST_NOT_LISTED;
                     +			}
                     +		}
                     +	}
+                    +
                     +	if(urls->flags&CHECK_CLOAKING) {
                     +		/*Checks if URL is cloaked.
                     +		Should we check if it contains another http://, https://?
                     +		No because we might get false positives from redirect services.*/
                     +		if(strstr(urls->realLink.data,"%00")) {
                     +			free_if_needed(&host_url);
                     +			return CL_PHISH_CLOAKED_NULL;
                     +		}
                     +		if(isEncoded(urls->displayLink.data)) {
                     +			free_if_needed(&host_url);
                     +			return CL_PHISH_HEX_URL;
                     +		}
                     +	}
+                    +
                     +	if(urls->displayLink.data[0]=='\0') {
                     +		free_if_needed(&host_url);
                     +		return CL_PHISH_CLEAN;
                     +	}
+                    +
                     +	if(urls->flags&CHECK_SSL && isSSL(urls->displayLink.data) && !isSSL(urls->realLink.data)) {
                     +		free_if_needed(&host_url);
                     +		return CL_PHISH_SSL_SPOOF;
                     +	}
+                    +
                     +	/* no assert here: the real host may legitimately yield a phishing verdict (CL_PHISH_HEX_URL) */
                     +	if((rc = url_get_host(urls,&host_url,DOMAIN_REAL,&phishy)))
                     +	{
                     +		free_if_needed(&host_url);
                     +		return rc;
                     +	}
+                    +
                     +	if(!strncmp(urls->displayLink.data,cid,cid_len))/* cid: image */{
                     +		free_if_needed(&host_url);
                     +		return CL_PHISH_CLEAN_CID;
                     +	}
+                    +
                     +	if(!isURL(urls->displayLink.data) &&
                     +			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(urls->displayLink.data)) ||
                     +			  !(phishy&PHISHY_NUMERIC_IP))) {
                     +		free_if_needed(&host_url);
                     +		return CL_PHISH_TEXTURL;
                     +	}
+                    +
                     +	if(urls->flags&HOST_SUFFICIENT) {
                     +		/* compare the extracted hosts: the full URLs are already known to
                     +		 * differ at this point, so comparing them again could never match.
                     +		 * Both host strings are non-NULL here, url_get_host() returned
                     +		 * CL_PHISH_NODECISION for each of them. */
                     +		if(!strcmp(host_url.realLink.data,host_url.displayLink.data)) {
                     +			free_if_needed(&host_url);
                     +			return CL_PHISH_HOST_OK;
                     +		}
+                    +
+                    +
                     +		/* DOMAIN_SUFFICIENT is a multi-bit mask (HOST_SUFFICIENT | 2):
                     +		 * all of its bits have to be set, a plain non-zero test would
                     +		 * already succeed for HOST_SUFFICIENT alone */
                     +		if((urls->flags&DOMAIN_SUFFICIENT)==DOMAIN_SUFFICIENT) {
                     +			struct url_check domain_url;
                     +			url_check_init(&domain_url);
                     +			url_get_domain(&host_url,&domain_url);
                     +			if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
                     +				free_if_needed(&host_url);
                     +				free_if_needed(&domain_url);
                     +				return CL_PHISH_DOMAIN_OK;
                     +			}
                     +			free_if_needed(&domain_url);
                     +		}
+                    +
                     +		/*if(urls->flags&CHECK_REDIR) {
                     +			//see where the realLink redirects, and compare that with the displayed Link
                     +			const uchar* redirectedURL  = getRedirectedURL(urls->realLink);
                     +			if(urls->needsfree)
                     +				free(urls->realLink);
                     +			urls->realLink = redirectedURL;
+                    +
                     +			if(!strcmp(urls->realLink,urls->displayLink))
                     +				return CL_PHISH_REDIR_OK;
+                    +
                     +			if(urls->flags&HOST_SUFFICIENT) {
                     +				if(rc = url_get_host(urls,&host_url,DOMAIN_REAL))
                     +				if(!strcmp(host_url.realLink,host_url.displayLink)) {
                     +					free_if_needed(&host_url);
                     +					return CL_PHISH_HOST_REDIR_OK;
                     +				}
                     +				if(urls->flags&DOMAIN_SUFFICIENT) {
                     +					struct url_check domain_url;
                     +					url_get_domain(&host_url,&domain_url);
                     +					if(!strcmp(domain_url.realLink,domain_url.displayLink)) {
                     +						free_if_needed(&host_url);
                     +						free_if_needed(&domain_url);
                     +						return CL_PHISH_DOMAIN_REDIR_OK;
                     +					}
                     +				}
                     +			}//HOST_SUFFICIENT&CHECK_REDIR
                     +		}
                     +		free_if_needed(&host_url);*/
                     +	/*	if(urls->flags&CHECK_DOMAIN_REVERSE) {
                     +			//do a DNS lookup of the domain, and see what IP it corresponds to
                     +			//then do a reverse lookup on the IP, and see what domain you get
                     +			//There are some corporate signatures that mix different domains belonging to same company
                     +			struct url_check domain_url;
                     +			url_check_init(&domain_url);
                     +			if(!dns_to_ip_and_reverse(&host_url,DOMAIN_DISPLAY)) {
                     +				if(!strcmp(host_url.realLink.data,host_url.displayLink.data)) {
                     +					free_if_needed(&host_url);
                     +					return CL_PHISH_HOST_REVERSE_OK;
                     +				}
                     +				if(urls->flags&DOMAIN_SUFFICIENT) {
                     +					url_get_domain(&host_url,&domain_url);
                     +					if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
                     +						free_if_needed(&host_url);
                     +						free_if_needed(&domain_url);
                     +						return CL_PHISH_DOMAIN_REVERSE_OK;
                     +					}
                     +					free_if_needed(&domain_url);
                     +				}
                     +			}
                     +		}*/
                     +	}/*HOST_SUFFICIENT*/
                     +	/* release the extracted hosts on every path, not only when HOST_SUFFICIENT is set */
                     +	free_if_needed(&host_url);
                     +	/*we failed to find a reason why the 2 URLs are different, this is definitely phishing*/
                     +	return phishy_map(phishy,CL_PHISH_NOMATCH);
                     +}
+                    +
                     +/*
                     + * Returns a static, human readable description of the verdict @rc.
                     + * Codes without an entry of their own (e.g. CL_PHISH_NODECISION) yield
                     + * "Unknown return code". The returned string must not be freed.
                     + */
                     +const char* phishing_ret_toString(enum phish_status rc)
                     +{
                     +	switch(rc) {
                     +		case CL_PHISH_CLEAN:
                     +			return "Clean";
                     +		case CL_PHISH_CLEANUP_OK:
                     +			return "URLs match after cleanup";
                     +		case CL_PHISH_WHITELISTED:
                     +			return "URL is whitelisted";
                     +		case CL_PHISH_HOST_WHITELISTED:
                     +			return "host part of URL is whitelist";
                     +		case CL_PHISH_HOST_OK:
                     +			return "Hosts match";
                     +		case CL_PHISH_DOMAIN_OK:
                     +			return "Domains match";
                     +		case CL_PHISH_REDIR_OK:
                     +			return "After redirecting realURL, they match";
                     +		case CL_PHISH_HOST_REDIR_OK:
                     +			return "After redirecting realURL, hosts match";
                     +		case CL_PHISH_DOMAIN_REDIR_OK:
                     +			return "After redirecting the domains match";
                     +		case CL_PHISH_HOST_REVERSE_OK:
                     +			return "After reverse lookup the hosts match";
                     +		case CL_PHISH_DOMAIN_REVERSE_OK:
                     +			return "After reverse lookup the domains match";
                     +		case CL_PHISH_MAILTO_OK:
                     +			return "URL is mailto";
                     +		case CL_PHISH_NUMERIC_IP:
                     +			return "IP address encountered in hostname";
                     +		case CL_PHISH_TEXTURL:
                     +			return "Displayed link is not an URL, can't check if phishing or not";
                     +		case CL_PHISH_HEX_URL:
                     +			/* returned for 0x... real hosts and for encoded displayed links */
                     +			return "Embedded hex urls";
                     +		case CL_PHISH_CLOAKED_NULL:
                     +			return "Link URL is cloaked (null byte %00)";
                     +		case CL_PHISH_CLOAKED_UIU:
                     +			return "Link URL contains username, and real<->displayed hosts don't match.";
                     +			/*username is a legit domain, and after the @ comes the evil one*/
                     +		case CL_PHISH_SSL_SPOOF:
                     +			return "Visible links is SSL, real link is not";
                     +		case CL_PHISH_NOMATCH:
                     +			return "URLs are way too different";
                     +		case CL_PHISH_HOST_NOT_LISTED:
                     +			return "Host not listed in .pdb -> not checked";
                     +		case CL_PHISH_CLEAN_CID:
                     +			return "Embedded image in mail -> clean";
                     +		default:
                     +			return "Unknown return code";
                     +	}
                     +}
+                    +
                     +#endif

clamav-devel/libclamav/phishcheck.h

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,130 @@
                     +/*
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + */
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef _PHISH_CHECK_H
                     +#define _PHISH_CHECK_H
+                    +
+                    +
                     +/* first value of the phishing verdict codes; CL_PHISH_NODECISION stays 0 */
                     +#define CL_PHISH_BASE 100
                     +enum phish_status {
                     +	CL_PHISH_NODECISION=0,/* no verdict yet, checks go on */
                     +	/* verdicts isPhishing() reports as not phishing */
                     +	CL_PHISH_CLEAN=CL_PHISH_BASE,
                     +	CL_PHISH_CLEANUP_OK,
                     +	CL_PHISH_HOST_OK,
                     +	CL_PHISH_DOMAIN_OK,
                     +	CL_PHISH_HOST_NOT_LISTED,
                     +	CL_PHISH_REDIR_OK,
                     +	CL_PHISH_HOST_REDIR_OK,
                     +	CL_PHISH_DOMAIN_REDIR_OK,
                     +	CL_PHISH_HOST_REVERSE_OK,
                     +	CL_PHISH_DOMAIN_REVERSE_OK,
                     +	CL_PHISH_WHITELISTED,
                     +	CL_PHISH_HOST_WHITELISTED,
                     +	CL_PHISH_CLEAN_CID,
                     +	CL_PHISH_TEXTURL,
                     +	CL_PHISH_MAILTO_OK,
                     +	/* verdicts isPhishing() reports as phishing */
                     +	CL_PHISH_CLOAKED_UIU,
                     +	CL_PHISH_NUMERIC_IP,
                     +	CL_PHISH_HEX_URL,
                     +	CL_PHISH_CLOAKED_NULL,
                     +	CL_PHISH_SSL_SPOOF,
                     +	CL_PHISH_NOMATCH
                     +};
+                    +
                     +#define HOST_SUFFICIENT   1 /* phishingCheck: run the host comparison step */
                     +#define DOMAIN_SUFFICIENT (HOST_SUFFICIENT | 2) /* multi-bit mask: contains the HOST_SUFFICIENT bit, so (flags&DOMAIN_SUFFICIENT) is non-zero for HOST_SUFFICIENT alone */
                     +#define DO_REVERSE_LOOKUP 4 /* only referenced from commented-out code in phishcheck.c as far as visible */
                     +#define CHECK_REDIR       8 /* only referenced from commented-out code in phishcheck.c as far as visible */
                     +#define CHECK_SSL         16 /* displayed link is SSL but real link is not -> CL_PHISH_SSL_SPOOF */
                     +#define CHECK_CLOAKING    32 /* "%00" in the real link / encoded displayed link checks */
                     +#define CLEANUP_URL       64 /* cleanupURLs() normalizes both links before comparing */
                     +#define CHECK_DOMAIN_REVERSE 128 /* only referenced from commented-out code in phishcheck.c as far as visible */
                     +#define CHECK_IMG_URL        256
                     +#define DOMAINLIST_REQUIRED  512 /* links not on the domain list end as CL_PHISH_HOST_NOT_LISTED */
                     +/* img checking disabled by default -- NOTE(review): CL_PHISH_ALL_CHECKS below does include CHECK_IMG_URL, confirm which one is intended */
+                    +
+                    +
                     +#define CL_PHISH_ALL_CHECKS (CLEANUP_URL|DOMAIN_SUFFICIENT|CHECK_SSL|CHECK_CLOAKING|DOMAINLIST_REQUIRED|CHECK_IMG_URL) /* all flag values fit into the unsigned short url_check.flags */
+                    +
                     +struct string {
                     +	int refcount; /* presumably the number of references held on this string -- TODO confirm in string_assign_ref()/string_free() */
                     +	struct string* ref; /* presumably the string whose buffer is shared, if any -- TODO confirm */
                     +	char* data; /* the text itself; NULL when the string holds nothing (callers test for this) */
                     +};
+                    +
                     +struct url_check {
                     +	struct string realLink; /* the link target, e.g. the href value */
                     +	struct string displayLink; /* the text shown to the user for that link */
                     +	unsigned short       flags; /* bit mask of the check flags (HOST_SUFFICIENT ... DOMAINLIST_REQUIRED) */
                     +};
+                    +
                     +int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs);
                     +enum phish_status phishingCheck(struct url_check* urls); /* urls must not be NULL; returns the verdict for one link pair */
+                    +
                     +int whitelist_check(struct url_check* urls,int hostOnly); /* forwards both link texts and hostOnly to whitelist_match() */
                     +void url_check_init(struct url_check* urls);
                     +void get_host(struct string* dest,const char* URL,int isReal,int* phishy);
                     +void string_free(struct string* str);
                     +void string_assign(struct string* dest,struct string* src);
                     +void string_assign_c(struct string* dest,char* data);
                     +void string_init_c(struct string* dest,char* data);
                     +void string_assign_dup(struct string* dest,const char* start,const char* end);
                     +void string_assign_null(struct string* dest);
                     +void string_assign_ref(struct string* dest,struct string* ref,char* data);
                     +void free_if_needed(struct url_check* url);
                     +void get_host(struct string* dest,const char* URL,int isReal,int* phishy); /* NOTE(review): duplicate of the get_host declaration a few lines up; harmless but redundant */
                     +int isCountryCode(const char* str);
                     +int isTLD(const char* str,int len);
                     +char* rfind(char* start,char c,size_t len);
                     +void get_domain(struct string* dest,struct string* host);
                     +int ip_reverse(struct url_check* urls,int isReal);
                     +void reverse_lookup(struct url_check* url,int isReal);
                     +int isNumeric(const char* host);
                     +int isSSL(const char* URL);
                     +void cleanupURL(struct string* URL,int isReal);
                     +void get_redirected_URL(struct string* URL);
                     +int isURL(const char* URL); /* 1 = matches the URL pattern, 0 = no match or NULL, -1 = pattern could not be built */
                     +enum phish_status cleanupURLs(struct url_check* urls); /* CL_PHISH_CLEANUP_OK if both links are equal after cleanup, else CL_PHISH_NODECISION */
                     +int isNumericURL(const char* URL); /* 1 = matches the numeric URL pattern, 0 = no match or NULL, -1 = pattern could not be built */
                     +enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy); /* CL_PHISH_NODECISION means: host extracted, keep checking */
                     +void url_get_domain(struct url_check* url,struct url_check* domains);
                     +enum phish_status phishy_map(int phishy,enum phish_status fallback); /* maps the phishy bits to a verdict, fallback if none applies */
                     +int isEncoded(const char* url); /* meant to detect URLs made up mostly of &#...; sequences */
                     +void phishing_done(void); /* frees the cached regexes, the lists and the composed URL pattern */
+                    +
                     +/*
                     + * Classifies a verdict: returns 0 for the codes listed below (clean,
                     + * whitelisted, matching or undecidable links) and 1 for everything else.
                     + * "Everything else" covers CL_PHISH_HEX_URL, CL_PHISH_CLOAKED_NULL,
                     + * CL_PHISH_SSL_SPOOF, CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP,
                     + * CL_PHISH_NOMATCH and any value without an entry here.
                     + */
                     +static inline int isPhishing(enum phish_status rc)
                     +{
                     +	switch(rc) {
                     +		case CL_PHISH_CLEAN:
                     +		case CL_PHISH_CLEANUP_OK:
                     +		case CL_PHISH_WHITELISTED:
                     +		case CL_PHISH_HOST_WHITELISTED:
                     +		case CL_PHISH_HOST_OK:
                     +		case CL_PHISH_DOMAIN_OK:
                     +		case CL_PHISH_REDIR_OK:
                     +		case CL_PHISH_HOST_REDIR_OK:
                     +		case CL_PHISH_DOMAIN_REDIR_OK:
                     +		case CL_PHISH_HOST_REVERSE_OK:
                     +		case CL_PHISH_DOMAIN_REVERSE_OK:
                     +		case CL_PHISH_MAILTO_OK:
                     +		case CL_PHISH_TEXTURL:
                     +		case CL_PHISH_HOST_NOT_LISTED:
                     +		case CL_PHISH_CLEAN_CID:
                     +			return 0;
                     +		default:
                     +			break;
                     +	}
                     +	return 1;
                     +}
                     +const char* phishing_ret_toString(enum phish_status rc);
                     +#endif
+                    +
                     +#endif

clamav-devel/libclamav/readdb.c

History View file @ bd912dd

@@ -42,10 +42,8 @@
                      #include "defaults.h"
                      #ifdef CL_EXPERIMENTAL
                     -/*
                      #include "phish_whitelist.h"
                      #include "phish_domaincheck_db.h"
                     -*/
                      #endif
@@ -1094,7 +1092,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in
                      #endif
                      	    skipped = 1;
                      #ifdef CL_EXPERIMENTAL
                     -/*
                          } else if(cli_strbcasestr(filename, ".wdb")) {
                      	if(!(options & CL_SCAN_NOPHISHING))
                      	    ret = cli_loadwdb(fd, options);
@@ -1105,7 +1102,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in
                      	    ret = cli_loadpdb(fd, options);
                      	else
                      	    skipped = 1;
                     -*/
                      #endif
                          } else {
                      	cli_dbgmsg("cli_load: unknown extension - assuming old database format\n");
@@ -1172,10 +1168,8 @@ static int cli_loaddbdir(const char *dirname, struct cl_engine **engine, unsigne
                      	     cli_strbcasestr(dent->d_name, ".zmd")  ||
                      	     cli_strbcasestr(dent->d_name, ".rmd")  ||
                      #ifdef CL_EXPERIMENTAL
                     -/*
                      	     cli_strbcasestr(dent->d_name, ".pdb")  ||
                      	     cli_strbcasestr(dent->d_name, ".wdb")  ||
                     -*/
                      #endif
                      	     cli_strbcasestr(dent->d_name, ".hw")  ||
                      	     cli_strbcasestr(dent->d_name, ".inc")  ||
@@ -1294,10 +1288,8 @@ int cl_statinidir(const char *dirname, struct cl_stat *dbstat)
                      	    cli_strbcasestr(dent->d_name, ".zmd")  ||
                      	    cli_strbcasestr(dent->d_name, ".rmd")  ||
                      #ifdef CL_EXPERIMENTAL
                     -/*
                      	    cli_strbcasestr(dent->d_name, ".pdb")  ||
                      	    cli_strbcasestr(dent->d_name, ".wdb")  ||
                     -*/
                      #endif
                      	    cli_strbcasestr(dent->d_name, ".hw")   ||
                      	    cli_strbcasestr(dent->d_name, ".inc")   ||
@@ -1374,10 +1366,8 @@ int cl_statchkdir(const struct cl_stat *dbstat)
                      	    cli_strbcasestr(dent->d_name, ".zmd")  ||
                      	    cli_strbcasestr(dent->d_name, ".rmd")  ||
                      #ifdef CL_EXPERIMENTAL
                     -/*
                      	    cli_strbcasestr(dent->d_name, ".pdb")  ||
                      	    cli_strbcasestr(dent->d_name, ".wdb")  ||
                     -*/
                      #endif
                      	    cli_strbcasestr(dent->d_name, ".hw")   ||
                      	    cli_strbcasestr(dent->d_name, ".inc")   ||

clamav-devel/libclamav/regex_list.c

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,1521 @@
                     +/*
                     + *  Match a string against a list of patterns/regexes.
                     + *
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + *  $Log: regex_list.c,v $
                     + *  Revision 1.1  2006/09/12 19:38:39  acab
                     + *  Phishing module merge - libclamav
                     + *
                     + *  Revision 1.13  2006/09/11 19:25:08  edwin
                     + *  Non-printable characters in regex (although they are invalid inside an url, added some support for it).
                     + *
                     + *  Revision 1.12  2006/08/28 08:43:06  edwin
                     + *  Fixed a few minor leaks.
                     + *  Valgrind now says:"All heap blocks were freed -- no leaks are possible"
                     + *
                     + *  Revision 1.11  2006/08/20 21:18:11  edwin
                     + *  Added the script used to generate iana_tld.sh
                     + *  Added checks for phish_domaincheck_db
                     + *  Added phishing module design document from wiki (as discussed with aCaB).
                     + *  Updated .wdb/.pdb format documentation (in regex_list.c)
                     + *  Fixed some memory leaks in regex_list.c
                     + *  IOW: cleanups before the deadline.
                     + *  I consider my module to be ready for evaluation now.
                     + *
                     + *  Revision 1.10  2006/08/20 19:42:02  edwin
                     + *  Fix custom character class, and generic regex handling.
                     + *
                     + *  Revision 1.9  2006/08/19 21:08:47  edwin
                     + *  Fixed:Forgot to add form tag handling when it contains images.
                     + *  Various fixes to get rid of gcc warnings.
                     + *
                     + *  Revision 1.8  2006/08/19 09:26:51  edwin
                     + *  regex_list.c: Fixed regex alternatives handling (bug discovered with autotests).
                     + *  And forgot to commit manager.c last time.
                     + *
                     + *  Revision 1.7  2006/08/17 20:31:43  edwin
                     + *  Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off.
                     + *  Fix a still reachable leak. Remove unneeded build_regex_list export.
                     + *
                     + *  Revision 1.6  2006/08/12 14:35:34  edwin
                     + *  Fix some compiler warnings.
                     + *  Fix an assertion failure in regex_list.
                     + *  Interpret display links that start with http|https|ftp, always as an URL.
                     + *
                     + *  Revision 1.5  2006/08/06 20:27:07  edwin
                     + *  New option to enable phish scan for all domains (disabled by default).
                     + *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
                     + *  Updated phishcheck control flow to better incorporate the domainlist.
                     + *  Updated manpage with new options.
                     + *
                     + *  TODO:there is a still-reachable leak in regex_list.c
                     + *
                     + *  Revision 1.4  2006/08/01 20:19:15  edwin
                     + *  Integrate domainlist check into phishcheck. Warning: enabled by default.
                     + *  Regex bracket handling update.
                     + *  Better regex paranthesized & alternate expression handling.
                     + *
                     + *  Revision 1.3  2006/07/31 20:12:30  edwin
                     + *  Preliminary support for domain databases (domains to check by phishmodule)
                     + *  Better memory allocation failure handling in regex_list
                     + *
                     + */
+                    +
                     +#if HAVE_CONFIG_H
                     +#include "clamav-config.h"
                     +#endif
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef CL_DEBUG
                     +#define NDEBUG
                     +#endif
+                    +
                     +#ifdef CL_THREAD_SAFE
                     +#ifndef _REENTRANT
                     +#define _REENTRANT
                     +#endif
                     +#endif
+                    +
                     +#include <stdio.h>
                     +#include <stdlib.h>
                     +#include <errno.h>
                     +#include <assert.h>
                     +#include <string.h>
                     +#include <strings.h>
                     +#include <ctype.h>
+                    +
                     +#include <limits.h>
                     +#include <sys/types.h>
+                    +
                     +/*#define USE_PCRE*/
                     +#include <regex.h>
+                    +
                     +#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
                     +#include <stddef.h>
                     +#endif
+                    +
                     +#include "clamav.h"
                     +#include "others.h"
                     +#include "defaults.h"
                     +#include "str.h"
                     +#include "filetypes.h"
                     +#include "mbox.h"
                     +#include "regex_list.h"
                     +#include "matcher-ac.h"
+                    +
+                    +
                     +/* Tree */
                     +/* Kinds of node that can appear in the pattern tree (see the notes above struct tree_node). */
                     +enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
                     +/* Pointer to a character bitmap, as referenced by OP_CUSTOMCLASS nodes. */
                     +typedef char* char_bitmap_p;
                     +/*
                     + * Meaning of the node operations:
                     + *
                     + * OP_CHAR: 1 character, c = character
                     + * complex stuff:
                     + * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
                     + * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
                     + * OP_DOT: single . matching any character except \n
                     + * OP_LEAF: this is a leaf node, reinterpret structure
                     + *
                     + * NOTE(review): OP_ROOT and OP_PARCLOSE are not described here; presumably they mark
                     + * the tree root and a closing parenthesis - confirm against the pattern parser.
                     + */
                     +struct tree_node {
                     +	enum token_op_t op;/* what kind of node this is, selects how the fields below are interpreted */
                     +	unsigned char c;/* the character for OP_CHAR, the class for OP_STDCLASS */
                     +	char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
                     +	char listend;/* no more siblings, next pointer is pointer to parent*/
                     +	struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
                     +	union {
                     +		struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
                     +		char_bitmap_p* bitmap;/* view used for OP_CUSTOMCLASS: the first slot points to the class bitmap */
                     +		struct leaf_info*  leaf;/* view used for OP_LEAF nodes */
                     +	} u;
                     +};
+                    +
                     +/* Payload of an OP_LEAF node. */
                     +struct leaf_info {
                     +	char* info;/* what does it mean that we reached the leaf...*/
                     +	regex_t* preg;/* this is NULL if leaf node, and non-regex*/
                     +};
+                    +
                     +/* Character classes */
                     +/* The bracket classes known to the matcher; the enumerators are listed in the same order as the rows of std_class[]. */
                     +enum wctype_t {ALNUM,DIGIT,PUNCT,ALPHA,GRAPH,SPACE,BLANK,LOWER,UPPER,CNTRL,PRINT,XDIGIT};
                     +/* Maps the textual name of a class, as written inside a pattern, to its enumerator. */
                     +static struct std_classmap {
                     +		const char* classname;
                     +		const enum wctype_t type;
                     +} std_class[] = {
                     +	{"[:alnum:]",ALNUM},
                     +	{"[:digit:]",DIGIT},
                     +	{"[:punct:]",PUNCT},
                     +	{"[:alpha:]",ALPHA},
                     +	{"[:graph:]",GRAPH},
                     +	{"[:space:]",SPACE},
                     +	{"[:blank:]",BLANK},
                     +	{"[:lower:]",LOWER},
                     +	{"[:upper:]",UPPER},
                     +	{"[:cntrl:]",CNTRL},
                     +	{"[:print:]",PRINT},
                     +	{"[:xdigit:]",XDIGIT}
                     +};
+                    +
                     +/* Number of rows in std_class[], as a variable ... */
                     +static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
                     +/* ... and as a constant expression, usable as an array size.
                     + * Parenthesized so that it keeps its value inside a larger expression (e.g. x % STD_CLASS_CNT). */
                     +#define STD_CLASS_CNT (sizeof(std_class)/sizeof(std_class[0]))
                     +/* Storage for one character bitmap: 32 bytes, presumably one bit per possible unsigned char value. */
                     +typedef char char_bitmap_t[32];
                     +/* One bitmap pointer per standard class; filled in elsewhere in this file. */
                     +static char_bitmap_p char_class_bitmap[STD_CLASS_CNT];
                     +/* One entry per character value; presumably a mask with bit i set when the character
                     + * belongs to std_class[i] - TODO confirm against the code that fills it. */
                     +static unsigned short int char_class[256];
+                    +
                     +/* Prototypes */
                     +static void setup_matcher_engine(void);
                     +static void matcher_engine_done(void);
                     +static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info);
                     +static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
                     +static void destroy_tree(struct regex_matcher* matcher);
+                    +
+                    +
                     +#define MATCH_SUCCESS 0
                     +#define MATCH_FAILED  -1
+                    +
+                    +
                     +/*
                     + * Call this function when an unrecoverable error has occurred, (instead of exit).
                     + * It releases @matcher through regex_list_done() and then marks it as failed by
                     + * setting list_inited to -1. The order matters: regex_list_done() resets
                     + * list_inited to 0, so the failure marker has to be written afterwards.
                     + */
                     +static void fatal_error(struct regex_matcher* matcher)
                     +{
                     +	regex_list_done(matcher);
                     +	matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
                     +}
+                    +
+                    +
                     +/*
                     + * Look up the pair (@real_url, @display_url) in the lists held by @matcher.
                     + * The two strings are joined into "real_url display_url" (a single space in between)
                     + * and scanned against the host list (@hostOnly set) or the url list (@hostOnly clear).
                     + * If the url list reports nothing, the regex tree is tried as well.
                     + *
                     + * @matcher - matcher structure to use
                     + * @real_url - href target
                     + * @display_url - <a> tag contents
                     + * @hostOnly - if you want to match only the host part
                     + * @info - handed to the matchers; presumably receives the info string of the matching entry
                     + *
                     + * @return - CL_SUCCESS - url doesn't match (0 is also returned when the list was never initialized)
                     + *         - CL_VIRUS - url matches list
                     + *         - CL_EMEM - the lookup buffer could not be allocated
                     + *
                     + * Do not send NULL pointers to this function!!
                     + *
                     + */
                     +int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info)
                     +{
                     +	size_t real_len,display_len,buffer_len;
                     +	char*  buffer;
                     +	int partcnt,rc;
                     +	unsigned long int partoff;
+                    +
                     +	assert(matcher);
                     +	assert(real_url);
                     +	assert(display_url);
                     +	assert(info);
                     +	if(!matcher->list_inited)
                     +		return 0;/* no list available: report "no match" */
                     +	assert(matcher->list_built);
+                    +
                     +	real_len    = strlen(real_url);
                     +	display_len = strlen(display_url);
                     +	buffer_len  = real_len + display_len + 1;/* +1: the separating space */
                     +	buffer = cli_malloc(buffer_len+1);/* +1: the terminating NUL */
                     +	if(!buffer)
                     +		return CL_EMEM;
+                    +
                     +	strncpy(buffer,real_url,real_len);
                     +	buffer[real_len]=' ';
                     +	strncpy(buffer+real_len+1,display_url,display_len);
                     +	buffer[buffer_len]=0;
                     +	/* the format string has a %s conversion, so the string has to be passed along */
                     +	cli_dbgmsg("Looking up in regex_list: %s\n",buffer);
+                    +
                     +	rc = cli_ac_scanbuff(buffer,buffer_len,info,hostOnly ? matcher->root_hosts : matcher->root_urls,&partcnt,0,0,&partoff,0,-1,NULL);
                     +	/* plain patterns found nothing: fall back to the regex tree, but only for full-url lookups */
                     +	if(!rc && !hostOnly)
                     +		rc = match_node(matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
                     +	free(buffer);
                     +	if(!rc)
                     +		cli_dbgmsg("not in regex list\n");
                     +	return rc;
                     +}
+                    +
                     +static struct tree_node* tree_root_alloc(void);
+                    +
+                    +
                     +/* node stack */
                     +#define NODE_STACK_INITIAL 1024
                     +#define NODE_STACK_GROW    4096
                     +/* Prepare @stack for use: no elements yet, room for NODE_STACK_INITIAL nodes.
                     + * Returns CL_SUCCESS, or CL_EMEM when the backing array can't be allocated. */
                     +static int stack_init(struct node_stack* stack)
                     +{
                     +	assert(stack);
+                    +
                     +	stack->capacity = NODE_STACK_INITIAL;
                     +	stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
                     +	stack->cnt = 0;
+                    +
                     +	return stack->data ? CL_SUCCESS : CL_EMEM;
                     +}
+                    +
                     +/* Empty @stack: only the element count is cleared, the array and its capacity are kept */
                     +static void stack_reset(struct node_stack* stack)
                     +{
                     +	assert(stack);
                     +	stack->cnt = 0;
                     +}
+                    +
                     +/* Push @node on @stack, growing it by NODE_STACK_GROW slots if necessary.
                     + * Returns CL_SUCCESS, or CL_EMEM when the stack is full and can't be enlarged;
                     + * in that case @stack is left untouched (same array, same capacity, same contents). */
                     +static inline int stack_push(struct node_stack* stack,struct tree_node* node)
                     +{
                     +	assert(stack);
                     +	assert(stack->data);
+                    +
                     +	if(stack->cnt == stack->capacity) {
                     +		/* Keep the result in a temporary: assuming cli_realloc() follows realloc() semantics,
                     +		 * the old array stays valid on failure and must not be overwritten with NULL. */
                     +		void* grown = cli_realloc(stack->data,(stack->capacity + NODE_STACK_GROW)*sizeof(*stack->data));
                     +		if(!grown)
                     +			return CL_EMEM;
                     +		stack->data = grown;
                     +		/* only record the new capacity once the array really is that large */
                     +		stack->capacity += NODE_STACK_GROW;
                     +	}
                     +	stack->data[stack->cnt++] = node;
                     +	return CL_SUCCESS;
                     +}
+                    +
                     +/* Remove and return the top node of @stack; the array is never shrunk.
                     + * Popping from an empty stack trips an assertion, or yields NULL when asserts are compiled out. */
                     +static inline struct tree_node* stack_pop(struct node_stack* stack)
                     +{
                     +	assert(stack);
                     +	assert(stack->data);
                     +	assert(stack->cnt);/*don't pop from empty stack */
+                    +
                     +	if(!stack->cnt)
                     +		return NULL;
+                    +
                     +	stack->cnt--;
                     +	return stack->data[stack->cnt];
                     +}
+                    +
                     +/* Initialization & loading */
+                    +
                     +/* Initializes @matcher, allocating necessary substructures:
                     + * the host and url pattern matchers (each with its root node), the two node stacks
                     + * and the root of the regex tree.
                     + *
                     + * Returns CL_SUCCESS with list_inited set to 1, or CL_EMEM when any allocation fails.
                     + * On failure everything allocated here is released again, the freed pointers are
                     + * reset to NULL and list_inited stays 0. */
                     +int init_regex_list(struct regex_matcher* matcher)
                     +{
                     +	assert(matcher);
+                    +
                     +	setup_matcher_engine();
+                    +
                     +	matcher->list_inited = 0;
                     +	matcher->root_hosts = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_hosts));
                     +	if(!matcher->root_hosts)
                     +		return CL_EMEM;
+                    +
                     +	matcher->root_hosts->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
                     +	if(!matcher->root_hosts->ac_root)
                     +		goto fail_hosts;
+                    +
                     +	matcher->root_urls = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_urls));
                     +	if(!matcher->root_urls)
                     +		goto fail_hosts_ac;
+                    +
                     +	matcher->root_urls->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
                     +	if(!matcher->root_urls->ac_root)
                     +		goto fail_urls;
+                    +
                     +	/* The stacks are set up before the tree root, so that every failure below
                     +	 * can be undone with plain free() calls. Their result must be checked:
                     +	 * stack_push()/stack_pop() rely on a valid data array. */
                     +	if(stack_init(&matcher->node_stack) != CL_SUCCESS)
                     +		goto fail_urls_ac;
                     +	if(stack_init(&matcher->node_stack_alt) != CL_SUCCESS)
                     +		goto fail_stack;
+                    +
                     +	matcher->root_regex = tree_root_alloc();
                     +	if(!matcher->root_regex)
                     +		goto fail_stack_alt;
+                    +
                     +	matcher->list_inited=1;
                     +	matcher->list_built=0;
                     +	matcher->list_loaded=0;
+                    +
                     +	return CL_SUCCESS;
+                    +
                     +	/* Unwind in reverse order of allocation; each label falls through to the next one. */
                     +fail_stack_alt:
                     +	free(matcher->node_stack_alt.data);
                     +	matcher->node_stack_alt.data = NULL;
                     +fail_stack:
                     +	free(matcher->node_stack.data);
                     +	matcher->node_stack.data = NULL;
                     +fail_urls_ac:
                     +	free(matcher->root_urls->ac_root);
                     +fail_urls:
                     +	free(matcher->root_urls);
                     +	matcher->root_urls = NULL;
                     +fail_hosts_ac:
                     +	free(matcher->root_hosts->ac_root);
                     +fail_hosts:
                     +	free(matcher->root_hosts);
                     +	matcher->root_hosts = NULL;
                     +	return CL_EMEM;
                     +}
+                    +
                     +/* inserts @pattern into @root, using ac-matcher
                     + * although the name might be confusing, @pattern is not a regex!
                     + *
                     + * @root    - matcher that receives the new pattern
                     + * @pattern - NUL terminated literal; its bytes (without the NUL) become the pattern
                     + * @info    - string attached to the pattern as its virname. It is copied: the callers
                     + *            in this file pass a pointer into a line buffer on their stack, which
                     + *            must neither be kept after they return nor be handed to free().
                     + *
                     + * Returns CL_SUCCESS, CL_EMEM on allocation failure, or the error code reported by
                     + * cli_ac_addpatt(). Nothing allocated here survives a failure. */
                     +static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
                     +{
                     +	int ret;
                     +	struct cli_ac_patt *new;
                     +	size_t len;
+                    +
                     +	assert(root);
                     +	assert(pattern);
+                    +
                     +	new = cli_calloc(1,sizeof(*new));
                     +	if(!new)
                     +		return CL_EMEM;
+                    +
                     +	len = strlen(pattern);
                     +	new->type = 0;
                     +	new->sigid = 0;
                     +	new->parts = 0;
                     +	new->partno = 0;
                     +	new->mindist = 0;
                     +	new->maxdist = 0;
                     +	new->offset = 0;
                     +	new->target = 0;
                     +	new->length = len;
                     +	if(new->length > root->maxpatlen)
                     +		root->maxpatlen = new->length;
+                    +
                     +	new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
                     +	if(!new->pattern) {
                     +		free(new);
                     +		return CL_EMEM;
                     +	}
                     +	strncpy((char*)new->pattern,(const char*)pattern,len);
+                    +
                     +	/* give the pattern its own heap copy of @info, so that free(new->virname) is always legal */
                     +	if(info) {
                     +		const size_t info_size = strlen(info) + 1;
+                    +
                     +		new->virname = cli_malloc(info_size);
                     +		if(!new->virname) {
                     +			free(new->pattern);
                     +			free(new);
                     +			return CL_EMEM;
                     +		}
                     +		strncpy(new->virname,info,info_size);/* info_size includes the NUL */
                     +	}
                     +	else
                     +		new->virname = NULL;
+                    +
                     +	if((ret = cli_ac_addpatt(root,new))) {
                     +		free(new->virname);
                     +		free(new->pattern);
                     +		free(new);
                     +		return ret;
                     +	}
                     +	return CL_SUCCESS;
                     +}
+                    +
+                    +
                     +#ifndef NDEBUG
                     +void dump_tree(struct tree_node* root);
                     +#endif
                     +static int matcher_engine_refcount=0;
+                    +
                     +static int build_regex_list(struct regex_matcher* matcher);
                     +/* Load patterns/regexes from file
                     + *
                     + * @matcher - matcher to fill; it is initialized here if that has not happened yet
                     + * @fd      - open stream to read the list from, one entry per line
                     + * @options - not used by the code in this function
                     + *
                     + * Returns CL_SUCCESS once all lines are added and the list is built.
                     + * Returns -1 when the matcher is marked as failed, is already loaded, @fd is NULL
                     + * or initialization fails; CL_EMALFDB for a malformed line, a rejected entry or
                     + * a failed build; CL_EMEM when adding an entry runs out of memory. */
                     +int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options)
                     +{
                     +	int rc,line=0;
                     +	char buffer[FILEBUFF];
+                    +
                     +	assert(matcher);
                     +	assert(fd);
+                    +
                     +	/* -1 is the marker left behind by fatal_error(): an earlier load failed */
                     +	if(matcher->list_inited==-1)
                     +		return -1;
                     +	if(matcher->list_loaded) {
                     +		cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
                     +		return -1;/*TODO: better return code*/
                     +	}
                     +	/* only reachable when the assert above is compiled out (NDEBUG) */
                     +	if(!fd) {
                     +		cli_errmsg("Unable to load regex list (null file)\n");
                     +		return -1;/*TODO: return appropriate return code*/
                     +	}
+                    +
                     +	cli_dbgmsg("Loading regex_list\n");
                     +	if(!matcher->list_inited) {
                     +		/* the return value is not looked at; success is detected through list_inited */
                     +		init_regex_list(matcher);
                     +		if (!matcher->list_inited) {
                     +			cli_errmsg("Regex list failed to initialize!\n");
                     +			fatal_error(matcher);
                     +			return -1;
                     +		}
                     +		/*atexit(regex_list_done); TODO: destroy this in manager.c */
                     +	}
                     +	/*
                     +	 * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
                     +	 * Multiple lines of form, (empty lines are skipped):
                     + 	 * Flags RealURL DisplayedURL
                     +	 * Where:
                     +	 * Flags: R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing
                     +	 * flags that should be filtered.
                     +	 * [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
                     +	 * Note:Flag filtering only makes sense in .pdb files.
                     +	 *
                     +	 * If a line in the file doesn't conform to this format, loading fails
                     +	 *
                     +	 */
                     +	while(fgets(buffer,FILEBUFF,fd)) {
                     +		char* pattern;
                     +		char* flags;
                     +		line++;
                     +		cli_chomp(buffer);
                     +		if(!*buffer)
                     +			continue;/* skip empty lines */
                     +		/* the first space separates the flags field from the pattern */
                     +		pattern = strchr(buffer,' ');
                     +		if(!pattern) {
                     +			cli_errmsg("Malformed regex list line %d\n",line);
                     +			fatal_error(matcher);
                     +			return CL_EMALFDB;
                     +		}
                     +		/* split the line in place: buffer[0] is the type letter, flags is the rest of
                     +		 * the first field, pattern is everything after the first space.
                     +		 * NOTE(review): flags points into the local buffer, which is overwritten by the
                     +		 * next fgets() and is gone when this function returns - a callee that keeps
                     +		 * the pointer has to copy the string. */
                     +		pattern[0]='\0';
                     +		flags=buffer+1;
                     +		pattern++;
                     +		/* NOTE(review): unlike the malformed-line case above, the three error returns
                     +		 * below leave without calling fatal_error() - confirm whether that is intended. */
                     +		if(buffer[0] == 'R') {
                     +			/* regex entry: goes into the regex tree */
                     +			if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags) ))
                     +				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
                     +		}
                     +		else if(buffer[0] == 'H') {
                     +			/* host-only entry: literal pattern in the host list */
                     +			if(( rc = add_regex_list_element(matcher->root_hosts,pattern,flags) ))
                     +				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
                     +		}
                     +		else {
                     +			/* any other type letter: literal pattern in the url list */
                     +			if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
                     +				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
                     +		}
                     +	}
                     +	/* list_loaded has to be set first: build_regex_list() refuses to run without it */
                     +	matcher->list_loaded = 1;
                     +	build_regex_list(matcher);
+                    +
                     +#ifndef NDEBUG
                     +/*			dump_tree(matcher->root_regex);*/
                     +#endif
                     +	/* the result of the build is read from list_built rather than from the return value */
                     +	if(!matcher->list_built) {
                     +		cli_errmsg("Regex list not loaded: build failed!\n");
                     +		fatal_error(matcher);
                     +		return CL_EMALFDB;
                     +	}
                     +	regex_list_cleanup(matcher);
                     +	matcher_engine_refcount++;
                     +	return CL_SUCCESS;
                     +}
+                    +
                     +/*
                     +static void tree_node_merge_nonbin(struct tree_node* into,const struct tree_node* node)
                     +{
                     +	assert(into);
                     +	assert(node);
+                    +
                     +	if(node->alternatives){
                     +		if(node->u.children[0]->next == node) {
                     +			*no non-bin alternatives here*
                     +		}
                     +		else {
                     +			struct tree_node* p;
                     +			for(p = node->u.children[0]->next; p->next != node; p = p->next)
                     +				tree_node_insert_nonbin(into,p);
                     +		}
                     +	}
                     +	else
                     +		tree_node_insert_nonbin(into,node->u.children[0]);
                     +}
                     +*
                     +static void tree_node_merge_bin(struct tree_node* into,const struct tree_node* node)
                     +{
                     +	if(node->u.children && node->alternatives) {
                     +		if(!into->alternatives) {
                     +			* into has no bin part, just copy+link the node there*
                     +			int i;
                     +			struct tree_node* next = into->u.children[0];
                     +			into->u.children = node->u.children;
                     +			into->alternatives = node->alternatives;
                     +			for(i=0;i < into->alternatives;i++) {
                     +				if(into->u.children[i]->next == node) {
                     +					into->u.children[i]->next = next;
                     +					into->u.children[i]->listend = 0;
                     +				}
                     +				else {
                     +					struct tree_node* p;
                     +					for(p = into->u.children[0]->next; p->next != node; p = p->next);
                     +					p->listend = 0;
                     +					p->next = next;
                     +				}
                     +			}
                     +		}
                     +		const size_t new_size = tree_node_get_array_size(into) + tree_node_get_array_size(node);
                     +		struct tree_node** new_children = cli_malloc(sizeof(
                     +	}
                     +	* else: no bin part to merge *
                     +}
                     +*/
+                    +
                     +/* Returns the array of child pointers of @node.
                     + * For an OP_CUSTOMCLASS node the first slot of the array is skipped (it is the class
                     + * bitmap pointer, not a child); NULL is returned if the slot after it is empty. */
                     +static struct tree_node ** tree_node_get_children(const struct tree_node* node)
                     +{
                     +	if(node->op != OP_CUSTOMCLASS)
                     +		return node->u.children;
+                    +
                     +	if(node->u.children[1])
                     +		return node->u.children+1;
+                    +
                     +	return NULL;
                     +}
                     +/* don't do this, it wastes too much memory, and has no benefit
                     +static void regex_list_dobuild(struct tree_node* called_from,struct tree_node* node)
                     +{
                     +	struct tree_node **children;
                     +	assert(node);
+                    +
                     +	children = tree_node_get_children(node);
                     +	if(node->op!=OP_ROOT)
                     +		assert(called_from);
                     +	if(node->op==OP_TMP_PARCLOSE) {
                     +		const size_t array_size = (node->alternatives +(called_from->op==OP_CUSTOMCLASS ? 1:0))*sizeof(*called_from->u.children);
                     +		if(node->c)
                     +			return;* already processed this common node*
                     +		else
                     +			node->c = 1;
                     +		* copy children to called_from from this node
                     +		 * called_from should have 0 alternatives, and a link to this node via ->u.children[0]
                     +		 * *
                     +		assert(called_from->alternatives == 0);
                     +		assert(called_from->u.children);
                     +		assert(called_from->u.children[0] == node);
                     +		called_from->u.children = cli_realloc(called_from->u.children,array_size);
                     +		called_from->u.children = node->u.children;
                     +		called_from->alternatives = node->alternatives;
                     +		if(called_from->alternatives) {
                     +			* fix parent pointers *
                     +			int i;TODO: do a deep copy of children here
                     +			struct tree_node **from_children = tree_node_get_children(called_from);
                     +                        assert(from_children);
                     +			for(i=0;i < called_from->alternatives;i++) {
                     +				struct tree_node* p;
                     +				for(p=from_children[i];p->next != node; p = p->next);
                     +				p->next = called_from;
                     +			}
                     +		}
                     +	}
+                    +
                     +	if(node->op==OP_LEAF)
                     +	return;
                     +	else if (node->alternatives) {
                     +		int i;
                     +		struct tree_node* p;
                     +		assert(children);
                     +		p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
                     +		for(i=0;i<node->alternatives;i++)
                     +			regex_list_dobuild(node,children[i]);
                     +		if(p && p!=node)
                     +			regex_list_dobuild(node,p);
                     +	} else {
                     +		if(children)
                     +			if (children[0])
                     +				regex_list_dobuild(node,children[0]);
                     +	}
                     +	if(node->next && !node->listend)
                     +		regex_list_dobuild(node,node->next);
                     +	if(node->op==OP_TMP_PARCLOSE)
                     +		node->c=0;
                     +	*free(node);*
                     +}
                     +*/
                     +/* Build the matcher list: runs cli_ac_buildtrie() on the host and url pattern lists
                     + * and sets list_built.
                     + * Returns CL_SUCCESS, or -1 when @matcher has not been both initialized and loaded. */
                     +static int build_regex_list(struct regex_matcher* matcher)
                     +{
                     +	if(matcher->list_inited && matcher->list_loaded) {
                     +		cli_dbgmsg("Building regex list\n");
                     +		cli_ac_buildtrie(matcher->root_hosts);
                     +		cli_ac_buildtrie(matcher->root_urls);
                     +		matcher->list_built=1;
+                    +
                     +		return CL_SUCCESS;
                     +	}
+                    +
                     +	cli_errmsg("Regex list not loaded!\n");
                     +	return -1;/*TODO: better error code */
                     +}
+                    +
+                    +
                     +static void stack_destroy(struct node_stack* stack);
                     +/* Done with this matcher, free resources.
                     + * Resets the two node stacks, then (only if a list was loaded) frees both
                     + * matcher roots and the tree, then (only if the list was initialised) drops
                     + * this matcher's reference on the shared matcher engine.
                     + * matcher must not be NULL. */
                     +void regex_list_done(struct regex_matcher* matcher)
                     +{
                     +	assert(matcher);
+                    +
                     +	regex_list_cleanup(matcher); /* destroys and re-initialises node_stack / node_stack_alt */
                     +	if(matcher->list_loaded) {
                     +		cli_ac_free(matcher->root_hosts);
                     +		free(matcher->root_hosts); /* the root struct itself is freed separately from cli_ac_free() */
                     +		matcher->root_hosts=NULL;
+                    +
                     +		cli_ac_free(matcher->root_urls);
                     +		free(matcher->root_urls);
                     +		matcher->root_urls=NULL;
+                    +
                     +		matcher->list_built=0;
                     +		destroy_tree(matcher);
                     +		matcher->list_loaded=0;
                     +	}
                     +	if(matcher->list_inited) {
                     +		matcher_engine_done(); /* decrements matcher_engine_refcount */
                     +		matcher->list_inited=0;
                     +	}
                     +	stack_destroy(&matcher->node_stack); /* final release of the stacks re-initialised above */
                     +	stack_destroy(&matcher->node_stack_alt);
                     +}
+                    +
                     +/* Tree matcher algorithm */
+                    +
                     +/*
                     + * Tell whether character c belongs to the character class `type`.
                     + * Returns non-zero when it does, 0 when it does not or when `type` is unknown
                     + * (a warning is logged in the latter case).
                     + */
                     +static int cli_iswctype(const char c,const enum wctype_t type)
                     +{
                     +	/* The <ctype.h> classifiers are only defined for EOF and values
                     +	 * representable as unsigned char; a negative plain char is undefined
                     +	 * behaviour, so convert first. */
                     +	const int uc = (unsigned char)c;
+                    +
                     +	switch(type) {
                     +		case ALNUM:
                     +			return isalnum(uc);
                     +		case DIGIT:
                     +			return isdigit(uc);
                     +		case PUNCT:
                     +			return ispunct(uc);
                     +		case ALPHA:
                     +			return isalpha(uc);
                     +		case GRAPH:
                     +			return isgraph(uc);
                     +		case SPACE:
                     +			return isspace(uc);
                     +		case BLANK:
                     +			return c=='\t' || c==' ';
                     +		case LOWER:
                     +			return islower(uc);
                     +		case UPPER:
                     +			return isupper(uc);
                     +		case CNTRL:
                     +			return iscntrl(uc);
                     +		case PRINT:
                     +			return isprint(uc);
                     +		case XDIGIT:
                     +			return isxdigit(uc);
                     +		default: {
                     +				 cli_warnmsg("Unknown char class in iswctype\n");
                     +				 return 0;
                     +			 }
                     +	}
                     +}
+                    +
                     +/* non-zero once setup_matcher_engine() has filled the character class tables */
                     +static int engine_inited=0;
+                    +
                     +/*
                     + * Fill char_class[] and char_class_bitmap[] for every entry of std_class[].
                     + * Does nothing when the tables are already set up.  If a bitmap cannot be
                     + * allocated, the bitmaps allocated by this call are released again and
                     + * engine_inited stays 0.
                     + */
                     +static void setup_matcher_engine(void)
                     +{
                     +	/*Set up std character classes*/
                     +	size_t i;
                     +	size_t j;
                     +	if(engine_inited)
                     +		return;
                     +	/* NOTE(review): clears 256 bytes; confirm this covers all of char_class[] */
                     +	memset(char_class,0,256);
                     +	for(i=0;i<std_class_cnt;i++) {
                     +		enum wctype_t type = std_class[i].type;
                     +		char_class_bitmap[i]=cli_calloc(256>>3,1);
                     +		if(!char_class_bitmap[i]) {
                     +			/* was: NULL dereference below; undo the partial setup instead */
                     +			cli_errmsg("Out of memory while setting up regex character classes\n");
                     +			while(i--) {
                     +				free(char_class_bitmap[i]);
                     +				char_class_bitmap[i]=NULL;
                     +			}
                     +			return;
                     +		}
                     +		for(j=0;j<256;j++)
                     +			if(cli_iswctype(j,type)) {
                     +				char_class[j] |= 1<<i;
                     +				char_class_bitmap[i][j>>3] |= 1<<(j&0x07);
                     +			}
                     +	}
                     +	engine_inited=1;
                     +}
+                    +
                     +/*
                     + * Drop one reference on the shared matcher engine.  When the last reference
                     + * goes away the per-class bitmaps are freed and the engine is marked as not
                     + * initialised, so that a later setup_matcher_engine() rebuilds them.
                     + */
                     +static void matcher_engine_done(void)
                     +{
                     +	size_t i;
                     +	matcher_engine_refcount--;
                     +	if(!matcher_engine_refcount) {
                     +		for(i=0;i<std_class_cnt;i++) {
                     +			free(char_class_bitmap[i]);
                     +			char_class_bitmap[i]=NULL;/* no dangling pointer, no double free */
                     +		}
                     +		/* Only clear the flag once the tables are really gone: clearing it
                     +		 * while other users remain would make the next setup allocate fresh
                     +		 * bitmaps over the live ones and leak them. */
                     +		engine_inited=0;
                     +	}
                     +}
+                    +
                     +struct token_t /* one lexical element produced by getNextToken() */
                     +{
                     +	size_t len; /* set to 1 by getNextToken() for TOKEN_CHAR tokens; not written for other kinds */
                     +	char   type; /* one of the TOKEN_* values */
                     +	union {
                     +		const unsigned char* start; /* TOKEN_CHAR: points at the literal character inside the pattern */
                     +		char_bitmap_p        bitmap; /* TOKEN_BRACKET: allocated bitmap; the receiver frees it or keeps it */
                     +	} u;
                     +};
+                    +
                     +enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE}; /* values of token_t.type; TOKEN_REGEX = hand the rest to regcomp(), TOKEN_DONE = end of pattern */
+                    +
                     +/*
                     + * Helper for getNextToken(): parse the inside of a bracket expression.
                     + * pat points just after the opening '['; bitmap is a 32 byte (256 bit) buffer
                     + * that receives one bit per matching byte value.
                     + * Returns a pointer to the closing ']', or NULL (after logging a warning) when
                     + * the expression is unterminated or uses a construct this parser does not
                     + * handle; the caller then leaves the expression to the regex engine.
                     + */
                     +static const unsigned char* parse_bracket_expr(const unsigned char* pat,char_bitmap_p bitmap)
                     +{
                     +	unsigned char range_start=0;
                     +	int hasprev = 0;
                     +	int first = 1;
                     +	int negate = 0;
                     +	size_t i;
+                    +
                     +	memset(bitmap,0x00,32);
                     +	if (*pat=='^') {
                     +		negate = 1;/* build the positive set, invert it at the end */
                     +		pat++;
                     +	}
                     +	/* literal ] can be first character, for example: []] */
                     +	while(first || *pat!=']') {
                     +		first = 0;
                     +		if(!*pat) {
                     +			/* no closing ']' */
                     +			cli_warnmsg("confused about regex bracket expression, bailing out");
                     +			return NULL;
                     +		}
                     +		if (*pat=='-' && hasprev && pat[1] && pat[1]!=']') {
                     +			/* it is a range; a '-' right before ']' is a literal instead */
                     +			unsigned char range_end;
                     +			unsigned int c;
                     +			pat++;
                     +			if (pat[0]=='[' && pat[1]=='.') {
                     +				if(pat[2]=='-' && pat[3]=='.' && pat[4]==']') {
                     +					range_end = '-';
                     +					pat += 5;/* skip [.-.] */
                     +				}
                     +				else {
                     +					/* this is getting complicated, bail out */
                     +					cli_warnmsg("confused about collating sequences in regex,bailing out");
                     +					return NULL;
                     +				}
                     +			}
                     +			else
                     +				range_end = *pat++;
                     +			if(range_end < range_start) {
                     +				/* invalid range, let the regex engine report it */
                     +				cli_warnmsg("confused about regex bracket expression, bailing out");
                     +				return NULL;
                     +			}
                     +			for(c=range_start;c<=range_end;c++)
                     +				bitmap[c>>3] |= 1<<(c&0x7);
                     +			hasprev = 0;
                     +		}
                     +		else if (pat[0]=='[' && pat[1]==':') {
                     +			const unsigned char* end;
                     +			size_t len;
                     +			int found=-1;
+                    +
                     +			pat+=2;
                     +			end=(const unsigned char*)strstr((const char*)pat,":]");
                     +			if(!end) {
                     +				cli_warnmsg("confused about std char class syntax regex,bailing out");
                     +				return NULL;
                     +			}
+                    +
                     +			len = end-pat;
                     +			for(i=0;i<std_class_cnt;i++)
                     +				/* require a full name match, not just a common prefix */
                     +				if(!strncmp((const char*)pat,std_class[i].classname,len) && !std_class[i].classname[len]) {
                     +					found=i;
                     +					break;
                     +				}
                     +			if(found==-1) {
                     +				/*unknown class*/
                     +				cli_warnmsg("confused about regex bracket expression, bailing out");
                     +				return NULL;
                     +			}
                     +			for(i=0;i<256;i++)
                     +				if(char_class[i]&(1<<found))
                     +					bitmap[i>>3] |= 1<<(i&0x7);
                     +			pat = end+2;/* continue after the ":]" */
                     +			hasprev = 0;
                     +		}
                     +		else if (pat[0]=='[' && (pat[1]=='.' || pat[1]=='=')) {
                     +			/* collating symbols / equivalence classes are not handled here */
                     +			cli_warnmsg("confused about collating sequences in regex,bailing out");
                     +			return NULL;
                     +		}
                     +		else {
                     +			bitmap[*pat>>3] |= 1<<(*pat&0x7);
                     +			range_start = *pat++;/* remember the character just consumed */
                     +			hasprev = 1;
                     +		}
                     +	}
                     +	if(negate)
                     +		for(i=0;i<32;i++)
                     +			bitmap[i] ^= 0xFF;/*match chars not in brackets*/
                     +	return pat;
                     +}
+                    +
                     +/*
                     + * Read one token from the pattern at pat and describe it in *token.
                     + * Returns the position just after the token (pat itself at the end of the
                     + * pattern, where TOKEN_DONE is reported), or NULL if the bitmap for a bracket
                     + * expression cannot be allocated.
                     + * For TOKEN_CHAR, token->u.start points at the literal character.
                     + * For TOKEN_BRACKET, token->u.bitmap is a newly allocated 32 byte bitmap that
                     + * the caller owns.
                     + * TOKEN_REGEX means "the tree parser cannot handle this, give it to regcomp()":
                     + * regex operators, unsupported bracket expressions, a stray ']', a trailing
                     + * backslash and backslash followed by a lowercase letter.
                     + */
                     +static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
                     +{
                     +	assert(pat);
                     +	assert(token);
+                    +
                     +	switch(*pat) {
                     +		case '\0':
                     +			/* end of pattern: do not step over the terminator */
                     +			token->type=TOKEN_DONE;
                     +			return pat;
                     +		case '\\':
                     +			if(pat[1]=='\0' || islower(pat[1])) {
                     +				/* \n, \t, etc. and a dangling backslash cannot be expressed as a
                     +				 * literal inside the pattern; leave them to the regex engine */
                     +				token->type=TOKEN_REGEX;
                     +				break;
                     +			}
                     +			token->type=TOKEN_CHAR;
                     +			token->u.start = ++pat;
                     +			token->len   = 1;
                     +			break;
                     +		case '|':
                     +			token->type=TOKEN_ALT;
                     +			break;
                     +		case '*':
                     +		case '+':
                     +		case '?':
                     +		case '{':
                     +		case '}':
                     +			token->type=TOKEN_REGEX;
                     +			break;
                     +		case '[':
                     +			{
                     +			/*see if it is something simple like a list of characters, a range, or negated ...*/
                     +			const unsigned char* closing;
                     +			char_bitmap_p bitmap = cli_malloc(32);
                     +			if(!bitmap)
                     +				return NULL;
                     +			closing = parse_bracket_expr(pat+1,bitmap);
                     +			if(!closing) {
                     +				/* too complicated for us; pat still points at '[' */
                     +				free(bitmap);
                     +				token->type=TOKEN_REGEX;
                     +				break;
                     +			}
                     +			pat = closing;
                     +			/*TODO: see if this bitmap already exists, then reuse*/
                     +			token->type = TOKEN_BRACKET;
                     +			token->u.bitmap = bitmap;
                     +			break;
                     +			}
                     +		case ']':
                     +			/* ] without matching [: was an assert, which aborts on bad input and
                     +			 * leaves token->type unset without assertions; let regcomp() decide */
                     +			token->type=TOKEN_REGEX;
                     +			break;
                     +		case '.':
                     +			token->type=TOKEN_DOT;
                     +			break;
                     +		case '(':
                     +			token->type=TOKEN_PAR_OPEN;
                     +			break;
                     +		case ')':
                     +			token->type=TOKEN_PAR_CLOSE;
                     +			break;
                     +		default:
                     +			token->type=TOKEN_CHAR;
                     +			token->u.start = pat;
                     +			token->len=1;
                     +			break;
                     +	}
                     +	return ++pat;
                     +}
+                    +
                     +#define INITIAL_ALT_STACK 10 /* initial number of entries in find_regex_start()'s position stack */
                     +#define ALT_STACK_GROW 20 /* entries added each time that stack is full */
+                    +
                     +/*
                     + * Find where the part of the pattern begins that must be handed to the regex
                     + * engine, for ex: abcd+, regex begins at 'd'.
                     + *
                     + * The pattern is tokenized until a special regex char is met that the
                     + * tree-matcher doesn't handle, like: +,*,{}.  The previous 'atom' then has to
                     + * be passed on to the regex matcher as well, so we back up to the last known
                     + * good position.
                     + * Example: abc(defg)+, only abc can be handled by the tree parser, so the
                     + * position of ( is returned.
                     + * TODO: what about open parentheses? For now, once a special char is found
                     + * while any position is still stacked, we back out to the first stacked one.
                     + *
                     + * Returns the start of the regex part (the end of the string when the whole
                     + * pattern can go into the tree, pat itself when nothing can), or NULL when
                     + * out of memory.
                     + */
                     +static const unsigned char* find_regex_start(const unsigned char* pat)
                     +{
                     +	struct token_t token;
                     +	const unsigned char* const start = pat;
                     +	const unsigned char* last=NULL;
                     +	const unsigned char* tmp=NULL;
                     +	const unsigned char** altpositions;
                     +	size_t altpositions_capacity = INITIAL_ALT_STACK;
                     +	size_t altpositions_cnt = 0;
+                    +
                     +	assert(pat);
                     +	if(!*pat)
                     +		return pat;/* empty pattern: nothing to scan */
+                    +
                     +	altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
                     +	if(!altpositions)
                     +		return NULL;
+                    +
                     +	do {
                     +		tmp = pat;
                     +		token.type = TOKEN_REGEX;/* anything the tokenizer leaves undecided goes to the regex engine */
                     +		pat = getNextToken(pat,&token);
                     +		if(!pat) {
                     +			free(altpositions);
                     +			return NULL;
                     +		}
                     +		if(token.type==TOKEN_BRACKET)
                     +			free(token.u.bitmap);
                     +		if(token.type==TOKEN_PAR_CLOSE && !altpositions_cnt)
                     +			/* ')' with nothing stacked: popping would wrap the counter around */
                     +			token.type = TOKEN_REGEX;
                     +		if(token.type!=TOKEN_REGEX) {
                     +			last = tmp;
                     +			if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
                     +				/* save this position on stack, successfully parsed till here*/
                     +				if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
                     +					/* encountered another alternate (|) operator, override previous | position stored */
                     +					altpositions[altpositions_cnt-1]=last;
                     +				else {
                     +					altpositions[altpositions_cnt++] = last;
                     +					if(altpositions_cnt == altpositions_capacity) {
                     +						const unsigned char** grown;
                     +						altpositions_capacity += ALT_STACK_GROW;
                     +						grown = cli_realloc(altpositions,altpositions_capacity*sizeof(*altpositions));
                     +						if(!grown) {
                     +							free(altpositions);
                     +							return NULL;
                     +						}
                     +						altpositions = grown;
                     +					}
                     +				}
                     +			} else if (token.type==TOKEN_PAR_CLOSE) {
                     +				/* remove last stored position from stack, successfully parsed this last group */
                     +				altpositions_cnt--;
                     +			}
                     +		}
                     +		else {
                     +			if(altpositions_cnt)
                     +				last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
                     +			/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
                     +		}
                     +	} while(*pat && token.type!=TOKEN_REGEX);
                     +	free(altpositions);
+                    +
                     +	if(token.type!=TOKEN_REGEX)
                     +		return pat;/* reached the end: everything fits in the tree */
                     +	/* The regex part starts at the atom before the special char, also when
                     +	 * that char is the last one of the pattern (abc+ -> c+).  With no such
                     +	 * atom the whole pattern is a regex. */
                     +	return last ? last : start;
                     +}
+                    +
                     +/*
                     + * Allocate a tree node with no alternatives and no children, linked to `next`
                     + * and carrying the given listend flag.  op and c are left for the caller to set.
                     + * Returns NULL if the allocation fails.
                     + */
                     +static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
                     +{
                     +	struct tree_node* fresh = cli_malloc(sizeof(*fresh));
+                    +
                     +	if(!fresh)
                     +		return NULL;
                     +	fresh->u.children = NULL;
                     +	fresh->listend = listend;
                     +	fresh->next = next;
                     +	fresh->alternatives = 0;
                     +	return fresh;
                     +}
+                    +
                     +/*
                     + * Allocate the root node of a tree: op OP_ROOT, c 0, no successor, listend set.
                     + * Returns NULL if the allocation fails.
                     + */
                     +static struct tree_node* tree_root_alloc(void)
                     +{
                     +	struct tree_node* top = tree_node_alloc(NULL,1);
+                    +
                     +	if(!top)
                     +		return NULL;
                     +	top->listend = 1;
                     +	top->next = NULL;
                     +	top->c = 0;
                     +	top->op = OP_ROOT;
                     +	return top;
                     +}
                     +/*
                     + * Binary search for the child of `node` whose character is csearch.
                     + * Returns the child, or NULL when there is none.  *left receives the lower
                     + * bound of the search; after a miss that is the index at which
                     + * tree_node_char_insert() places the new child.
                     + */
                     +static inline struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
                     +{
                     +	struct tree_node **kids;
                     +	int lo = 0;
                     +	int hi;
+                    +
                     +	assert(node);
                     +	assert(left);
+                    +
                     +	kids = tree_node_get_children(node);
                     +	hi = node->alternatives-1;
                     +	*left = lo;
                     +	if(!node->alternatives)
                     +		return NULL;
                     +	assert(kids);
                     +	for(;lo<=hi;) {
                     +		const int mid = lo+(hi-lo)/2;
                     +		if(kids[mid]->c == csearch) {
                     +			*left = lo;
                     +			return kids[mid];
                     +		}
                     +		if(kids[mid]->c < csearch)
                     +			lo = mid+1;
                     +		else
                     +			hi = mid-1;
                     +	}
                     +	*left = lo;
                     +	return NULL;
                     +}
+                    +
                     +/*
                     + * Pick the node that a new child of `node` should use as its `next`:
                     + *  - more than one alternative: the successor of the first child,
                     + *  - no alternatives but a first child present: that child,
                     + *  - otherwise: node itself.
                     + */
                     +static inline struct tree_node* tree_get_next(struct tree_node* node)
                     +{
                     +	struct tree_node** kids;
+                    +
                     +	assert(node);
                     +	kids = tree_node_get_children(node);
+                    +
                     +	if(node->alternatives>1)
                     +		return kids[0]->next;
                     +	if(node->alternatives==0 && kids && kids[0])
                     +		return kids[0];
                     +	return node;
                     +}
+                    +
                     +/*
                     + * Size in bytes of node's pointer array: one slot per alternative, plus one
                     + * extra slot when op is OP_CUSTOMCLASS (the first pointer is then the bitmap).
                     + */
                     +static inline size_t tree_node_get_array_size(const struct tree_node* node)
                     +{
                     +	size_t slots;
+                    +
                     +	assert(node);
                     +	slots = node->alternatives;
                     +	if(node->op==OP_CUSTOMCLASS)
                     +		slots++;
                     +	return slots * sizeof(node->u.children[0]);
                     +}
+                    +
                     +/*
                     + * Insert a new OP_CHAR child for character c into node's sorted child array
                     + * at index `left` (as computed by tree_node_char_binsearch()).
                     + * Returns the new child, or NULL when out of memory; in that case node is
                     + * left exactly as it was.
                     + */
                     +static inline struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
                     +{
                     +	struct tree_node* new, *alt = tree_get_next(node);
                     +	struct tree_node** grown;
+                    +
                     +	/* allocate the child first, so that a NULL never ends up in the array */
                     +	new = tree_node_alloc(alt , node == alt );
                     +	if(!new)
                     +		return NULL;
                     +	new->op=OP_CHAR;
                     +	new->c=c;
+                    +
                     +	node->alternatives++;
                     +	/* NOTE(review): assumes cli_realloc() keeps the old block on failure, like realloc() */
                     +	grown = cli_realloc(node->u.children,tree_node_get_array_size(node));
                     +	if(!grown) {
                     +		/* keep the old array and the old count */
                     +		node->alternatives--;
                     +		free(new);
                     +		return NULL;
                     +	}
                     +	node->u.children = grown;
+                    +
                     +	/* shift the tail up by one to open the slot at `left` */
                     +	if(node->alternatives-left-1>0)
                     +			memmove(&node->u.children[left+1],&node->u.children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
                     +	node->u.children[left] = new;
+                    +
                     +	return new;
                     +}
+                    +
                     +/*
                     + * Attach `new` (a node that is not found through the binary search over
                     + * characters) to `node`.
                     + *  - node has alternatives whose successor is node itself: new becomes the
                     + *    successor of every alternative.
                     + *  - node has alternatives with a longer successor chain: new is appended to
                     + *    the end of that chain.
                     + *  - node has no alternatives: new is stored as the single child (after the
                     + *    bitmap slot for OP_CUSTOMCLASS nodes).  If the array cannot be
                     + *    (re)allocated, node is left unchanged and new is not attached.
                     + */
                     +static inline void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
                     +{
                     +	struct tree_node **children;
                     +	assert(node);
                     +	assert(new);
+                    +
                     +	children = tree_node_get_children(node);
                     +	if(node->alternatives) {
                     +		assert(children);
                     +	       	if(children[0]->next == node) {
                     +			int i;
                     +			new->listend = 1;
                     +			for(i=0;i<node->alternatives;i++) {
                     +				children[i]->next = new;
                     +				children[i]->listend = 0;
                     +			}
                     +		}
                     +		else {
                     +			struct tree_node* p;
                     +			/* walk to the last node of the chain, the one pointing back at node */
                     +			for(p = children[0]->next ; p->next != node ; p = p->next)
                     +				assert(!p->listend);
                     +			new->listend = 1;
                     +			p->listend = 0;
                     +			p->next = new;
                     +		}
                     +	}
                     +	else {
                     +		/* Use a temporary: assigning the result straight to node->u.children
                     +		 * would lose the old array (and, for OP_CUSTOMCLASS, the bitmap
                     +		 * pointer in it) on failure.
                     +		 * NOTE(review): assumes cli_realloc() keeps the old block on failure. */
                     +		struct tree_node** grown = cli_realloc(node->u.children,sizeof(node->u.children[0])*( node->op==OP_CUSTOMCLASS ? 2 : 1 ));
                     +		if(grown) {
                     +			node->u.children = grown;
                     +			node->u.children[ node->op==OP_CUSTOMCLASS ? 1 : 0 ] = new;
                     +		}
                     +	}
                     +}
+                    +
                     +/*
                     + * Compare a 256 bit bitmap with the bitmap of every standard class.
                     + * Returns the index of the first identical one, or std_class_cnt if none is.
                     + */
                     +static inline unsigned char char_getclass(const unsigned char* bitmap)
                     +{
                     +	size_t idx = 0;
+                    +
                     +	assert(bitmap);
+                    +
                     +	while(idx<std_class_cnt && memcmp(bitmap,char_class_bitmap[idx],256>>3)!=0)
                     +		idx++;
                     +	return idx;
                     +}
+                    +
                     +/* Free the storage of a node stack and reset its data pointer and capacity. */
                     +static void stack_destroy(struct node_stack* stack)
                     +{
                     +	assert(stack);
                     +	free(stack->data);/* free(NULL) is a no-op */
                     +	stack->capacity = 0;
                     +	stack->data = NULL;
                     +}
+                    +
+                    +
                     +/*
                     + * Call this after whitelist load is complete, and the tree is no longer
                     + * going to be modified: both node stacks are destroyed and then
                     + * initialised again.
                     + */
                     +void regex_list_cleanup(struct regex_matcher* matcher)
                     +{
                     +	struct node_stack* stacks[2];
                     +	size_t k;
+                    +
                     +	assert(matcher);
+                    +
                     +	stacks[0] = &matcher->node_stack;
                     +	stacks[1] = &matcher->node_stack_alt;
                     +	for(k=0;k<2;k++)
                     +		stack_destroy(stacks[k]);
                     +	for(k=0;k<2;k++)
                     +		stack_init(stacks[k]);
                     +}
+                    +
                     +/*
                     + * Returns 1 unless list_inited is -1, i.e. either there is no regex list or
                     + * it was initialised successfully; returns 0 otherwise.
                     + */
                     +int is_regex_ok(struct regex_matcher* matcher)
                     +{
                     +	assert(matcher);
                     +	/* list_inited==0 also differs from -1, so one comparison covers both cases */
                     +	return matcher->list_inited!=-1;
                     +}
+                    +
                     +/*
                     + * Add one pattern to the matcher's tree.
                     + * The leading part of the pattern that the tree can handle (literal chars,
                     + * '.', bracket expressions, groups, alternation) becomes tree nodes; the rest,
                     + * starting at the position reported by find_regex_start(), is compiled with
                     + * regcomp() and stored in a leaf.  `info` is copied into the leaf.
                     + * Returns 0 on success, a regcomp() error code otherwise; REG_ESPACE is used
                     + * when an allocation fails.
                     + */
                     +static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info)
                     +{
                     +	int bol=1;
                     +	const unsigned char* pat_end;
                     +	struct token_t token;
                     +	struct tree_node* node;
+                    +
                     +	assert(matcher);
                     +	assert(pat);
+                    +
                     +	pat_end = find_regex_start(pat);
                     +	if(!pat_end)
                     +		pat_end = pat;/* nothing usable for the tree: the whole pattern goes to regcomp() */
+                    +
                     +	node = matcher->root_regex;
+                    +
                     +	stack_reset(&matcher->node_stack);
                     +	stack_reset(&matcher->node_stack_alt);
                     +	stack_push(&matcher->node_stack,node);
+                    +
                     +	for(;node->op!=OP_LEAF;){
                     +		if(pat<pat_end) {
                     +			const unsigned char* token_start = pat;
                     +			token.type = TOKEN_REGEX;/* whatever the tokenizer leaves undecided goes to regcomp() */
                     +			pat  = getNextToken(pat,&token);
                     +			if(!pat)
                     +				return REG_ESPACE;
                     +			if(token.type==TOKEN_REGEX)
                     +				/* the tokenizer does not fill in u.start for this kind */
                     +				token.u.start = token_start;
                     +		}
                     +		else if(*pat) {
                     +			token.type = TOKEN_REGEX;
                     +			token.u.start=pat;
                     +		}
                     +		else
                     +			token.type = TOKEN_DONE;
+                    +
                     +		switch(token.type) {
                     +			case TOKEN_CHAR:
                     +				{
                     +					/* search for char in tree */
                     +					int left;
                     +					struct tree_node* newnode = tree_node_char_binsearch(node,*token.u.start,&left);
                     +					if(!newnode) {
                     +						/* not found, insert it */
                     +						newnode = tree_node_char_insert(node,*token.u.start,left);
                     +						if(!newnode)
                     +							return REG_ESPACE;
                     +					}
                     +					node = newnode;
                     +					break;
                     +				}
+                    +
                     +			case TOKEN_PAR_OPEN:
                     +				stack_push(&matcher->node_stack_alt,NULL);/* marker */
                     +				stack_push(&matcher->node_stack,node);
                     +				break;
+                    +
                     +			case TOKEN_PAR_CLOSE: {
                     +				/*TODO: test this!!!*/
                     +				struct tree_node* node_alt = node;
                     +				node = tree_node_alloc(NULL,1);
                     +				if(!node)
                     +					return REG_ESPACE;
                     +				node->op=OP_PARCLOSE;
                     +				node->c=0;
                     +				node->listend=1;
                     +				tree_node_insert_nonbin(node_alt,node);
                     +				/* join every alternative of the group, up to the NULL marker */
                     +				while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
                     +					tree_node_insert_nonbin(node_alt,node);
                     +				}
                     +				stack_pop(&matcher->node_stack);
                     +				break;
                     +			}
+                    +
                     +			case TOKEN_ALT:
                     +				stack_push(&matcher->node_stack_alt,node);
                     +				/* go back to the node at which the current group started */
                     +				node = stack_pop(&matcher->node_stack);
                     +				stack_push(&matcher->node_stack,node);
                     +				break;
+                    +
                     +			case TOKEN_BRACKET:
                     +				{
                     +					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
                     +					unsigned char charclass;
                     +					if(!new) {
                     +						free(token.u.bitmap);
                     +						return REG_ESPACE;
                     +					}
                     +					charclass = char_getclass(token.u.start);
                     +					if(charclass == std_class_cnt) {/*not a std char class*/
                     +						new->op = OP_CUSTOMCLASS;
                     +						new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
                     +						if(!new->u.children) {
                     +							free(token.u.bitmap);
                     +							free(new);
                     +							return REG_ESPACE;
                     +						}
                     +						new->u.bitmap[0] = token.u.bitmap;/* the node now owns the bitmap */
                     +						new->u.bitmap[1] = NULL;
                     +					}
                     +					else {
                     +						new->op = OP_STDCLASS;
                     +						new->c = charclass;
                     +						/* only the class index is kept, so the bitmap is no longer needed */
                     +						free(token.u.bitmap);
                     +					}
                     +					tree_node_insert_nonbin(node,new);
                     +					node = new;
                     +					break;
                     +				}
+                    +
                     +			case TOKEN_DOT:
                     +				{
                     +					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
                     +					if(!new)
                     +						return REG_ESPACE;
                     +					new->op = OP_DOT;
                     +					tree_node_insert_nonbin(node,new);
                     +					node=new;
                     +					break;
                     +				}
+                    +
                     +			case TOKEN_REGEX:
                     +			case TOKEN_DONE: {
                     +				struct leaf_info* leaf;
                     +				char* info_copy;
+                    +
                     +				leaf=cli_malloc(sizeof(*leaf));
                     +				if(!leaf)
                     +					return REG_ESPACE;
                     +				info_copy=strdup(info);
                     +				if(!info_copy) {
                     +					free(leaf);
                     +					return REG_ESPACE;
                     +				}
                     +				leaf->info=info_copy;
                     +				leaf->preg=NULL;
                     +				if(token.type==TOKEN_REGEX) {
                     +					int rc;
                     +					struct tree_node* new;
                     +					regex_t* preg;
                     +					preg=cli_malloc(sizeof(*preg));
                     +					if(!preg) {
                     +						free(info_copy);
                     +						free(leaf);
                     +						return REG_ESPACE;
                     +					}
                     +					/* NOTE(review): REG_NOTBOL is a regexec() flag, not a regcomp()
                     +					 * flag; kept as is, but the intended cflags should be confirmed */
                     +					rc = regcomp(preg,(const char*)token.u.start,bol?0:REG_NOTBOL);
                     +					if(rc) {
                     +						/* nothing has been linked into the tree yet, release it all */
                     +						free(preg);
                     +						free(info_copy);
                     +						free(leaf);
                     +						return rc;
                     +					}
                     +					new=cli_malloc(sizeof(*new));
                     +					if(!new) {
                     +						regfree(preg);
                     +						free(preg);
                     +						free(info_copy);
                     +						free(leaf);
                     +						return REG_ESPACE;
                     +					}
                     +					leaf->preg=preg;
                     +					new->op=OP_LEAF;
                     +					new->c=0;
                     +					new->next=node;
                     +					new->alternatives=0;
                     +					new->u.leaf=leaf;
                     +					new->listend=1;
                     +					tree_node_insert_nonbin(node,new);
                     +				}
                     +				else {
                     +					/* end of pattern: the current node itself becomes the leaf */
                     +					node->alternatives=0;
                     +					node->u.leaf=leaf;
                     +					node->op=OP_LEAF;
                     +				}
                     +				return 0;
                     +			}
                     +		}
+                    +
                     +		bol=0;
                     +	}
                     +	return 0;
                     +}
+                    +
                     +/* c has to be unsigned char here!! */
                     +static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
                     +{
                     +	struct tree_node** children;
                     +	int rc;
+                    +
                     +	assert(node);
                     +	assert(c);
                     +	assert(info);
+                    +
                     +	*info = NULL;
                     +	len++;
                     +	c--;
                     +	for(;;) {
                     +		assert(node);
                     +		children = node->u.children;
                     +		switch(node->op) {
                     +			case OP_ROOT:
                     +				rc=1;
                     +				break;
                     +			case OP_PARCLOSE:
                     +				/*this isn't a real character, so don't move*/
                     +				c--;
                     +				len++;
                     +				rc=1;
                     +				break;
                     +			case OP_CHAR:
                     +				assert(*c==node->c && "We know this has to match");
                     +				rc = 1;/* *c==node->c;- we know it has matched */
                     +				break;
                     +			case OP_DOT:
                     +				rc = *c!='\n';
                     +				break;
                     +			case OP_STDCLASS:
                     +				rc = char_class[*c]&(node->c);
                     +				break;
                     +			case OP_CUSTOMCLASS:
                     +			{
                     +				char_bitmap_p bitmap;
                     +				assert(children);
                     +				bitmap = (char_bitmap_p)node->u.bitmap[0];
                     +				children++;
                     +				rc = bitmap[*c>>3]&(1<<(*c&0x7));
                     +				break;
                     +			}
                     +			case OP_LEAF:
                     +			{
                     +				const struct leaf_info* leaf = node->u.leaf;
                     +				/*isleaf = 1;*/
                     +				if(leaf->preg) {
                     +					rc = !regexec(leaf->preg,(const char*)c,0,NULL,0);
                     +				}
                     +				else  {
                     +					assert(*c==node->c && "We know this has to match[2]");
                     +					rc = 1;
                     +				}
                     +				if(rc) {
                     +					*info = leaf->info;
                     +					return MATCH_SUCCESS;
                     +				}
                     +				break;
                     +			}
                     +			default:
                     +				/* impossible */
                     +				cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
                     +				exit(1);
                     +		}
                     +		len--;
                     +		if(!len) rc=0;
                     +		c++;
                     +		if(rc) {
                     +			const char csearch = *c;
                     +			int left = 0,right = node->alternatives-1;
                     +			int mid;
                     +			/*matched so far, go deeper*/
                     +			/*do a binary search between children */
                     +			assert(children);
                     +			while(left<=right) {
                     +				mid  = left+(right-left)/2;
                     +				if (children[mid]->c == csearch)
                     +					break;
                     +				else if(children[mid]->c < csearch)
                     +					left=mid+1;
                     +				else
                     +					right=mid-1;
                     +			}
                     +			if(left<=right) {
                     +				node = children[mid];
                     +				assert(node);
                     +			}
                     +			else {
                     +				if(node->alternatives) {
                     +					if(!children[0]->listend) {
                     +						node = children[0];
                     +						c++;
                     +						len--;
                     +					}
                     +					while(node && node->listend) {
                     +						node = node->next;/* climb up */
                     +						c--;
                     +						len++;
                     +					}
                     +					if(!node || !node->next)
                     +						return MATCH_FAILED;/* reached root node */
                     +					node=node->next;
                     +					c--;
                     +					len++;
                     +				}
                     +				else if(node->u.children) {
                     +					struct tree_node* rewrite_next = NULL;
                     +					if(node->op==OP_PARCLOSE)
                     +						rewrite_next = node;
                     +					node = children[0];
                     +					assert(node);
                     +					assert(node->op!=OP_CHAR);
                     +					if(rewrite_next)
                     +						node->next = rewrite_next;/* this node is pointed to by several parent nodes,
                     +									     we need to know
                     +									     from which one we came, so we can find out way back
                     +									     should we fail to match somewhere deeper*/
                     +				}
                     +			}
                     +		}
                     +		else {
                     +			/* this node didn't match, try sibling, or parent (if no more siblings) */
                     +			while(node && node->listend) {
                     +				node = node->next;/* sibling of parent */
                     +				c--;
                     +				len++;
                     +			}
                     +			if(!node || !node->next) /* reached root node, it has no next */
                     +				return MATCH_FAILED;
                     +			else node=node->next;
                     +		}
                     +	}
                     +	return MATCH_FAILED;
                     +}
+                    +
                     +/* Push `node` onto `stack` unless that exact pointer is already stored there. */
                     +static inline void stack_push_once(struct node_stack* stack,struct tree_node* node)
                     +{
                     +	size_t idx = 0;
+                    +
                     +	assert(stack);
                     +	assert(node);
+                    +
                     +	/* linear scan over the entries currently in use */
                     +	while(idx < stack->cnt) {
                     +		if(node == stack->data[idx])
                     +			return;	/* duplicate: leave the stack untouched */
                     +		idx++;
                     +	}
                     +	stack_push(stack,node);
                     +}
+                    +
                     +static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
                     +{	/* Recursive helper of destroy_tree(). Walks the nodes reachable from `node`
                     +	 * (children, and `next` links of nodes whose listend flag is clear) and
                     +	 * records every pointer that still has to be released on
                     +	 * matcher->node_stack via stack_push_once(); the tree_node structures
                     +	 * themselves are not freed in this function. Leaf payloads (preg, info)
                     +	 * and slot 0 of an OP_CUSTOMCLASS node's array are freed immediately and
                     +	 * their pointers are cleared so a second visit does not free them again. */
                     +	struct tree_node **children;
                     +	assert(matcher);
                     +	assert(node);
+                    +
                     +	children = tree_node_get_children(node);
                     +	if(node->op==OP_LEAF) { /* leaf: queue the leaf_info and the node, free what the leaf_info owns */
                     +		struct leaf_info* leaf = node->u.leaf;
                     +		if(node->next && !node->listend) /* listend set means next leads back up (see the matcher's "climb up" loops), so it is not followed */
                     +			destroy_tree_internal(matcher,node->next);
                     +		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
                     +		stack_push_once(&matcher->node_stack,node);
                     +		if(leaf->preg) { /* compiled regex: regfree() its internals, then free the regex_t storage itself */
                     +			regfree(leaf->preg);
                     +			free(leaf->preg);
                     +			leaf->preg=NULL;
                     +		}
                     +		if(leaf->info) {
                     +			free(leaf->info);
                     +			leaf->info=NULL;
                     +		}
                     +	/*	return;*/ /* NOTE(review): with the return disabled a leaf also runs the generic code below, which reads `children` and node->u.children - confirm this is intended */
                     +	}
                     +	if(node->alternatives) { /* node has a list of alternative children */
                     +		int i;
                     +		struct tree_node* p;
                     +		assert(children);
                     +		p = children[0]->op==OP_LEAF ? NULL : children[0]->next; /* read before recursing into the children */
                     +		for(i=0;i<node->alternatives;i++)
                     +			destroy_tree_internal(matcher,children[i]);
                     +		if(p && p!=node) /* skip the link when it points back at this node */
                     +			destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
                     +	}
                     +	else { /* no alternatives: at most one child, stored in children[0] */
                     +		if(children) {
                     +			if(children[0])
                     +				destroy_tree_internal(matcher,children[0]);
                     +		}
                     +	}
                     +	if(node->next && !node->listend) /* continue along the list this node belongs to */
                     +		destroy_tree_internal(matcher,node->next);
                     +	if(node->u.children)
                     +		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
                     +	if(node->op==OP_CUSTOMCLASS && node->u.children[0]) { /* slot 0 is freed right away; the matcher reads it as the class bitmap (u.bitmap[0]) */
                     +		free(node->u.children[0]);
                     +		node->u.children[0]=NULL;
                     +	}
                     +	stack_push_once(&matcher->node_stack,node); /* finally queue the node itself; push_once ignores it if already queued */
                     +}
+                    +
                     +static void destroy_tree(struct regex_matcher* matcher)
                     +{
                     +	/* The same node may be linked from several other nodes, so a plain recursive
                     +	 * walk-and-free could release one pointer twice. The walk therefore only
                     +	 * collects the pointers on matcher->node_stack (each at most once), and the
                     +	 * collected pointers are released afterwards. */
                     +	struct node_stack* pending;
+                    +
                     +	assert(matcher);
+                    +
                     +	pending = &matcher->node_stack;
                     +	stack_reset(pending);
                     +	destroy_tree_internal(matcher,matcher->root_regex);
                     +	for(;pending->cnt;) {
                     +		struct tree_node* victim = stack_pop(pending);
                     +		free(victim);
                     +	}
                     +}
                     +#ifndef NDEBUG
                     +/*
                     + * Debug helper for dump_tree(): prints `node`, and recursively the nodes it
                     + * links to, on stdout as Graphviz "dot" statements. The address of each node
                     + * is used as its graph id ("n<address>").
                     + *
                     + * For a non-leaf node three groups are emitted in turn: the edge list
                     + * "node -> { ... }", a "{rank=same; ... }" group naming the same targets, and
                     + * finally the recursive dump of each of those targets.
                     + */
                     +static void dump_node(struct tree_node* node)
                     +{
                     +	int i;
                     +	struct tree_node* p,**children;
                     +	assert(node);
                     +	if(node->op==OP_LEAF) {
                     +		/* a leaf is labelled either "regex leaf" or with the character it carries */
                     +		if(node->u.leaf->preg)
                     +			printf("n%p [label=\"regex\\nleaf\"];\n",(void*)node);/* terminated with ";\n" like every other statement */
                     +		else
                     +			printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
                     +		if(node->next && !node->listend) {
                     +			printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
                     +			dump_node(node->next);
                     +		}
                     +		return;
                     +	}
                     +	/* OP_ROOT and OP_PARCLOSE nodes are labelled '@' instead of node->c */
                     +	printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
                     +	if(node->next)
                     +		printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
                     +	/* 1) edge list: the alternatives plus the chain hanging off the first child */
                     +	printf("n%p -> {",(void*)node);/*using address of node as id*/
                     +	children = tree_node_get_children(node);
                     +	if(node->alternatives)
                     +		assert(children);
                     +	for(i=0;i<node->alternatives;i++)
                     +		printf("n%p ",(void*)children[i]);
                     +	if(node->alternatives && children[0]->op!=OP_LEAF)
                     +		for(p=children[0]->next;p!=node;p=p->next)
                     +		{
                     +			assert(p);
                     +			printf("n%p ",(void*)p);
                     +			if(p->op==OP_LEAF || p->listend)
                     +				break;
                     +		}
                     +	if(!node->alternatives && children && children[0])
                     +		printf("n%p ",(void*)children[0]);
                     +	printf("};\n");
                     +	/* 2) same targets again, grouped on one rank; uses `children` like the
                     +	 *    edge list above so both lists always name the same ids */
                     +	printf("{rank=same;");
                     +	for(i=0;i<node->alternatives;i++)
                     +		printf("n%p ",(void*)children[i]);
                     +	if(node->alternatives && children[0]->op!=OP_LEAF)
                     +		for(p=children[0]->next;p!=node;p=p->next)
                     +		{
                     +			printf("n%p ",(void*)p);
                     +			if(p->op==OP_LEAF || p->listend)
                     +				break;
                     +		}
                     +	if(!node->alternatives && children && children[0])
                     +		printf("n%p ",(void*)children[0]);
                     +	printf("};\n");
                     +	/* 3) recurse into every target listed above */
                     +	for(i=0;i<node->alternatives;i++)
                     +		dump_node(children[i]);
                     +	if(node->alternatives && children[0]->op!=OP_LEAF)
                     +		for(p=children[0]->next;p!=node;p=p->next)
                     +		{
                     +			dump_node(p);
                     +			if(p->op==OP_LEAF || p->listend)
                     +				break;
                     +		}
                     +	if(!node->alternatives && children && children[0])
                     +		dump_node(children[0]);
                     +}
+                    +
                     +void dump_tree(struct tree_node* root)
                     +{
                     +	/* Writes a complete Graphviz digraph to stdout; view it with dot/dotty. */
                     +	assert(root);
                     +	fputs("digraph tree {\n",stdout);
                     +	dump_node(root);
                     +	fputs("}\n",stdout);
                     +}
                     +#endif
+                    +
                     +#endif

clamav-devel/libclamav/regex_list.h

History View file @ bd912dd

                     new file mode 100644
@@ -0,0 +1,53 @@
                     +/*
                     + *  Match a string against a list of patterns/regexes.
                     + *
                     + *
                     + *  This program is free software; you can redistribute it and/or modify
                     + *  it under the terms of the GNU General Public License as published by
                     + *  the Free Software Foundation; either version 2 of the License, or
                     + *  (at your option) any later version.
                     + *
                     + *  This program is distributed in the hope that it will be useful,
                     + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + *  GNU General Public License for more details.
                     + *
                     + *  You should have received a copy of the GNU General Public License
                     + *  along with this program; if not, write to the Free Software
                     + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
                     + *  MA 02110-1301, USA.
                     + *
                     + */
+                    +
                     +#ifdef CL_EXPERIMENTAL
+                    +
                     +#ifndef _REGEX_LIST_H
                     +#define _REGEX_LIST_H
+                    +
                     +struct node_stack { /* stack of tree_node pointers */
                     +	struct tree_node** data; /* array holding the stacked pointers */
                     +	size_t capacity; /* NOTE(review): presumably the number of slots allocated in data - confirm in regex_list.c */
                     +	size_t cnt; /* number of entries in use; regex_list.c scans data[0..cnt-1] and pops until cnt is 0 */
                     +};
+                    +
                     +struct regex_matcher { /* state passed to every function declared in this header */
                     +	struct cli_matcher* root_hosts; /* NOTE(review): presumably the matcher for host patterns - not visible here */
                     +	struct cli_matcher* root_urls; /* NOTE(review): presumably the matcher for URL patterns - not visible here */
                     +	struct tree_node* root_regex; /* root of the regex tree; destroy_tree() in regex_list.c starts its walk here */
                     +	int list_inited; /* NOTE(review): list_inited/list_loaded/list_built look like lifecycle flags - confirm where they are set */
                     +	int list_loaded;
                     +	int list_built;
                     +	struct node_stack node_stack; /* destroy_tree() in regex_list.c collects the pointers to free on this stack */
                     +	struct node_stack node_stack_alt; /* second stack; NOTE(review): its use is not visible from this header */
                     +};
+                    +
                     +int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info); /* NOTE(review): return codes are not visible here; info looks like an output parameter - confirm */
                     +int init_regex_list(struct regex_matcher* matcher); /* NOTE(review): presumably must be called before the matcher is used - confirm */
                     +int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options); /* takes a FILE*, so <stdio.h> must be included before this header */
                     +void regex_list_cleanup(struct regex_matcher* matcher); /* NOTE(review): how this differs from regex_list_done() is not visible here */
                     +void regex_list_done(struct regex_matcher* matcher);
                     +int is_regex_ok(struct regex_matcher* matcher); /* NOTE(review): presumably non-zero when the list is usable - confirm */
                     +#endif
+                    +
                     +#endif