Browse code

Phishing module merge - libclamav

git-svn: trunk@2244

aCaB authored on 2006/09/13 04:38:39
Showing 15 changed files
... ...
@@ -1,3 +1,9 @@
1
+Tue Sep 12 21:59:17 CEST 2006 (acab)
2
+------------------------------------
3
+  * libclamav: Merge of the related part of the phishing module from
4
+               Torok Edvin <edwintorok*gmail.com>
5
+	       Part of the Google Summer of Code program
6
+
1 7
 Tue Sep 12 20:42:04 CEST 2006 (acab)
2 8
 ------------------------------------
3 9
   * sigtool: Merge of the related part of the phishing module from
... ...
@@ -1,6 +1,6 @@
1 1
 #! /bin/sh
2 2
 # Guess values for system-dependent variables and create Makefiles.
3
-# Generated by GNU Autoconf 2.60.
3
+# Generated by GNU Autoconf 2.60a.
4 4
 #
5 5
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
6 6
 # 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
... ...
@@ -724,36 +724,36 @@ ac_unique_file="clamscan/clamscan.c"
724 724
 # Factoring default headers for most tests.
725 725
 ac_includes_default="\
726 726
 #include <stdio.h>
727
-#if HAVE_SYS_TYPES_H
727
+#ifdef HAVE_SYS_TYPES_H
728 728
 # include <sys/types.h>
729 729
 #endif
730
-#if HAVE_SYS_STAT_H
730
+#ifdef HAVE_SYS_STAT_H
731 731
 # include <sys/stat.h>
732 732
 #endif
733
-#if STDC_HEADERS
733
+#ifdef STDC_HEADERS
734 734
 # include <stdlib.h>
735 735
 # include <stddef.h>
736 736
 #else
737
-# if HAVE_STDLIB_H
737
+# ifdef HAVE_STDLIB_H
738 738
 #  include <stdlib.h>
739 739
 # endif
740 740
 #endif
741
-#if HAVE_STRING_H
742
-# if !STDC_HEADERS && HAVE_MEMORY_H
741
+#ifdef HAVE_STRING_H
742
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
743 743
 #  include <memory.h>
744 744
 # endif
745 745
 # include <string.h>
746 746
 #endif
747
-#if HAVE_STRINGS_H
747
+#ifdef HAVE_STRINGS_H
748 748
 # include <strings.h>
749 749
 #endif
750
-#if HAVE_INTTYPES_H
750
+#ifdef HAVE_INTTYPES_H
751 751
 # include <inttypes.h>
752 752
 #endif
753
-#if HAVE_STDINT_H
753
+#ifdef HAVE_STDINT_H
754 754
 # include <stdint.h>
755 755
 #endif
756
-#if HAVE_UNISTD_H
756
+#ifdef HAVE_UNISTD_H
757 757
 # include <unistd.h>
758 758
 #endif"
759 759
 
... ...
@@ -1576,7 +1576,7 @@ test -n "$ac_init_help" && exit $ac_status
1576 1576
 if $ac_init_version; then
1577 1577
   cat <<\_ACEOF
1578 1578
 configure
1579
-generated by GNU Autoconf 2.60
1579
+generated by GNU Autoconf 2.60a
1580 1580
 
1581 1581
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
1582 1582
 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
... ...
@@ -1590,7 +1590,7 @@ This file contains any messages produced by compilers while
1590 1590
 running configure, to aid debugging if configure makes a mistake.
1591 1591
 
1592 1592
 It was created by $as_me, which was
1593
-generated by GNU Autoconf 2.60.  Invocation command line was
1593
+generated by GNU Autoconf 2.60a.  Invocation command line was
1594 1594
 
1595 1595
   $ $0 $@
1596 1596
 
... ...
@@ -3164,7 +3164,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
3164 3164
 # in a Makefile.  We should not override ac_cv_exeext if it was cached,
3165 3165
 # so that the user can short-circuit this test for compilers unknown to
3166 3166
 # Autoconf.
3167
-for ac_file in $ac_files
3167
+for ac_file in $ac_files ''
3168 3168
 do
3169 3169
   test -f "$ac_file" || continue
3170 3170
   case $ac_file in
... ...
@@ -3192,6 +3192,12 @@ done
3192 3192
 test "$ac_cv_exeext" = no && ac_cv_exeext=
3193 3193
 
3194 3194
 else
3195
+  ac_file=''
3196
+fi
3197
+
3198
+{ echo "$as_me:$LINENO: result: $ac_file" >&5
3199
+echo "${ECHO_T}$ac_file" >&6; }
3200
+if test -z "$ac_file"; then
3195 3201
   echo "$as_me: failed program was:" >&5
3196 3202
 sed 's/^/| /' conftest.$ac_ext >&5
3197 3203
 
... ...
@@ -3203,8 +3209,6 @@ See \`config.log' for more details." >&2;}
3203 3203
 fi
3204 3204
 
3205 3205
 ac_exeext=$ac_cv_exeext
3206
-{ echo "$as_me:$LINENO: result: $ac_file" >&5
3207
-echo "${ECHO_T}$ac_file" >&6; }
3208 3206
 
3209 3207
 # Check that the compiler produces executables we can run.  If not, either
3210 3208
 # the compiler is broken, or we cross compile.
... ...
@@ -5882,7 +5886,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
5882 5882
 case $host in
5883 5883
 *-*-irix6*)
5884 5884
   # Find out which ABI we are using.
5885
-  echo '#line 5885 "configure"' > conftest.$ac_ext
5885
+  echo '#line 5889 "configure"' > conftest.$ac_ext
5886 5886
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
5887 5887
   (eval $ac_compile) 2>&5
5888 5888
   ac_status=$?
... ...
@@ -5994,7 +5998,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
5994 5994
 	lt_cv_cc_needs_belf=no
5995 5995
 fi
5996 5996
 
5997
-rm -f core conftest.err conftest.$ac_objext \
5997
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
5998 5998
       conftest$ac_exeext conftest.$ac_ext
5999 5999
      ac_ext=c
6000 6000
 ac_cpp='$CPP $CPPFLAGS'
... ...
@@ -6470,7 +6474,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
6470 6470
 
6471 6471
 fi
6472 6472
 
6473
-rm -f core conftest.err conftest.$ac_objext \
6473
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
6474 6474
       conftest$ac_exeext conftest.$ac_ext
6475 6475
   LDFLAGS="$save_LDFLAGS"
6476 6476
 
... ...
@@ -6515,7 +6519,7 @@ chmod -w .
6515 6515
 save_CFLAGS="$CFLAGS"
6516 6516
 CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
6517 6517
 compiler_c_o=no
6518
-if { (eval echo configure:6518: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
6518
+if { (eval echo configure:6522: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
6519 6519
   # The compiler can only warn and ignore the option if not recognized
6520 6520
   # So say no if there are warnings
6521 6521
   if test -s out/conftest.err; then
... ...
@@ -8076,7 +8080,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8076 8076
 	ac_cv_func_shl_load=no
8077 8077
 fi
8078 8078
 
8079
-rm -f core conftest.err conftest.$ac_objext \
8079
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8080 8080
       conftest$ac_exeext conftest.$ac_ext
8081 8081
 fi
8082 8082
 { echo "$as_me:$LINENO: result: $ac_cv_func_shl_load" >&5
... ...
@@ -8155,7 +8159,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8155 8155
 	ac_cv_lib_dld_shl_load=no
8156 8156
 fi
8157 8157
 
8158
-rm -f core conftest.err conftest.$ac_objext \
8158
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8159 8159
       conftest$ac_exeext conftest.$ac_ext
8160 8160
 LIBS=$ac_check_lib_save_LIBS
8161 8161
 fi
... ...
@@ -8256,7 +8260,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8256 8256
 	ac_cv_func_dlopen=no
8257 8257
 fi
8258 8258
 
8259
-rm -f core conftest.err conftest.$ac_objext \
8259
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8260 8260
       conftest$ac_exeext conftest.$ac_ext
8261 8261
 fi
8262 8262
 { echo "$as_me:$LINENO: result: $ac_cv_func_dlopen" >&5
... ...
@@ -8335,7 +8339,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8335 8335
 	ac_cv_lib_dl_dlopen=no
8336 8336
 fi
8337 8337
 
8338
-rm -f core conftest.err conftest.$ac_objext \
8338
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8339 8339
       conftest$ac_exeext conftest.$ac_ext
8340 8340
 LIBS=$ac_check_lib_save_LIBS
8341 8341
 fi
... ...
@@ -8415,7 +8419,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8415 8415
 	ac_cv_lib_svld_dlopen=no
8416 8416
 fi
8417 8417
 
8418
-rm -f core conftest.err conftest.$ac_objext \
8418
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8419 8419
       conftest$ac_exeext conftest.$ac_ext
8420 8420
 LIBS=$ac_check_lib_save_LIBS
8421 8421
 fi
... ...
@@ -8495,7 +8499,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
8495 8495
 	ac_cv_lib_dld_dld_link=no
8496 8496
 fi
8497 8497
 
8498
-rm -f core conftest.err conftest.$ac_objext \
8498
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
8499 8499
       conftest$ac_exeext conftest.$ac_ext
8500 8500
 LIBS=$ac_check_lib_save_LIBS
8501 8501
 fi
... ...
@@ -8551,7 +8555,7 @@ else
8551 8551
     lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
8552 8552
   lt_status=$lt_dlunknown
8553 8553
   cat > conftest.$ac_ext <<EOF
8554
-#line 8554 "configure"
8554
+#line 8558 "configure"
8555 8555
 #include "confdefs.h"
8556 8556
 
8557 8557
 #if HAVE_DLFCN_H
... ...
@@ -8649,7 +8653,7 @@ else
8649 8649
     lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
8650 8650
   lt_status=$lt_dlunknown
8651 8651
   cat > conftest.$ac_ext <<EOF
8652
-#line 8652 "configure"
8652
+#line 8656 "configure"
8653 8653
 #include "confdefs.h"
8654 8654
 
8655 8655
 #if HAVE_DLFCN_H
... ...
@@ -10423,7 +10427,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
10423 10423
 	ac_cv_lib_socket_bind=no
10424 10424
 fi
10425 10425
 
10426
-rm -f core conftest.err conftest.$ac_objext \
10426
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
10427 10427
       conftest$ac_exeext conftest.$ac_ext
10428 10428
 LIBS=$ac_check_lib_save_LIBS
10429 10429
 fi
... ...
@@ -10504,7 +10508,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
10504 10504
 	ac_cv_lib_nsl_gethostent=no
10505 10505
 fi
10506 10506
 
10507
-rm -f core conftest.err conftest.$ac_objext \
10507
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
10508 10508
       conftest$ac_exeext conftest.$ac_ext
10509 10509
 LIBS=$ac_check_lib_save_LIBS
10510 10510
 fi
... ...
@@ -10622,7 +10626,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
10622 10622
 	eval "$as_ac_var=no"
10623 10623
 fi
10624 10624
 
10625
-rm -f core conftest.err conftest.$ac_objext \
10625
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
10626 10626
       conftest$ac_exeext conftest.$ac_ext
10627 10627
 fi
10628 10628
 ac_res=`eval echo '${'$as_ac_var'}'`
... ...
@@ -10897,7 +10901,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
10897 10897
 	eval "$as_ac_var=no"
10898 10898
 fi
10899 10899
 
10900
-rm -f core conftest.err conftest.$ac_objext \
10900
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
10901 10901
       conftest$ac_exeext conftest.$ac_ext
10902 10902
 fi
10903 10903
 ac_res=`eval echo '${'$as_ac_var'}'`
... ...
@@ -10954,21 +10958,21 @@ $ac_includes_default
10954 10954
 #include <fcntl.h>
10955 10955
 #include <sys/mman.h>
10956 10956
 
10957
-#if !STDC_HEADERS && !HAVE_STDLIB_H
10957
+#if !defined STDC_HEADERS && !defined HAVE_STDLIB_H
10958 10958
 char *malloc ();
10959 10959
 #endif
10960 10960
 
10961 10961
 /* This mess was copied from the GNU getpagesize.h.  */
10962
-#if !HAVE_GETPAGESIZE
10962
+#ifndef HAVE_GETPAGESIZE
10963 10963
 /* Assume that all systems that can run configure have sys/param.h.  */
10964
-# if !HAVE_SYS_PARAM_H
10964
+# ifndef HAVE_SYS_PARAM_H
10965 10965
 #  define HAVE_SYS_PARAM_H 1
10966 10966
 # endif
10967 10967
 
10968 10968
 # ifdef _SC_PAGESIZE
10969 10969
 #  define getpagesize() sysconf(_SC_PAGESIZE)
10970 10970
 # else /* no _SC_PAGESIZE */
10971
-#  if HAVE_SYS_PARAM_H
10971
+#  ifdef HAVE_SYS_PARAM_H
10972 10972
 #   include <sys/param.h>
10973 10973
 #   ifdef EXEC_PAGESIZE
10974 10974
 #    define getpagesize() EXEC_PAGESIZE
... ...
@@ -11300,7 +11304,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
11300 11300
 	ac_cv_func_fseeko=no
11301 11301
 fi
11302 11302
 
11303
-rm -f core conftest.err conftest.$ac_objext \
11303
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
11304 11304
       conftest$ac_exeext conftest.$ac_ext
11305 11305
 fi
11306 11306
 { echo "$as_me:$LINENO: result: $ac_cv_func_fseeko" >&5
... ...
@@ -11696,7 +11700,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
11696 11696
 	ac_cv_lib_z_inflateEnd=no
11697 11697
 fi
11698 11698
 
11699
-rm -f core conftest.err conftest.$ac_objext \
11699
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
11700 11700
       conftest$ac_exeext conftest.$ac_ext
11701 11701
 LIBS=$ac_check_lib_save_LIBS
11702 11702
 fi
... ...
@@ -11786,7 +11790,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
11786 11786
 	ac_cv_lib_z_inflateEnd=no
11787 11787
 fi
11788 11788
 
11789
-rm -f core conftest.err conftest.$ac_objext \
11789
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
11790 11790
       conftest$ac_exeext conftest.$ac_ext
11791 11791
 LIBS=$ac_check_lib_save_LIBS
11792 11792
 fi
... ...
@@ -11888,7 +11892,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
11888 11888
 	ac_cv_lib_bz2_bzReadOpen=no
11889 11889
 fi
11890 11890
 
11891
-rm -f core conftest.err conftest.$ac_objext \
11891
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
11892 11892
       conftest$ac_exeext conftest.$ac_ext
11893 11893
 LIBS=$ac_check_lib_save_LIBS
11894 11894
 fi
... ...
@@ -12146,7 +12150,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
12146 12146
 	ac_cv_lib_sn_sigscan_sn_sigscan_initdb=no
12147 12147
 fi
12148 12148
 
12149
-rm -f core conftest.err conftest.$ac_objext \
12149
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
12150 12150
       conftest$ac_exeext conftest.$ac_ext
12151 12151
 LIBS=$ac_check_lib_save_LIBS
12152 12152
 fi
... ...
@@ -12402,7 +12406,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
12402 12402
 	ac_cv_lib_resolv___dn_expand=no
12403 12403
 fi
12404 12404
 
12405
-rm -f core conftest.err conftest.$ac_objext \
12405
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
12406 12406
       conftest$ac_exeext conftest.$ac_ext
12407 12407
 LIBS=$ac_check_lib_save_LIBS
12408 12408
 fi
... ...
@@ -12484,7 +12488,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
12484 12484
 	ac_cv_lib_resolv_dn_expand=no
12485 12485
 fi
12486 12486
 
12487
-rm -f core conftest.err conftest.$ac_objext \
12487
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
12488 12488
       conftest$ac_exeext conftest.$ac_ext
12489 12489
 LIBS=$ac_check_lib_save_LIBS
12490 12490
 fi
... ...
@@ -12915,7 +12919,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
12915 12915
 	ac_cv_lib_gmp___gmpz_init=no
12916 12916
 fi
12917 12917
 
12918
-rm -f core conftest.err conftest.$ac_objext \
12918
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
12919 12919
       conftest$ac_exeext conftest.$ac_ext
12920 12920
 LIBS=$ac_check_lib_save_LIBS
12921 12921
 fi
... ...
@@ -12999,7 +13003,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
12999 12999
 	ac_cv_lib_gmp_mpz_init=no
13000 13000
 fi
13001 13001
 
13002
-rm -f core conftest.err conftest.$ac_objext \
13002
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
13003 13003
       conftest$ac_exeext conftest.$ac_ext
13004 13004
 LIBS=$ac_check_lib_save_LIBS
13005 13005
 fi
... ...
@@ -13906,7 +13910,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
13906 13906
 	ac_cv_lib_milter_mi_stop=no
13907 13907
 fi
13908 13908
 
13909
-rm -f core conftest.err conftest.$ac_objext \
13909
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
13910 13910
       conftest$ac_exeext conftest.$ac_ext
13911 13911
 LIBS=$ac_check_lib_save_LIBS
13912 13912
 fi
... ...
@@ -13993,7 +13997,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
13993 13993
 
13994 13994
 fi
13995 13995
 
13996
-rm -f core conftest.err conftest.$ac_objext \
13996
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
13997 13997
       conftest$ac_exeext
13998 13998
   if test "${ac_cv_search_strlcpy+set}" = set; then
13999 13999
   break
... ...
@@ -14088,7 +14092,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
14088 14088
 	ac_cv_lib_milter_mi_stop=no
14089 14089
 fi
14090 14090
 
14091
-rm -f core conftest.err conftest.$ac_objext \
14091
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
14092 14092
       conftest$ac_exeext conftest.$ac_ext
14093 14093
 LIBS=$ac_check_lib_save_LIBS
14094 14094
 fi
... ...
@@ -14407,7 +14411,7 @@ sed 's/^/| /' conftest.$ac_ext >&5
14407 14407
 	eval "$as_ac_var=no"
14408 14408
 fi
14409 14409
 
14410
-rm -f core conftest.err conftest.$ac_objext \
14410
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
14411 14411
       conftest$ac_exeext conftest.$ac_ext
14412 14412
 fi
14413 14413
 ac_res=`eval echo '${'$as_ac_var'}'`
... ...
@@ -14898,11 +14902,11 @@ echo "${ECHO_T}no" >&6; }
14898 14898
                 LIBS=$save_LIBS
14899 14899
 fi
14900 14900
 
14901
-rm -f core conftest.err conftest.$ac_objext \
14901
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
14902 14902
       conftest$ac_exeext conftest.$ac_ext
14903 14903
 fi
14904 14904
 
14905
-rm -f core conftest.err conftest.$ac_objext \
14905
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
14906 14906
       conftest$ac_exeext conftest.$ac_ext
14907 14907
 else
14908 14908
   have_wrappers=no
... ...
@@ -15717,10 +15721,10 @@ main ()
15717 15717
 #ifndef __cplusplus
15718 15718
   /* Ultrix mips cc rejects this.  */
15719 15719
   typedef int charset[2];
15720
-  const charset x;
15720
+  const charset cs;
15721 15721
   /* SunOS 4.1.1 cc rejects this.  */
15722
-  char const *const *ccp;
15723
-  char **p;
15722
+  char const *const *pcpcc;
15723
+  char **ppc;
15724 15724
   /* NEC SVR4.0.2 mips cc rejects this.  */
15725 15725
   struct point {int x, y;};
15726 15726
   static struct point const zero = {0,0};
... ...
@@ -15729,11 +15733,11 @@ main ()
15729 15729
      an arm of an if-expression whose if-part is not a constant
15730 15730
      expression */
15731 15731
   const char *g = "string";
15732
-  ccp = &g + (g ? g-g : 0);
15732
+  pcpcc = &g + (g ? g-g : 0);
15733 15733
   /* HPUX 7.0 cc rejects these. */
15734
-  ++ccp;
15735
-  p = (char**) ccp;
15736
-  ccp = (char const *const *) p;
15734
+  ++pcpcc;
15735
+  ppc = (char**) pcpcc;
15736
+  pcpcc = (char const *const *) ppc;
15737 15737
   { /* SCO 3.2v4 cc rejects this.  */
15738 15738
     char *t;
15739 15739
     char const *s = 0 ? (char *) 0 : (char const *) 0;
... ...
@@ -15760,7 +15764,7 @@ main ()
15760 15760
     const int foo = 10;
15761 15761
     if (!foo) return 0;
15762 15762
   }
15763
-  return !x[0] && !zero.x;
15763
+  return !cs[0] && !zero.x;
15764 15764
 #endif
15765 15765
 
15766 15766
   ;
... ...
@@ -15925,7 +15929,8 @@ cat >>conftest.$ac_ext <<_ACEOF
15925 15925
 int
15926 15926
 main ()
15927 15927
 {
15928
-#if !BYTE_ORDER || !BIG_ENDIAN || !LITTLE_ENDIAN
15928
+#if  ! (defined BYTE_ORDER && defined BIG_ENDIAN && defined LITTLE_ENDIAN \
15929
+	&& BYTE_ORDER && BIG_ENDIAN && LITTLE_ENDIAN)
15929 15930
  bogus endian macros
15930 15931
 #endif
15931 15932
 
... ...
@@ -17023,7 +17028,7 @@ exec 6>&1
17023 17023
 # values after options handling.
17024 17024
 ac_log="
17025 17025
 This file was extended by $as_me, which was
17026
-generated by GNU Autoconf 2.60.  Invocation command line was
17026
+generated by GNU Autoconf 2.60a.  Invocation command line was
17027 17027
 
17028 17028
   CONFIG_FILES    = $CONFIG_FILES
17029 17029
   CONFIG_HEADERS  = $CONFIG_HEADERS
... ...
@@ -17052,7 +17057,7 @@ current configuration.
17052 17052
 Usage: $0 [OPTIONS] [FILE]...
17053 17053
 
17054 17054
   -h, --help       print this help, then exit
17055
-  -V, --version    print version number, then exit
17055
+  -V, --version    print version number and configuration settings, then exit
17056 17056
   -q, --quiet      do not print progress messages
17057 17057
   -d, --debug      don't remove temporary files
17058 17058
       --recheck    update $as_me by reconfiguring in the same conditions
... ...
@@ -17076,7 +17081,7 @@ _ACEOF
17076 17076
 cat >>$CONFIG_STATUS <<_ACEOF
17077 17077
 ac_cs_version="\\
17078 17078
 config.status
17079
-configured by $0, generated by GNU Autoconf 2.60,
17079
+configured by $0, generated by GNU Autoconf 2.60a,
17080 17080
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
17081 17081
 
17082 17082
 Copyright (C) 2006 Free Software Foundation, Inc.
... ...
@@ -142,6 +142,14 @@ libclamav_la_SOURCES = \
142 142
 	uuencode.c \
143 143
 	uuencode.h \
144 144
 	pst.c \
145
-	pst.h
145
+	pst.h \
146
+	phishcheck.c \
147
+	phishcheck.h \
148
+	phish_domaincheck_db.c \
149
+	phish_domaincheck_db.h \
150
+	phish_whitelist.c \
151
+	phish_whitelist.h \
152
+	regex_list.c \
153
+	regex_list.h
146 154
 
147 155
 lib_LTLIBRARIES = libclamav.la
... ...
@@ -86,7 +86,8 @@ am_libclamav_la_OBJECTS = matcher-ac.lo matcher-bm.lo matcher.lo \
86 86
 	packlibs.lo fsg.lo line.lo untar.lo unzip.lo special.lo \
87 87
 	binhex.lo is_tar.lo tnef.lo unrar15.lo unrarvm.lo unrar.lo \
88 88
 	unrarfilter.lo unrarppm.lo unrar20.lo unrarcmd.lo pdf.lo \
89
-	spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo
89
+	spin.lo yc.lo elf.lo sis.lo uuencode.lo pst.lo phishcheck.lo \
90
+	phish_domaincheck_db.lo phish_whitelist.lo regex_list.lo
90 91
 libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
91 92
 DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
92 93
 depcomp = $(SHELL) $(top_srcdir)/depcomp
... ...
@@ -341,7 +342,15 @@ libclamav_la_SOURCES = \
341 341
 	uuencode.c \
342 342
 	uuencode.h \
343 343
 	pst.c \
344
-	pst.h
344
+	pst.h \
345
+	phishcheck.c \
346
+	phishcheck.h \
347
+	phish_domaincheck_db.c \
348
+	phish_domaincheck_db.h \
349
+	phish_whitelist.c \
350
+	phish_whitelist.h \
351
+	regex_list.c \
352
+	regex_list.h
345 353
 
346 354
 lib_LTLIBRARIES = libclamav.la
347 355
 all: all-am
... ...
@@ -440,10 +449,14 @@ distclean-compile:
440 440
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pdf.Plo@am__quote@
441 441
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pe.Plo@am__quote@
442 442
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/petite.Plo@am__quote@
443
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_domaincheck_db.Plo@am__quote@
444
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phish_whitelist.Plo@am__quote@
445
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phishcheck.Plo@am__quote@
443 446
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pst.Plo@am__quote@
444 447
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/qtmd.Plo@am__quote@
445 448
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/readdb.Plo@am__quote@
446 449
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rebuildpe.Plo@am__quote@
450
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/regex_list.Plo@am__quote@
447 451
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scanners.Plo@am__quote@
448 452
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sis.Plo@am__quote@
449 453
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/snprintf.Plo@am__quote@
... ...
@@ -23,8 +23,10 @@
23 23
 typedef struct tag_arguments_tag
24 24
 {
25 25
         int count;
26
+	int scanContents;
26 27
         unsigned char **tag;
27 28
         unsigned char **value;
29
+	struct blob   **contents; 
28 30
 } tag_arguments_t;
29 31
 
30 32
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs);
31 33
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+/*
1
+ *  Phishing module: iana tld list.
2
+ *
3
+ *  Copyright (C) 2006 Torok Edvin <edwintorok@gmail.com>
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ */
21
+
22
+#ifndef IANA_TLD_H
23
+#define IANA_TLD_H
24
+#define iana_tld "(A[CDEFGILMNOQRSTUWXZ]|B[ABDEFGHIJMNORSTVWYZ]|C[ACDFGHIKLMNORUVXYZ]|D[EJKMOZ]|E[CEGRSTU]|F[IJKMOR]|G[ABDEFGHILMNPQRSTUWY]|H[KMNRTU]|I[DELMNOQRST]|J[EMOP]|K[EGHIMNRWYZ]|L[ABCIKRSTUVY]|M[ACDGHKLMNOPQRSTUVWXYZ]|N[ACEFGILOPRUZ]|OM|P[AEFGHKLMNRSTWY]|QA|R[EOUW]|S[ABCDEGHIJKLMNORTUVYZ]|T[CDFGHJKLMNOPRTVWZ]|U[AGKMSYZ]|V[ACEGINU]|W[FS]|Y[ETU]|Z[AMW]|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|AERO|ARPA|COOP|INFO|JOBS|MOBI|NAME|MUSEUM)"
25
+#define iana_cctld "(a[dfilmoqrtuwxz]|b[bdeghijmorstwyz]|c[ahlmnosuy]|d[ejkmz]|e[cegrstu]|f[ijr]|g[abdeghilmnprtuwy]|h[nrtu]|i[delnqst]|j[emop]|k[eghimwz]|l[birstuv]|m[acglmnoqrstuvwxyz]|n[aegilopru]|om|p[aehkltwy]|qa|r[ow]|s[cdeginorz]|t[dghjklmnorvwz]|u[agyz]|v[enu]|ws|y[etu])"
26
+#endif
27
+
0 28
new file mode 100644
... ...
@@ -0,0 +1,129 @@
0
+/*
1
+ *  Phishing module: domain list implementation.
2
+ *
3
+ *  Copyright (C) 2006 Torok Edvin <edwintorok@gmail.com>
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ *  $Log: phish_domaincheck_db.c,v $
21
+ *  Revision 1.1  2006/09/12 19:38:39  acab
22
+ *  Phishing module merge - libclamav
23
+ *
24
+ *  Revision 1.3  2006/08/20 21:18:11  edwin
25
+ *  Added the script used to generate iana_tld.sh
26
+ *  Added checks for phish_domaincheck_db
27
+ *  Added phishing module design document from wiki (as discussed with aCaB).
28
+ *  Updated .wdb/.pdb format documentation (in regex_list.c)
29
+ *  Fixed some memory leaks in regex_list.c
30
+ *  IOW: cleanups before the deadline.
31
+ *  I consider my module to be ready for evaluation now.
32
+ *
33
+ *  Revision 1.2  2006/08/09 16:26:44  edwin
34
+ *  Forgot to add these files
35
+ *
36
+ */
37
+
38
+
39
+#if HAVE_CONFIG_H
40
+#include "clamav-config.h"
41
+#endif
42
+
43
+#ifdef CL_EXPERIMENTAL
44
+
45
+#ifndef CL_DEBUG
46
+#define NDEBUG
47
+#endif
48
+
49
+#ifdef CL_THREAD_SAFE
50
+#ifndef _REENTRANT
51
+#define _REENTRANT
52
+#endif
53
+#endif
54
+
55
+#include <stdio.h>
56
+#include <stdlib.h>
57
+#include <errno.h>
58
+#include <assert.h>
59
+#include <string.h>
60
+#include <strings.h>
61
+#include <ctype.h>
62
+
63
+#include <limits.h>
64
+#include "clamav.h"
65
+#include <sys/types.h>
66
+
67
+/*#define USE_PCRE*/
68
+#include <regex.h>
69
+
70
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
71
+#include <stddef.h>
72
+#endif
73
+
74
+#include "others.h"
75
+#include "defaults.h"
76
+#include "str.h"
77
+#include "filetypes.h"
78
+#include "mbox.h"
79
+#include "phish_domaincheck_db.h"
80
+#include "regex_list.h"
81
+#include "matcher-ac.h"
82
+
83
+
84
+static struct regex_matcher domainlist_matcher;
85
+
86
+int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags)
87
+{
88
+	const char* info;
89
+	int rc = regex_list_match(&domainlist_matcher,real_url,display_url,hostOnly,&info);
90
+	if(rc && info && info[0]) {/*match successful, and has custom flags*/
91
+		if(strlen(info)==3 && isxdigit(info[0]) && isxdigit(info[1]) && isxdigit(info[2])) {
92
+			unsigned short notwantedflags=0;
93
+			sscanf(info,"%hx",&notwantedflags);
94
+		        *flags &= ~notwantedflags;/* filter unwanted phishcheck flags */	
95
+		}
96
+		else {
97
+			cli_warnmsg("Phishcheck:Unknown flag format in domainlist, 3 hex digits expected");
98
+		}
99
+	}
100
+	return rc;
101
+}
102
+
103
+int init_domainlist(void)
104
+{
105
+	return	init_regex_list(&domainlist_matcher);
106
+}
107
+
108
+int is_domainlist_ok(void)
109
+{
110
+	return is_regex_ok(&domainlist_matcher);
111
+}
112
+
113
+int cli_loadpdb(FILE* fd,unsigned int options)
114
+{
115
+	return load_regex_matcher(&domainlist_matcher,fd,options);
116
+}
117
+
118
+void domainlist_cleanup(void)
119
+{
120
+	regex_list_cleanup(&domainlist_matcher);
121
+}
122
+
123
+void domainlist_done(void)
124
+{
125
+	regex_list_done(&domainlist_matcher);
126
+}
127
+
128
+#endif
... ...
@@ -19,6 +19,9 @@
19 19
  *  MA 02110-1301, USA.
20 20
  *
21 21
  */
22
+
23
+#ifdef CL_EXPERIMENTAL
24
+
22 25
 #ifndef _PHISH_DOMAINCHECK_DB_H
23 26
 #define _PHISH_DOMAINCHECK_DB_H
24 27
 
... ...
@@ -31,3 +34,5 @@ int is_domainlist_ok(void);
31 31
 int domainlist_match(const char* real_url,const char* display_url,int hostOnly,unsigned short* flags);
32 32
 
33 33
 #endif
34
+
35
+#endif
34 36
new file mode 100644
... ...
@@ -0,0 +1,157 @@
0
+/*
1
+ *  Phishing module: whitelist implementation.
2
+ *
3
+ *  Copyright (C) 2006 Torok Edvin <edwintorok@gmail.com>
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ *  $Log: phish_whitelist.c,v $
21
+ *  Revision 1.1  2006/09/12 19:38:39  acab
22
+ *  Phishing module merge - libclamav
23
+ *
24
+ *  Revision 1.16  2006/08/06 20:27:07  edwin
25
+ *  New option to enable phish scan for all domains (disabled by default).
26
+ *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
27
+ *  Updated phishcheck control flow to better incorporate the domainlist.
28
+ *  Updated manpage with new options.
29
+ *
30
+ *  TODO:there is a still-reachable leak in regex_list.c
31
+ *
32
+ *  Revision 1.15  2006/07/31 20:12:30  edwin
33
+ *  Preliminary support for domain databases (domains to check by phishmodule)
34
+ *  Better memory allocation failure handling in regex_list
35
+ *
36
+ */
37
+
38
+#if HAVE_CONFIG_H
39
+#include "clamav-config.h"
40
+#endif
41
+
42
+#ifdef CL_EXPERIMENTAL
43
+
44
+#ifndef CL_DEBUG
45
+#define NDEBUG
46
+#endif
47
+
48
+#ifdef CL_THREAD_SAFE
49
+#ifndef _REENTRANT
50
+#define _REENTRANT
51
+#endif
52
+#endif
53
+
54
+#include <stdio.h>
55
+#include <stdlib.h>
56
+#include <errno.h>
57
+#include <assert.h>
58
+#include <string.h>
59
+#include <strings.h>
60
+#include <ctype.h>
61
+
62
+#include <limits.h>
63
+#include "clamav.h"
64
+#include <sys/types.h>
65
+
66
+/*#define USE_PCRE*/
67
+#include <regex.h>
68
+
69
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
70
+#include <stddef.h>
71
+#endif
72
+
73
+#include "others.h"
74
+#include "defaults.h"
75
+#include "str.h"
76
+#include "filetypes.h"
77
+#include "mbox.h"
78
+#include "phish_whitelist.h"
79
+#include "regex_list.h"
80
+#include "matcher-ac.h"
81
+
82
+
83
+static struct regex_matcher whitelist_matcher;
84
+
85
+int whitelist_match(const char* real_url,const char* display_url,int hostOnly)
86
+{
87
+	const char* info;/*unused*/
88
+	return	regex_list_match(&whitelist_matcher,real_url,display_url,hostOnly,&info);
89
+}
90
+
91
+int init_whitelist(void)
92
+{
93
+	return	init_regex_list(&whitelist_matcher);
94
+}
95
+
96
+int is_whitelist_ok(void)
97
+{
98
+	return is_regex_ok(&whitelist_matcher);
99
+}
100
+
101
+int cli_loadwdb(FILE* fd,unsigned int options)
102
+{
103
+	return load_regex_matcher(&whitelist_matcher,fd,options);
104
+}
105
+
106
+void whitelist_cleanup(void)
107
+{
108
+	regex_list_cleanup(&whitelist_matcher);
109
+}
110
+
111
+void whitelist_done(void)
112
+{
113
+	regex_list_done(&whitelist_matcher);
114
+}
115
+
116
+#define WHITELIST_TEST
117
+#ifdef WHITELIST_TEST
118
+int main(int argc,char* argv[])
119
+{
120
+/*	struct tree_node* root=tree_node_alloc(NULL,1);
121
+	const  char* info;
122
+	const  unsigned char test[]="tesxt";
123
+	setup_matcher();
124
+	root->op=OP_ROOT;
125
+	root->c=0;
126
+	root->next=NULL;
127
+	root->listend=1;
128
+	dump_tree(root);
129
+	add_pattern(&root,"test","1");
130
+	dump_tree(root);
131
+	add_pattern(&root,"tesv","2");
132
+	dump_tree(root);
133
+	add_pattern(&root,"tert","3");
134
+	dump_tree(root);
135
+	add_pattern(&root,"terr+","4");
136
+	dump_tree(root);
137
+	add_pattern(&root,"tes[xy]t","5");
138
+	dump_tree(root);
139
+	match_node(root,test,sizeof(test),&info);
140
+	destroy_tree(root);
141
+	if(info)
142
+		printf("%s\n",info);
143
+	else printf("not found\n");*/
144
+	/*FILE* f=fopen("w.wdb","r");
145
+	init_whitelist();
146
+	load_whitelist(f);
147
+	fclose(f);
148
+	dump_tree(root_regex);
149
+	build_whitelist();
150
+	printf("%d\n",whitelist_match("http://www.google.ro","http://www.google.me.ro",0));
151
+	whitelist_done();*/
152
+	return 0;
153
+}
154
+#endif
155
+
156
+#endif
... ...
@@ -20,8 +20,10 @@
20 20
  *
21 21
  */
22 22
 
23
-#ifndef _WHITELIST_H
24
-#define _WHITELIST_H
23
+#ifdef CL_EXPERIMENTAL
24
+
25
+#ifndef _PHISH_WHITELIST_H
26
+#define _PHISH_WHITELIST_H
25 27
 
26 28
 int cli_loadwdb(FILE* fd, unsigned int options);
27 29
 int build_whitelist(void);
... ...
@@ -32,3 +34,5 @@ int is_whitelist_ok(void);
32 32
 int whitelist_match(const char* real_url,const char* display_url,int hostOnly);
33 33
 
34 34
 #endif
35
+
36
+#endif
35 37
new file mode 100644
... ...
@@ -0,0 +1,1258 @@
0
+/*
1
+ *  Detect phishing, based on URL spoofing detection.
2
+ *
3

                
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ *  $Log: phishcheck.c,v $
21
+ *  Revision 1.1  2006/09/12 19:38:39  acab
22
+ *  Phishing module merge - libclamav
23
+ *
24
+ *  Revision 1.28  2006/09/09 09:49:27  edwin
25
+ *  Fix Solaris compilation problem
26
+ *
27
+ *  Revision 1.27  2006/08/28 08:43:06  edwin
28
+ *  Fixed a few minor leaks.
29
+ *  Valgrind now says:"All heap blocks were freed -- no leaks are possible"
30
+ *
31
+ *  Revision 1.26  2006/08/20 21:18:11  edwin
32
+ *  Added the script used to generate iana_tld.sh
33
+ *  Added checks for phish_domaincheck_db
34
+ *  Added phishing module design document from wiki (as discussed with aCaB).
35
+ *  Updated .wdb/.pdb format documentation (in regex_list.c)
36
+ *  Fixed some memory leaks in regex_list.c
37
+ *  IOW: cleanups before the deadline.
38
+ *  I consider my module to be ready for evaluation now.
39
+ *
40
+ *  Revision 1.25  2006/08/19 21:08:47  edwin
41
+ *  Fixed:Forgot to add form tag handling when it contains images.
42
+ *  Various fixes to get rid of gcc warnings.
43
+ *
44
+ *  Revision 1.24  2006/08/19 13:30:34  edwin
45
+ *  iana_tld.h was missing from the list of header files.
46
+ *  commentedout network code (unused currently)
47
+ *
48
+ *  Revision 1.23  2006/08/17 20:31:43  edwin
49
+ *  Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off.
50
+ *  Fix a still reachable leak. Remove unneeded build_regex_list export.
51
+ *
52
+ *  Revision 1.22  2006/08/12 14:35:34  edwin
53
+ *  Fix some compiler warnings.
54
+ *  Fix an assertion failure in regex_list.
55
+ *  Interpret display links that start with http|https|ftp, always as an URL.
56
+ *
57
+ *  Revision 1.21  2006/08/06 20:27:07  edwin
58
+ *  New option to enable phish scan for all domains (disabled by default).
59
+ *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
60
+ *  Updated phishcheck control flow to better incorporate the domainlist.
61
+ *  Updated manpage with new options.
62
+ *
63
+ *  TODO:there is a still-reachable leak in regex_list.c
64
+ *
65
+ *  Revision 1.20  2006/08/01 20:19:14  edwin
66
+ *  Integrate domainlist check into phishcheck. Warning: enabled by default.
67
+ *  Regex bracket handling update.
68
+ *  Better regex paranthesized & alternate expression handling.
69
+ *
70
+ 
71
+case CL_PHISH_HOST_NOT_LISTED:
72
+ return "Host not listed in .pdb -> not checked";*  Revision 1.19  2006/07/31 20:12:30  edwin
73
+ *  Preliminary support for domain databases (domains to check by phishmodule)
74
+ *  Better memory allocation failure handling in regex_list
75
+ *
76
+ */
77
+
78
+#if HAVE_CONFIG_H
79
+#include "clamav-config.h"
80
+#endif
81
+
82
+#ifdef CL_EXPERIMENTAL
83
+
84
+#ifndef CL_DEBUG
85
+#define NDEBUG
86
+#endif
87
+
88
+#ifdef CL_THREAD_SAFE
89
+#ifndef _REENTRANT
90
+#define _REENTRANT
91
+#endif
92
+#endif
93
+
94
+#include <stdio.h>
95
+#include <stdlib.h>
96
+#include <errno.h>
97
+#include <assert.h>
98
+#include <string.h>
99
+#include <strings.h>
100
+#include <ctype.h>
101
+#include <limits.h>
102
+#include <clamav.h>
103
+#include <netdb.h>
104
+#include <netinet/in.h>
105
+
106
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
107
+#include <stddef.h>
108
+#endif
109
+
110
+#include <sys/types.h>
111
+#include <sys/socket.h>
112
+#include <regex.h>
113
+
114
+#include "others.h"
115
+#include "defaults.h"
116
+#include "str.h"
117
+#include "filetypes.h"
118
+#include "mbox.h"
119
+#include "htmlnorm.h"
120
+#include "phishcheck.h"
121
+#include "phish_whitelist.h"
122
+#include "phish_domaincheck_db.h"
123
+#include "iana_tld.h"
124
+
125
+#define DOMAIN_REAL 1
126
+#define DOMAIN_DISPLAY 0
127
+
128
+#define PHISHY_USERNAME_IN_URL 1
129
+#define PHISHY_NUMERIC_IP      2
130
+#define REAL_IS_MAILTO         4
131
+/* this is just a flag, so that the displayed url will be parsed as mailto too, for example
132
+ * <a href='mailto:somebody@yahoo.com'>to:somebody@yahoo.com</a>*/
133
+#define DOMAIN_LISTED          8
134
+#define PHISHY_CLOAKED_NULL    16
135
+#define PHISHY_HEX_URL         32
136
+
137
+
138
+/*
139
+* Phishing design documentation, 
140
+(initially written at http://wiki.clamav.net/index.php/phishing_design as discussed with aCaB)
141
+
142
+*Warning*: if flag *--phish-scan-alldomains* (or equivalent clamd/clamav-milter config option) isn't given, then phishing scanning is done only for domains listed in daily.pdb.
143
+If your daily.pdb is empty, then by default NO PHISHING is DONE, UNLESS you give the *--phish-scan-alldomains*
144
+This is just a side-effect: daily.pdb is empty because it isn't yet officially in daily.cvd.
145
+
146
+phishingCheck() determines if @displayedLink is  a legit representation of @realLink.
147
+
148
+Steps:
149
+
150
+1. if _realLink_ *==* _displayLink_ => *CLEAN*
151
+
152
+2. url cleanup (normalization)
153
+- whitespace elimination
154
+- html entity conversion
155
+- convert hostname to lowercase
156
+- normalize \ to /
157
+If there is a dot after the last space, then all spaces are replaced with dots,
158
+otherwise spaces are stripped.
159
+So both: 'Go to yahoo.com', and 'Go to e b a y . c o m', and 'Go to ebay. com' will work.
160
+
161
+
162
+3. Match the URLs against a _whitelist_:
163
+a _realLink_, _displayedLink_ pair is matched against the _whitelist_.
164
+the _whitelist_ is a list of pairs of realLink, displayedLink. Any of the elements of those pairs can be a _regex_.
165
+ if url *is found* in _whitelist_ --> *CLEAN*
166
+
167
+4. URL is looked up in the _domainlist_, unless disabled via flags (_--phish-scan-alldomains_).
168
+The _domainlist_ is a list of pairs of realLink, displayedLink (any of which can be regex).
169
+This is the list of domains we do phishing detection for (such as ebay,paypal,chase,....)
170
+We can't decide to stop processing here or not, so we just set a flag.
171
+
172
+Note(*!*): the flags are modified by the domainlist checker. If domain is found, then the flags associated with it filter the default compile-time flags.
173
+
174
+5. _Hostname_ is extracted from the _displayed URL_.
175
+It is checked against the _whitelist_, and _domainlist_.
176
+
177
+6. Now we know if we want to stop processing.
178
+If we are only scanning domains in the _domainlist_ (default behaviour), and the url/domain
179
+isn't found in it, we return (and mark url as not_list/clean).
180
+If we scan all domains, then the domainlist isn't even checked.
181
+
182
+7. URL cloak check.
183
+check for %00, and hex-encoded IPs in URL.
184
+
185
+8. Skip empty displayedURLs
186
+
187
+9. SSL mismatch detection.
188
+Checks if realLink is http, but displayedLink is https or vice versa.
189
+(by default the SSL detection is done for hrefs only, not for imgs)
190
+
191
+10. Hostname of real URL is extracted.
192
+
193
+11. Skip cid: displayedLink urls (images embedded in mails).
194
+
195
+12. Numeric IP detection.
196
+If url is a numeric IP, then -> phish.
197
+Maybe we should do DNS lookup?
198
+Maybe we should disable numericIP checks for --phish-scan-alldomains?
199
+
200
+13. isURL(displayedLink).
201
+Checks if displayedLink is really a url.
202
+if not -> clean
203
+
204
+14. Hostnames of real, displayedLink are compared. If equal -> clean
205
+
206
+15. Extract domain names, and compare. If equal -> clean
207
+
208
+16. Do DNS lookups/reverse lookups. Disabled now (too much load/too many lookups). *
209
+
210
+For the Whitelist(.wdb)/Domainlist(.pdb) format see regex_list.c (search for Flags)
211
+ *
212
+ */
213
+static char empty_string[]="";
214
+
215
+void url_check_init(struct url_check* urls)
216
+{
217
+	urls->realLink.refcount=0;
218
+	urls->realLink.data=empty_string;
219
+	urls->realLink.ref=NULL;
220
+	urls->displayLink.refcount=0;
221
+	urls->displayLink.data=empty_string;
222
+	urls->displayLink.ref=NULL;
223
+}
224
+
225
+/* string reference counting implementation,
226
+ * so that: we don't have to keep in mind who allocated what, and when needs to be freed,
227
+ * and thus we won't leak memory*/
228
+
229
+inline void string_free(struct string* str)
230
+{
231
+	for(;;){ 
232
+		str->refcount--;
233
+		if(!str->refcount) {
234
+			if(str->ref)/* don't free, this is a portion of another string */
235
+				str=str->ref;/* try to free that one*/
236
+			else {
237
+				free(str->data);
238
+				break;
239
+			}
240
+		}
241
+		else break;
242
+	} 
243
+}
244
+
245
+/* always use the string_assign when assigning to a string, this makes sure the old one's refcount is decremented*/
246
+void string_assign(struct string* dest,struct string* src)
247
+{
248
+	string_free(dest);
249
+	src->refcount++;
250
+	dest->data=src->data;
251
+	dest->refcount=1;
252
+	dest->ref=src;
253
+}
254
+
255
+/* data will be freed when string freed */
256
+void string_assign_c(struct string* dest,char* data)
257
+{
258
+	string_free(dest);
259
+	dest->data=data;
260
+	dest->ref=NULL;
261
+	dest->refcount=1;
262
+}
263
+
264
+/* same as above, but it doesn't free old string, use only for initialization
265
+ * Doesn't allow NULL pointers, they are replaced by pointer to empty string
266
+ * */
267
+inline void string_init_c(struct string* dest,char* data)
268
+{
269
+	dest->refcount = 1;
270
+	dest->data = data ? data : empty_string;
271
+	dest->ref = NULL;
272
+}
273
+
274
+/* make a copy of the string between start -> end*/
275
+inline void string_assign_dup(struct string* dest,const char* start,const char* end)
276
+{
277
+	char*	    ret  = cli_malloc(end-start+1);
278
+	strncpy(ret,start,end-start);
279
+	ret[end-start]='\0';
280
+
281
+	string_free(dest);
282
+	dest->data=ret;
283
+	dest->refcount=1;
284
+	dest->ref=NULL;
285
+}
286
+
287
+inline void string_assign_null(struct string* dest)
288
+{
289
+	string_free(dest);
290
+	dest->data=empty_string;
291
+	dest->refcount=-1;/* don't free it! */
292
+	dest->ref=NULL;
293
+}
294
+
295
+/* this string uses portion of another string*/
296
+void string_assign_ref(struct string* dest,struct string* ref,char* data)
297
+{
298
+	string_free(dest);
299
+	ref->refcount++;
300
+	dest->data=data;
301
+	dest->refcount=1;
302
+	dest->ref=ref;
303
+}
304
+
305
+inline void free_if_needed(struct url_check* url)
306
+{
307
+	string_free(&url->realLink);
308
+	string_free(&url->displayLink);
309
+}
310
+
311
+static int phish_disabled = 0;/* disabled due to fatal startup error */
312
+static int build_regex(regex_t** preg,const char* regex,int nosub)
313
+{
314
+	int rc;
315
+	*preg = cli_malloc(sizeof(**preg));
316
+	cli_dbgmsg("Compiling regex:%s\n",regex);
317
+	rc = regcomp(*preg,regex,REG_EXTENDED|REG_ICASE|(nosub ? REG_NOSUB :0));
318
+	if(rc) {
319
+		size_t buflen =	regerror(rc,*preg,NULL,0);
320
+		char*  errbuf = cli_malloc(buflen);
321
+		regerror(rc,*preg,errbuf,buflen);
322
+		cli_errmsg("Error in compiling regex:%s\nDisabling phishing checks\n",errbuf);
323
+		free(errbuf);
324
+		free(*preg);
325
+		*preg=NULL;
326
+		phish_disabled=1;
327
+		return 1;
328
+	}
329
+	return 0;
330
+}
331
+
332
+
333
+/*static regex_t* host_preg = NULL;
334
+static const char* host_regex="cid:.+|mailto:(.+)|([[:alpha:]]+://)?(([^:/?]+@)+([^:/?]+)([:/?].+)?|([^@:/?]+)([:/?].+)?)"; <- this is slower than the function below
335
+*/
336
+/* Extracts the host part of URL and stores a newly allocated copy of it in dest
+ * (through string_assign_dup); a NULL URL yields the empty "null" string.
+ * isReal: non-zero when URL is the real link, zero when it is the displayed one.
+ * phishy: in/out bit mask; PHISHY_USERNAME_IN_URL and REAL_IS_MAILTO may be set here.
+ * allocates memory */
336
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy)
337
+{
338
+	const char mailto[] = "mailto:";
339
+	int ismailto = 0;
340
+	const char* start;
341
+	const char* end=NULL;
342
+	if(!URL) {
343
+		string_assign_null(dest);
344
+		return;
345
+	}
346
+	start = strstr(URL,"://"); /* look for a protocol separator */
347
+	if(!start) {
348
+		if(!strncmp(URL,mailto,sizeof(mailto)-1)) {
349
+			start = URL + sizeof(mailto)-1;
350
+			ismailto = 1;
351
+		}
352
+		else if (!isReal && *phishy&REAL_IS_MAILTO) {
353
+			/* it is not required to use mailto: in the displayed url, they might use to:, or whatever */
354
+			end = URL+strlen(URL)+1; /* this value is replaced inside the loop below, which always runs when !isReal */
355
+			start = URL + strcspn(URL,": ")+1;
356
+			if (start==end) 
357
+				start = URL;
358
+			ismailto = 1;
359
+		}
360
+		else {
361
+/*			if(!strncmp(URL,"cid:",4)) {handled in phishcheck
362
+				string_assign_null(dest);
363
+				return;* cid: image, nothing to verify
364
+			}
365
+*/
366
+			start=URL;/*URL without protocol*/
367
+			if(isReal)
368
+				cli_dbgmsg("PH:Real URL without protocol:%s\n",URL);
369
+			else ismailto=2;/*no-protocol, might be mailto, @ is no problem*/
370
+		}
371
+	}
372
+	else start += 3;/* :// */
373
+	
374
+	if(!ismailto || !isReal) { /* every displayed URL, and real URLs that are not mailto: */
375
+		const char* realhost;
376
+		do {
377
+			end	 = start+strcspn(start,":/?");
378
+			realhost = strchr(start,'@');
379
+			if(start!=end && realhost>end) realhost = NULL;/*don't check beyond end of hostname*/
380
+			if(realhost) {
381
+				const char* tld = strrchr(realhost,'.');
382
+				if(tld && isTLD(tld,tld-realhost-1))
383
+					*phishy |= PHISHY_USERNAME_IN_URL;/* if the url contains a username that is there just to fool people,
384
+					like http://www.ebay.com@somevilplace.someevildomain.com/ */
385
+				start=realhost+1;/*skip the username*/
386
+			}
387
+		} while(realhost);/*skip over multiple @ characters, text following last @ character is the real host*/
388
+	}
389
+	else 
390
+	if (ismailto && isReal)
391
+		*phishy |= REAL_IS_MAILTO;
392
+
393
+	if(!end) { /* end is still NULL only when the loop above was skipped (real mailto: URL) */
394
+		end  = start+strcspn(start,":/?");/*especially important for mailto:somebody@yahoo.com?subject=...*/
395
+		if(!end) /* NOTE(review): end was just assigned a non-NULL pointer, so this branch looks unreachable */
396
+			end  = start + strlen(start);
397
+	}
398
+
399
+	string_assign_dup(dest,start,end); /* copies the characters in [start,end) */
400
+}
402
+
403
+static regex_t* preg = NULL;
404
+static regex_t* preg_tld = NULL;
405
+static regex_t* preg_cctld = NULL;
406
+static regex_t* preg_numeric = NULL;
407
+
408
+static const char tld_regex[] = "^"iana_tld"$";
409
+static const char cctld_regex[] = "^"iana_cctld"$";
410
+
411
+int isCountryCode(const char* str)
412
+{
413
+	if(!preg_cctld) {
414
+		if(build_regex(&preg_cctld,cctld_regex,1))
415
+			return -1;
416
+	}
417
+	return str ? !regexec(preg_cctld,str,0,NULL,0) : 0;
418
+}
419
+
420
+int isTLD(const char* str,int len)
421
+{
422
+	if (!str)
423
+		return 0;
424
+	else {
425
+		char*	s  = cli_malloc(len+1);
426
+		int rc;
427
+		strncpy(s,str,len);
428
+		s[len]='\0';
429
+		if(!preg_tld) {
430
+			if(build_regex(&preg_tld,tld_regex,1))
431
+				return -1;
432
+		}
433
+		rc = !regexec(preg_tld,s,0,NULL,0);
434
+		free(s);
435
+		return rc;
436
+	}
437
+}
438
+
439
+/*
440
+ * memrchr isn't standard, so I use this
441
+ */
442
/* Searches backwards for @c in start[0..len] (index @len included).
 * Returns a pointer to the last occurrence, or NULL if there is none.
 * Callers compute @len as "position - 1", which wraps to (size_t)-1 when the
 * position is 0; that value means "nothing to search" and yields NULL.
 * The scan is index based so no pointer before @start is ever formed. */
char* rfind(char* start,char c,size_t len)
{
	size_t i;

	if(!start || len==(size_t)-1)
		return NULL;
	for(i=len+1;i-- > 0;) {
		if(start[i]==c)
			return start+i;
	}
	return NULL;
}
448
+
449
+void get_domain(struct string* dest,struct string* host)
450
+{
451
+	char* domain;
452
+	char* tld = strrchr(host->data,'.');
453
+	if(!tld) {
454
+		cli_dbgmsg("PH:What? A host without a tld? (%s)\n",host->data);
455
+		string_assign(dest,host);
456
+		return;
457
+	}
458
+	if(isCountryCode(tld+1)) {
459
+		const char* countrycode=tld+1;
460
+		tld = rfind(host->data,'.',tld-host->data-1);
461
+		if(!tld) {
462
+			cli_dbgmsg("PH:Weird, a name with only 2 levels (%s)\n",host);
463
+			string_assign(dest,host);
464
+			return;
465
+		}
466
+		if(!isTLD(tld+1,countrycode-tld-1)) {
467
+			string_assign_ref(dest,host,tld+1);
468
+			return;/*it was a name like: subdomain.domain.uk, return domain.uk*/
469
+		}
470
+	}
471
+	/*we need to strip one more level, this is the actual domain*/
472
+	domain = rfind(host->data,'.',tld-host->data-1);
473
+	if(!domain) {
474
+		string_assign(dest,host);
475
+		return;/* it was like sourceforge.net?*/
476
+	}
477
+	string_assign_ref(dest,host,domain+1);
478
+}
479
+
480
+
481
+/*
482
+int ip_reverse(struct url_check* urls,int isReal)
483
+{
484
+	const char* host = isReal ? urls->realLink.data : urls->displayLink.data;
485
+	struct hostent *he = gethostbyname (host);
486
+	if (he)
487
+	{
488
+		char *addr = 0;
489
+		switch (he->h_addrtype)
490
+		{
491
+			case AF_INET:
492
+			  addr = inet_ntoa (*(struct in_addr *) he->h_addr);
493
+			  break;
494
+		}
495
+		if (addr && strcmp (he->h_name, addr) == 0)
496
+		{
497
+			char *h_addr_copy = strdup (he->h_addr);
498
+			if (h_addr_copy == NULL)
499
+			    he = NULL;
500
+			else
501
+			{
502
+			      he = gethostbyaddr (h_addr_copy, he->h_length, he->h_addrtype);
503
+			      free (h_addr_copy);
504
+			}
505
+		}
506
+	     if (he)
507
+		string_assign_dup(isReal ? &urls->realLink : &urls->displayLink,he->h_name,he->h_name+strlen(he->h_name));
508
+    }
509
+    return 0;
510
+}
511
+* frees its argument, and allocates memory*
512
+void reverse_lookup(struct url_check* url,int isReal)
513
+{
514
+	ip_reverse(url,isReal);
515
+}
516
+*/
517
/* Returns 1 when @host is a dotted-quad IPv4 address whose four fields all
 * lie in 0..255 and that spans the whole string, 0 otherwise (also for NULL). */
int isNumeric(const char* host)
{
	size_t len;
	int a,b,c,d,n=0;

	if(!host)
		return 0;
	len = strlen(host);
	/* 1.2.3.4 -> 7*/
	/* 127.127.127.127 -> 15*/
	if(len<7 || len>15)
		return 0;
	/* all four fields must convert, and %n must report the full length */
	if(sscanf(host,"%d.%d.%d.%d%n",&a,&b,&c,&d,&n)!=4)
		return 0;
	if(n<0 || (size_t)n!=len)
		return 0;
	return a>=0 && a<=255 && b>=0 && b<=255 && c>=0 && c<=255 && d>=0 && d<=255;
}
531
+
532
/* Returns 1 when @URL starts with the https:// scheme, 0 otherwise (also for
 * NULL).  The scheme is compared without regard to case, so "HTTPS://..."
 * is recognised as well. */
int isSSL(const char* URL)
{
	static const char https[]="https://";

	if(!URL)
		return 0;
	return strncasecmp(https,URL,sizeof(https)-1)==0;
}
537
+
538
+static int hexinited=0;
539
+static short int hextable[256];
540
+static inline char hex2int(const unsigned char* src)
541
+{
542
+	assert(hexinited);
543
+	return hextable[src[0]]<<4 | hextable[src[1]];
544
+}
545
+
546
+
547
+/* Decodes %xx escapes in place in the string delimited by *begin and *end,
547
+ * converting the two characters that follow each '%' with hex2int().
+ * On return *begin and *end hold the new bounds of the string.
+ * NOTE(review): nothing here verifies that the two characters after a '%' are hex digits. */
548
+static void str_hex_to_char(char** begin,const char** end)
549
+{
550
+	char* sbegin = *begin;
551
+	const char* str_end = *end;
552
+	assert(str_end>sbegin);
553
+	/* convert leading %xx*/
554
+	if (sbegin[0] == '%') {
555
+		sbegin[2] = hex2int((unsigned char*)sbegin+1); /* decoded byte overwrites the second hex digit */
556
+		sbegin += 2; /* the string now starts at the decoded byte */
557
+	}
558
+	*begin = sbegin++;
559
+	while(sbegin+3 < str_end) { /* only positions with sbegin+3 < str_end are examined */
560
+		while(sbegin+3<str_end && sbegin[0]=='%') {
561
+			const char* src = sbegin+3;
562
+			*sbegin = hex2int((unsigned char*)sbegin+1); /* decoded byte replaces the '%' */
563
+			/* move string */
564
+			memmove(sbegin+1,src,str_end-src+1);
565
+			str_end -= 2; /* the string became two characters shorter */
566
+		}
567
+		sbegin++;
568
+	}
569
+	*end = str_end;
570
+}
572
+/* deletes @what from the string @begin. 
572
+ * @what_len: length of @what, excluding the terminating \0
+ * Leading and trailing occurrences are skipped by moving *begin / *end;
+ * occurrences in between are removed by shifting the tail down with memmove().
+ * *begin and *end are updated to the new bounds; nothing is changed when the
+ * string is shorter than @what. */
573
+static void str_strip(char** begin,const char** end,const char* what,size_t what_len)
574
+{
575
+	char* sbegin = *begin;
576
+	const char* str_end = *end;
577
+	const char* str_end_what;
578
+	size_t cmp_len = what_len;
579
+	assert(str_end>sbegin);
580
+	if(str_end < sbegin + what_len)
581
+		return;
582
+	/* strip leading @what */
583
+	while(cmp_len && !strncmp(sbegin,what,cmp_len)) {
584
+		sbegin += what_len;
585
+		if(cmp_len > what_len)
586
+			cmp_len -= what_len;
587
+		else cmp_len = 0; /* NOTE(review): cmp_len starts equal to what_len, so it becomes 0 after the first match and this loop runs at most once */
588
+	}
589
+	/* strip trailing @what */
590
+	str_end_what = str_end - what_len;
591
+	while(str_end_what>sbegin && !strncmp(str_end_what,what,what_len)) {
592
+		str_end -= what_len;
593
+		str_end_what -= what_len;
594
+	}
595
+	*begin = sbegin++;
596
+	while(sbegin+what_len < str_end) {
597
+		while(sbegin+what_len<str_end && !strncmp(sbegin,what,what_len)) {
598
+			const char* src = sbegin+what_len;
599
+			/* move string */
600
+			memmove(sbegin,src,str_end-src+1);
601
+			str_end -= what_len;
602
+		}
603
+		sbegin++;
604
+	}
605
+	*end = str_end;
606
+}
608
+
609
+static const char dotnet[] = ".net";
610
+static const char adonet[] = "ado.net";
611
+static const char aspnet[] = "asp.net";
612
+static const char lt[]="&lt;";
613
+static const char gt[]="&gt;";
614
+static const size_t dotnet_len = sizeof(dotnet)-1;
615
+static const size_t adonet_len = sizeof(adonet)-1;
616
+static const size_t aspnet_len = sizeof(aspnet)-1;
617
+static const size_t lt_len = sizeof(lt)-1;
618
+static const size_t gt_len = sizeof(gt)-1;
619
+
620
+/* replace every occurence of @c in @str with @r*/
621
/* Overwrites with @r every character equal to @c in the range [str,end);
 * the character at @end itself is not touched. */
static inline void str_replace(char* str,const char* end,char c,char r)
{
	char* cursor = str;

	while(cursor<end) {
		if(*cursor==c)
			*cursor=r;
		cursor++;
	}
}
628
/* Converts the first @len characters of @str to lower case in place.
 * Each character is passed to tolower() as an unsigned char value, because
 * handing a negative plain char to the <ctype.h> functions is undefined. */
static inline void str_make_lowercase(char* str,size_t len)
{
	for(;len;str++,len--) {
		*str = (char)tolower((unsigned char)*str);
	}
}
634
+
635
/* values below 32 are replaced by 32 (the space character) */
#define fix32(x) ((x)<32 ? 32 : (x))
/* Clears the top bit of every character of the NUL-terminated string @begin;
 * any result below 32 is turned into a space. */
static inline void clear_msb(char* begin)
{
	char* p = begin;

	while(*p) {
		*p = fix32((*p)&0x7f);
		p++;
	}
}
641
+
642
+/*
643
+ * Particularly yahoo puts links like this in mails:
644
+ * http:/ /mail.yahoo.com
645
+ * So first step: delete space between / /
646
+ *
647
+ * Next there could be possible links like this:
648
+ * <a href="phishlink">w  w w . e b a y . c o m</a>
649
+ * Here we need to strip spaces to get this picked up.
650
+ *
651
+ * Next there are links like:
652
+ * <a href="www.yahoo.com">Check out yahoo.com</a>
653
+ * Here we add a ., so we get: check.out.yahoo.com (it won't trigger)
654
+ *
655
+ * Rule for adding .: if substring from right contains dot, then add dot, otherwise strip space
656
+ *
657
+ */
658
+static inline void str_fixup_spaces(char **begin,const char** end) /* applies the space rules described above to the string in [*begin,*end]; both bounds may be updated */
659
+{
660
+	char* space = strchr(*begin,' '); /* first space, or NULL when there is none */
661
+	/* strip any number of spaces after / */
662
+	while(space>*begin && space[-1]=='/' && space[0]==' ' && space<*end) {
663
+		memmove(space,space+1,*end-space+1);
664
+		(*end)--;
665
+	}
666
+
667
+	for(space = rfind(*begin,' ',*end-*begin);space && space[0]!='.' && space<*end;space++) {} /* from the last space, scan forward for a dot */
668
+	if(space && space[0]=='.')
669
+		str_replace(*begin,*end,' ','.'); /* dot after the last space: spaces become dots */
670
+	else 
671
+		str_strip(begin,end," ",1); /* otherwise the spaces are removed */
672
+}
673
+
674
+/* Normalises URL->data and replaces it with a cleaned-up copy
+ * (string_assign_dup), or with the empty "null" string when nothing usable
+ * is left or the text starts with .net / ado.net / asp.net.
+ * NOTE(review): isReal is not read anywhere in this function.
+ * allocates memory */
674
+void cleanupURL(struct string* URL,int isReal)
675
+{
676
+	char* begin = URL->data;
677
+	const char* end;
678
+	size_t len;
679
+	clear_msb(begin);
680
+/*	if(!URL->data)
681
+		return;*/
682
+	/*TODO: handle hex-encoded IPs*/
683
+	while(isspace(*begin)) begin++;
684
+	len=strlen(begin);
685
+	end = begin+len-1; /* end points at the last character, not one past it */
686
+	/*cli_dbgmsg("%d\n",end-begin);*/
687
+	if(begin>=end) { /* also true for a one-character string */
688
+		string_assign_null(URL);
689
+		return;
690
+	}
691
+	while(isspace(*end)) 
692
+		end--;
693
+	/*TODO: convert \ to /, and stuff like that*/
694
+	/* From mailscanner, my comments enclosed in {} */
695
+        if(!strncmp(begin,dotnet,dotnet_len) || !strncmp(begin,adonet,adonet_len) || !strncmp(begin,aspnet,aspnet_len))
696
+		string_assign_null(URL);
697
+	else {
698
+		size_t host_len;
699
+		char* host_begin;
700
+		str_replace(begin,end,'\\','/'); /* NOTE(review): str_replace() stops before end, so the last character is not examined */
701
+		str_strip(&begin,&end,"\"",1);
702
+		str_strip(&begin,&end,lt,lt_len);
703
+		str_strip(&begin,&end,gt,gt_len);
704
+		/* convert hostname to lowercase, but only hostname! */
705
+		host_begin = strchr(begin,':');
706
+		while(host_begin && host_begin[1]=='/') host_begin++; /* skip the slashes that follow the colon */
707
+		if(!host_begin) host_begin=begin;
708
+		else host_begin++;
709
+		host_len = strcspn(host_begin,"/?");
710
+		str_make_lowercase(host_begin,host_len);
711
+		/* convert %xx to real value */
712
+		str_hex_to_char(&begin,&end);
713
+		str_fixup_spaces(&begin,&end);
714
+		string_assign_dup(URL,begin,end+1); /* end+1 because end is inclusive here */
715
+		/*cli_dbgmsg("%p::%s\n",URL->data,URL->data);*/
716
+	}
717
+}
719
+
720
/* Placeholder: following redirects is not implemented, @URL is left untouched. */
void get_redirected_URL(struct string* URL)
{
	/*TODO: see if URL redirects sowhere, if so, then follow
	returns redirected URL*/
	(void)URL;
}
725
+
726
+static inline int is_phish_disabled(void)
727
+{
728
+	if (phish_disabled)
729
+		return 1;
730
+	else if (!is_whitelist_ok()) {
731
+		phish_disabled = 1;
732
+		return 1;
733
+	}
734
+	else return 0;
735
+}
736
+
737
+static void init_hextable(void)
738
+{
739
+	unsigned char c;
740
+	memset(hextable,0,256);
741
+	for(c='0';c<='9';c++)
742
+		hextable[c] = c-'0';
743
+	for(c='a';c<='z';c++)
744
+		hextable[c] = 10+c-'a';
745
+	for(c='A';c<='Z';c++)
746
+		hextable[c] = 10+c-'A';
747
+	hexinited=1;
748
+}
749
+
750
+int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs) /* Runs phishingCheck() on every tag in hrefs that has contents.
+ * Returns CL_VIRUS and sets *ctx->virname for the first link classified as
+ * phishing; returns 0 otherwise, or when phishing checks are disabled.
+ * m and dir are not used in this function. */
751
+{
752
+	const char src_text[]="src";
753
+	const char href_text[]="href";
754
+	const size_t href_text_len = sizeof(href_text); /* sizeof counts the terminating NUL, so the strncmp() calls below compare whole strings */
755
+	const size_t src_text_len = sizeof(src_text);
756
+	int i;
757
+	if(is_phish_disabled())
758
+		return 0;
759
+	if(!hexinited) { /* first call: one-time setup */
760
+		init_hextable(); 
761
+		atexit(phishing_done);/*TODO: replace this with a proper phishing_done call from manager.c*/
762
+	}
763
+
764
+	*ctx->virname=NULL;
765
+	for(i=0;i<hrefs->count;i++)
766
+		if(hrefs->contents[i]) {
767
+			struct url_check urls;
768
+			enum phish_status rc;
769
+			urls.flags	 = strncmp((char*)hrefs->tag[i],href_text,href_text_len)? (CL_PHISH_ALL_CHECKS&~CHECK_SSL): CL_PHISH_ALL_CHECKS; /* CHECK_SSL is kept for href tags only */
770
+			if (!(urls.flags&CHECK_IMG_URL) && !strncmp((char*)hrefs->tag[i],src_text,src_text_len))
771
+				continue;
772
+			if (ctx->options&CL_PHISH_NO_DOMAINLIST)
773
+				urls.flags &= ~DOMAINLIST_REQUIRED;
774
+			string_init_c(&urls.realLink,(char*)hrefs->value[i]);
775
+/*			if(!hrefs->contents[i]->isClosed) {
776
+				blobAddData(hrefs->contents[i],empty_string,1);
777
+				blobClose(hrefs->contents[i]);
778
+			}*/
779
+			string_init_c(&urls.displayLink,(char*)blobGetData(hrefs->contents[i]));
780
+			assert(!urls.displayLink.data[blobGetDataSize(hrefs->contents[i])-1]); /* the blob data must end in a NUL byte */
781
+/*			assert(strlen(urls.displayLink.data) < blobGetDataSize(hrefs->contents[i]));*/
782
+			urls.realLink.refcount=-1;
783
+			urls.displayLink.refcount=-1;/*don't free these, caller will free*/
784
+			if(strcmp((char*)hrefs->tag[i],"href")) { /* tag is not href: the two strings swap roles */
785
+				char *url;
786
+				url = urls.realLink.data;
787
+				urls.realLink.data = urls.displayLink.data;
788
+				urls.displayLink.data = url;
789
+			}
790
+
791
+			rc = phishingCheck(&urls);
792
+			if(phish_disabled) /* NOTE(review): returns without free_if_needed(&urls) - confirm nothing is leaked on this path */
793
+				return 0;
794
+			free_if_needed(&urls);
795
+			cli_dbgmsg("Phishing scan result:%s\n",phishing_ret_toString(rc));
796
+			switch(rc)/*TODO: support flags from ctx->options,*/
797
+				{
798
+					case CL_PHISH_CLEAN:
799
+					case CL_PHISH_CLEANUP_OK:
800
+					case CL_PHISH_HOST_OK:
801
+					case CL_PHISH_DOMAIN_OK:
802
+					case CL_PHISH_REDIR_OK:
803
+					case CL_PHISH_HOST_REDIR_OK:
804
+					case CL_PHISH_DOMAIN_REDIR_OK:
805
+					case CL_PHISH_HOST_REVERSE_OK:
806
+					case CL_PHISH_DOMAIN_REVERSE_OK:
807
+					case CL_PHISH_WHITELISTED:
808
+					case CL_PHISH_HOST_WHITELISTED:
809
+					case CL_PHISH_MAILTO_OK:
810
+					case CL_PHISH_TEXTURL:
811
+					case CL_PHISH_HOST_NOT_LISTED:
812
+					case CL_PHISH_CLEAN_CID:
813
+						continue; /* not a phish: go on with the next tag */
814
+/*						break;*/
815
+					case CL_PHISH_HEX_URL:
816
+						*ctx->virname="Phishing.Email.HexURL";
817
+						return CL_VIRUS;
818
+/*						break;*/
819
+					case CL_PHISH_NUMERIC_IP:
820
+						*ctx->virname="Phishing.Email.Cloaked.NumericIP";
821
+						return CL_VIRUS;
822
+					case CL_PHISH_CLOAKED_NULL:
823
+						*ctx->virname="Phishing.Email.Cloaked.Null";/*http://www.real.com%01%00@www.evil.com*/
824
+						return CL_VIRUS;
825
+					case CL_PHISH_SSL_SPOOF:
826
+						*ctx->virname="Phishing.Email.SSL-Spoof";
827
+						return CL_VIRUS;
828
+					case CL_PHISH_CLOAKED_UIU:
829
+						*ctx->virname="Phishing.Email.Cloaked.Username";/*http://www.ebay.com@www.evil.com*/
830
+						return CL_VIRUS;
831
+					case CL_PHISH_NOMATCH:
832
+					default:
833
+						*ctx->virname="Phishing.Email";
834
+						return CL_VIRUS;
835
+				}
836
+		}
837
+		else
838
+			if(strcmp((char*)hrefs->tag[i],"href")) /* NOTE(review): strcmp() is non-zero when the tag is NOT href, which seems at odds with the message below */
839
+					cli_dbgmsg("PH:href with no contents?\n");
840
+	return 0;/*texturlfound?CL_VIRUS:0;*/
841
+}
842
+
843
/* Returns a newly allocated string holding the concatenation a+b+c, or NULL
 * when the allocation fails.  The caller owns the result and must free() it. */
static char* str_compose(const char* a,const char* b,const char* c)
{
	const size_t a_len = strlen(a);
	const size_t b_len = strlen(b);
	const size_t c_len = strlen(c);
	char* concated = malloc(a_len+b_len+c_len+1);

	if(!concated)
		return NULL;
	memcpy(concated,a,a_len);
	memcpy(concated+a_len,b,b_len);
	/* the last piece is copied together with its terminating NUL */
	memcpy(concated+a_len+b_len,c,c_len+1);
	return concated;
}
856
+
857
+/*static const char* url_regex="^ *([[:alnum:]%_-]+:(//)?)?([[:alnum:]%_-]@)*[[:alnum:]%_-]+\\.([[:alnum:]%_-]+\\.)*[[:alnum:]_%-]+(/[[:alnum:];:@$=?&/.,%_-]+) *$";*/
858
+/* for urls, including mailto: urls, and (broken) http:www... style urls*/
859
+/* refer to: http://www.w3.org/Addressing/URL/5_URI_BNF.html 
860
+ * Modifications: don't allow empty domains/subdomains, such as www..com <- that is no url
861
+ * So the 'safe' char class has been split up
862
+ * */
863
+/* character classes */
864
+#define URI_alpha       "a-zA-Z"
865
+#define URI_digit       "0-9"
866
+#define URI_safe_nodot  "-$_@&"
867
+#define URI_safe        "-$_@.&"
868
+#define URI_extra       "!*\"'(),"
869
+#define URI_reserved    "=;/#?: "
870
+#define URI_national    "{}|[]\\^~"
871
+#define URI_punctuation "<>"
872
+
873
/* one hexadecimal digit; the upper-case range ends at F (it used to read A-f) */
#define URI_hex         "[0-9a-fA-F]"
874
+#define URI_escape      "%"URI_hex"{2}"
875
+#define URI_xalpha "([" URI_safe URI_alpha URI_digit  URI_extra "]|"URI_escape")" /* URI_safe has to be first, because it contains - */
876
+#define URI_xalpha_nodot "([" URI_safe_nodot URI_alpha URI_digit URI_extra "]|"URI_escape")"
877
+ 
878
+#define URI_xalphas URI_xalpha"+"
879
+#define URI_xalphas_nodot URI_xalpha_nodot"*"
880
+
881
+#define URI_ialpha  "["URI_alpha"]"URI_xalphas_nodot""
882
+#define URI_xpalpha URI_xalpha"|\\+"
883
+#define URI_xpalpha_nodot URI_xalpha_nodot"|\\+"
884
+#define URI_xpalphas "("URI_xpalpha")+"
885
+#define URI_xpalphas_nodot "("URI_xpalpha_nodot")+"
886
+
887
+#define URI_scheme URI_ialpha
888
+#define URI_tld iana_tld
889
+#define URI_path1 URI_xpalphas_nodot"\\.("URI_xpalphas_nodot"\\.)*"
890
+#define URI_path2 URI_tld
891
+#define URI_path3 "(/("URI_xpalphas"/?)*)?"
892
+
893
+#define URI_search "("URI_xalphas"\\+)*"
894
+#define URI_fragmentid URI_xalphas
895
+
896
+#define URI_IP_digits "["URI_digit"]{1,3}"
897
+#define URI_numeric_path URI_IP_digits"(\\."URI_IP_digits"){3}(:"URI_xpalphas_nodot")?(/("URI_xpalphas"/?)*)?"
898
+#define URI_numeric_URI "("URI_scheme":(//)?)?"URI_numeric_path"(\\?" URI_search")?"
899
+#define URI_numeric_fragmentaddress URI_numeric_URI"(#"URI_fragmentid")?"
900
+
901
+#define URI_URI1 "("URI_scheme":(//)?)?"URI_path1
902
+#define URI_URI2 URI_path2
903
+#define URI_URI3 URI_path3"(\\?" URI_search")?"
904
+
905
+#define URI_fragmentaddress1 URI_URI1
906
+#define URI_fragmentaddress2 URI_URI2
907
+#define URI_fragmentaddress3 URI_URI3"(#"URI_fragmentid")?"
908
+
909
+#define URI_CHECK_PROTOCOLS "(http|https|ftp)://.+"
910
+
911
/* Warning: take care when modifying this regex; it has been tweaked and tuned, please don't break it.
 * The pattern is split into fragmentaddress1, 2 and 3 to work around the ISO limit of 509 bytes
 * for a single string constant. */
static char* url_regex = NULL;/* composed by isURL() on first use, freed by phishing_done() */
static const char numeric_url_regex[] = "^ *"URI_numeric_fragmentaddress" *$";
915
+/*
916
+ * Only those URLs are identified as URLs for which phishing detection can be performed.
917
+ * This means that no attempt is made to properly recognize 'cid:' URLs
918
+ */
919
+int isURL(const char* URL)
920
+{
921
+	if(!preg) {
922
+		url_regex = str_compose("^ *("URI_fragmentaddress1,URI_fragmentaddress2,URI_fragmentaddress3"|"URI_CHECK_PROTOCOLS") *$");
923
+		if(build_regex(&preg,url_regex,1))
924
+			return -1;
925
+	}
926
+	return URL ? !regexec(preg,URL,0,NULL,0) : 0; 
927
+}
928
+
929
+int isNumericURL(const char* URL)
930
+{
931
+	if(!preg_numeric) {
932
+		if(build_regex(&preg_numeric,numeric_url_regex,1))
933
+			return -1;
934
+	}
935
+	return URL ? !regexec(preg_numeric,URL,0,NULL,0) : 0;
936
+}
937
+
938
+/* Cleans up @urls
939
+ * If URLs are identical after cleanup it will return CL_PHISH_CLEANUP_OK.
940
+ * */
941
+enum phish_status cleanupURLs(struct url_check* urls)
942
+{
943
+	if(urls->flags&CLEANUP_URL) {
944
+		cleanupURL(&urls->realLink,1);
945
+		cleanupURL(&urls->displayLink,0);
946
+		if(!urls->displayLink.data || !urls->realLink.data)
947
+			return CL_PHISH_NODECISION;
948
+		if(!strcmp(urls->realLink.data,urls->displayLink.data))
949
+			return CL_PHISH_CLEANUP_OK;
950
+	}
951
+	return CL_PHISH_NODECISION;
952
+}
953
+
954
+
955
+enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy)
956
+{
957
+	struct string* host = isReal ? &host_url->realLink : &host_url->displayLink;
958
+	get_host(host,isReal ? url->realLink.data : url->displayLink.data, isReal,phishy);
959
+	if(!host->data)
960
+		return CL_PHISH_CLEANUP_OK;
961
+	if(*phishy&REAL_IS_MAILTO)
962
+		return CL_PHISH_MAILTO_OK;
963
+	if(strchr(host->data,' ')) {
964
+		string_free(host);
965
+		return CL_PHISH_TEXTURL;
966
+	}
967
+	if(isReal && (!strncmp(host->data,"0x",2) || !strncmp(host->data,"0X",2))) {
968
+		string_free(host);
969
+		return CL_PHISH_HEX_URL;
970
+	}
971
+	if(isReal && host->data[0]=='\0')
972
+		return CL_PHISH_CLEAN;/* link without domain, such as: href="/isapi.dll?... */
973
+	if(isNumeric(host->data)) {
974
+		*phishy |= PHISHY_NUMERIC_IP;
975
+/*		if(url->flags&DO_REVERSE_LOOKUP)
976
+			reverse_lookup(host_url,isReal);*/
977
+	}
978
+	return CL_PHISH_NODECISION;
979
+}
980
+	
981
+
982
+void url_get_domain(struct url_check* url,struct url_check* domains)
983
+{		
984
+	get_domain(&domains->realLink, &url->realLink);
985
+	get_domain(&domains->displayLink, &url->displayLink);
986
+	domains->flags	     = url->flags;
987
+}
988
+
989
+enum phish_status phishy_map(int phishy,enum phish_status fallback)
990
+{
991
+	if(phishy&PHISHY_USERNAME_IN_URL)
992
+		return CL_PHISH_CLOAKED_UIU;
993
+	else if(phishy&PHISHY_NUMERIC_IP)
994
+		return CL_PHISH_NUMERIC_IP;
995
+	else
996
+		return fallback;
997
+}
998
+
999
/*
 * Tells whether @url consists mostly of numeric character references ("&#...;").
 *
 * Every span that starts with "&#" and runs up to the next ';' is counted as
 * encoded, whatever is in between. The URL is reported as encoded when more than
 * 70% of its characters belong to such spans.
 *
 * The length has to be compared with the number of encoded characters, not with the
 * number of references: a reference is at least 3 characters long, so a count of
 * references can never exceed 70% of the length.
 *
 * Returns 1 if encoded, 0 otherwise (also for a NULL or empty @url).
 */
int isEncoded(const char* url)
{
	const char* cursor = url;
	size_t encoded = 0;

	if(!url)
		return 0;

	for(;;) {
		const char* begin = strstr(cursor,"&#");
		const char* end;

		if(!begin)
			break;
		end = strchr(begin,';');
		if(!end)
			break;/* unterminated reference, doesn't count */
		encoded += (size_t)(end - begin) + 1;
		cursor = end + 1;
	}
	return (encoded > strlen(url)*7/10);/*more than 70% made up of &#;*/
}
1012
+
1013
/*
 * Releases the compiled regex *@p points to (regfree() followed by free())
 * and resets *@p to NULL. A NULL @p or a NULL *@p is silently ignored.
 */
static void free_regex(regex_t** p)
{
	regex_t* compiled;

	if(p == NULL)
		return;

	compiled = *p;
	if(compiled == NULL)
		return;

	regfree(compiled);
	free(compiled);
	*p = NULL;
}
1023
+
1024
+void phishing_done(void)
1025
+{
1026
+	free_regex(&preg);
1027
+	free_regex(&preg_cctld);
1028
+	free_regex(&preg_tld);
1029
+	free_regex(&preg_numeric);
1030
+	whitelist_done();
1031
+	domainlist_done();
1032
+	if(url_regex)
1033
+		free(url_regex);
1034
+}
1035
+
1036
+int whitelist_check(struct url_check* urls,int hostOnly)
1037
+{
1038
+	return whitelist_match(urls->realLink.data,urls->displayLink.data,hostOnly);
1039
+}
1040
+
1041
+/* urls can't contain null pointer, caller must ensure this */
1042
+enum phish_status phishingCheck(struct url_check* urls)
1043
+{
1044
+	struct url_check host_url;
1045
+	const char cid[] = "cid:";
1046
+	const size_t cid_len = sizeof(cid)-1;
1047
+	enum phish_status rc=CL_PHISH_NODECISION;
1048
+	int phishy=0;
1049
+	if(!urls->realLink.data)
1050
+		return CL_PHISH_CLEAN;
1051
+	cli_dbgmsg("\nPH:Checking url %s->%s \n",urls->realLink.data,urls->displayLink.data);
1052
+
1053
+	if(!strcmp(urls->realLink.data,urls->displayLink.data))
1054
+		return CL_PHISH_CLEAN;/* displayed and real URL are identical -> clean */
1055
+
1056
+	if((rc = cleanupURLs(urls))) {
1057
+		assert(!isPhishing(rc));/* not allowed to decide this is phishing */
1058
+		return rc;/* URLs identical after cleanup */
1059
+	}
1060
+
1061
+	if(whitelist_check(urls,0))
1062
+		return CL_PHISH_WHITELISTED;/* if url is whitelist don't perform further checks */
1063
+
1064
+	if(urls->flags&DOMAINLIST_REQUIRED && domainlist_match(urls->realLink.data,urls->displayLink.data,0,&urls->flags))
1065
+		phishy |= DOMAIN_LISTED;
1066
+	else {
1067
+		/* although entire url is not listed, the host might be,
1068
+		 * so defer phishing decisions till we know if host is listed*/
1069
+	}
1070
+
1071
+	url_check_init(&host_url);
1072
+
1073
+	if((rc = url_get_host(urls,&host_url,DOMAIN_DISPLAY,&phishy))) {
1074
+		free_if_needed(&host_url);
1075
+		assert(!isPhishing(rc));
1076
+		return rc;
1077
+	}
1078
+
1079
+	if(whitelist_check(&host_url,1)) {
1080
+		free_if_needed(&host_url);
1081
+		return CL_PHISH_HOST_WHITELISTED;
1082
+	}
1083
+
1084
+	if(urls->flags&DOMAINLIST_REQUIRED) {
1085
+		if(!(phishy&DOMAIN_LISTED)) {
1086
+			if(domainlist_match(urls->displayLink.data,urls->realLink.data,1,&urls->flags))
1087
+				phishy |= DOMAIN_LISTED;
1088
+			else {
1089
+				free_if_needed(&host_url);
1090
+				return CL_PHISH_HOST_NOT_LISTED;
1091
+			}
1092
+		}
1093
+	}
1094
+
1095
+	if(urls->flags&CHECK_CLOAKING) {
1096
+		/*Checks if URL is cloaked.
1097
+		Should we check if it containts another http://, https://? 
1098
+		No because we might get false positives from redirect services.*/
1099
+		if(strstr(urls->realLink.data,"%00")) {
1100
+			free_if_needed(&host_url);
1101
+			return CL_PHISH_CLOAKED_NULL;
1102
+		}
1103
+		if(isEncoded(urls->displayLink.data)) {
1104
+			free_if_needed(&host_url);
1105
+			return CL_PHISH_HEX_URL;
1106
+		}
1107
+	}
1108
+
1109
+	if(urls->displayLink.data[0]=='\0') {
1110
+		free_if_needed(&host_url);
1111
+		return CL_PHISH_CLEAN;
1112
+	}
1113
+	
1114
+	if(urls->flags&CHECK_SSL && isSSL(urls->displayLink.data) && !isSSL(urls->realLink.data)) {
1115
+		free_if_needed(&host_url);
1116
+		return CL_PHISH_SSL_SPOOF;
1117
+	}
1118
+
1119
+	if((rc = url_get_host(urls,&host_url,DOMAIN_REAL,&phishy))) 
1120
+	{	
1121
+		free_if_needed(&host_url);
1122
+		return rc;
1123
+	}
1124
+
1125
+	if(!strncmp(urls->displayLink.data,cid,cid_len))/* cid: image */{
1126
+		free_if_needed(&host_url);
1127
+		return CL_PHISH_CLEAN_CID;
1128
+	}
1129
+
1130
+	if(!isURL(urls->displayLink.data) && 
1131
+			( (phishy&PHISHY_NUMERIC_IP && !isNumericURL(urls->displayLink.data)) ||
1132
+			  !(phishy&PHISHY_NUMERIC_IP))) {
1133
+		free_if_needed(&host_url);
1134
+		return CL_PHISH_TEXTURL;
1135
+	}
1136
+
1137
+	if(urls->flags&HOST_SUFFICIENT) {
1138
+		if(!strcmp(urls->realLink.data,urls->displayLink.data)) {
1139
+			free_if_needed(&host_url);
1140
+			return CL_PHISH_HOST_OK;
1141
+		}
1142
+
1143
+
1144
+		if(urls->flags&DOMAIN_SUFFICIENT) {
1145
+			struct url_check domain_url;
1146
+			url_check_init(&domain_url);
1147
+			url_get_domain(&host_url,&domain_url);
1148
+			if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
1149
+				free_if_needed(&host_url);
1150
+				free_if_needed(&domain_url);
1151
+				return CL_PHISH_DOMAIN_OK;
1152
+			}
1153
+			free_if_needed(&domain_url);
1154
+		}
1155
+
1156
+		/*if(urls->flags&CHECK_REDIR) { 
1157
+			//see where the realLink redirects, and compare that with the displayed Link
1158
+			const uchar* redirectedURL  = getRedirectedURL(urls->realLink);
1159
+			if(urls->needsfree)
1160
+				free(urls->realLink);
1161
+			urls->realLink = redirectedURL;
1162
+
1163
+			if(!strcmp(urls->realLink,urls->displayLink))
1164
+				return CL_PHISH_REDIR_OK;
1165
+
1166
+			if(urls->flags&HOST_SUFFICIENT) {
1167
+				if(rc = url_get_host(urls,&host_url,DOMAIN_REAL))
1168
+				if(!strcmp(host_url.realLink,host_url.displayLink)) {
1169
+					free_if_needed(&host_url);
1170
+					return CL_PHISH_HOST_REDIR_OK;
1171
+				}
1172
+				if(urls->flags&DOMAIN_SUFFICIENT) {
1173
+					struct url_check domain_url;
1174
+					url_get_domain(&host_url,&domain_url);
1175
+					if(!strcmp(domain_url.realLink,domain_url.displayLink)) {
1176
+						free_if_needed(&host_url);
1177
+						free_if_needed(&domain_url);
1178
+						return CL_PHISH_DOMAIN_REDIR_OK;
1179
+					}
1180
+				}
1181
+			}//HOST_SUFFICIENT&CHECK_REDIR
1182
+		}
1183
+		free_if_needed(&host_url);*/
1184
+	/*	if(urls->flags&CHECK_DOMAIN_REVERSE) {
1185
+			//do a DNS lookup of the domain, and see what IP it corresponds to
1186
+			//then do a reverse lookup on the IP, and see what domain you get
1187
+			//There are some corporate signatures that mix different domains belonging to same company
1188
+			struct url_check domain_url;
1189
+			url_check_init(&domain_url);
1190
+			if(!dns_to_ip_and_reverse(&host_url,DOMAIN_DISPLAY)) {
1191
+				if(!strcmp(host_url.realLink.data,host_url.displayLink.data)) {
1192
+					free_if_needed(&host_url);
1193
+					return CL_PHISH_HOST_REVERSE_OK;
1194
+				}
1195
+				if(urls->flags&DOMAIN_SUFFICIENT) {
1196
+					url_get_domain(&host_url,&domain_url);
1197
+					if(!strcmp(domain_url.realLink.data,domain_url.displayLink.data)) {
1198
+						free_if_needed(&host_url);
1199
+						free_if_needed(&domain_url);
1200
+						return CL_PHISH_DOMAIN_REVERSE_OK;
1201
+					}
1202
+					free_if_needed(&domain_url);
1203
+				}
1204
+			}
1205
+		}*/
1206
+		free_if_needed(&host_url);
1207
+	}/*HOST_SUFFICIENT*/
1208
+	/*we failed to find a reason why the 2 URLs are different, this is definetely phishing*/
1209
+	return phishy_map(phishy,CL_PHISH_NOMATCH);
1210
+}
1211
+
1212
+const char* phishing_ret_toString(enum phish_status rc)
1213
+{
1214
+	switch(rc) {
1215
+		case CL_PHISH_CLEAN:
1216
+			return "Clean";
1217
+		case CL_PHISH_CLEANUP_OK:
1218
+			return "URLs match after cleanup";
1219
+		case CL_PHISH_WHITELISTED:
1220
+			return "URL is whitelisted";
1221
+		case CL_PHISH_HOST_WHITELISTED:
1222
+			return "host part of URL is whitelist";
1223
+		case CL_PHISH_HOST_OK:
1224
+			return "Hosts match";
1225
+		case CL_PHISH_DOMAIN_OK:
1226
+			return "Domains match";
1227
+		case CL_PHISH_REDIR_OK:
1228
+			return "After redirecting realURL, they match";
1229
+		case CL_PHISH_HOST_REDIR_OK:
1230
+			return "After redirecting realURL, hosts match";
1231
+		case CL_PHISH_DOMAIN_REDIR_OK:
1232
+			return "After redirecting the domains match";
1233
+		case CL_PHISH_MAILTO_OK:
1234
+			return "URL is mailto";
1235
+		case CL_PHISH_NUMERIC_IP:
1236
+			return "IP address encountered in hostname";
1237
+		case CL_PHISH_TEXTURL:
1238
+			return "Displayed link is not an URL, can't check if phishing or not";
1239
+		case CL_PHISH_CLOAKED_NULL:
1240
+			return "Link URL is cloaked (null byte %00)";
1241
+		case CL_PHISH_CLOAKED_UIU:
1242
+			return "Link URL contains username, and real<->displayed hosts don't match.";
1243
+			/*username is a legit domain, and after the @ comes the evil one*/
1244
+		case CL_PHISH_SSL_SPOOF:
1245
+			return "Visible links is SSL, real link is not";
1246
+		case CL_PHISH_NOMATCH:
1247
+			return "URLs are way too different";
1248
+		case CL_PHISH_HOST_NOT_LISTED:
1249
+			return "Host not listed in .pdb -> not checked";
1250
+		case CL_PHISH_CLEAN_CID:
1251
+			return "Embedded image in mail -> clean";
1252
+		default:
1253
+			return "Unknown return code";
1254
+	}
1255
+}
1256
+
1257
+#endif
0 1258
new file mode 100644
... ...
@@ -0,0 +1,130 @@
0
+/*
1

                
2
+ *
3
+ *  This program is free software; you can redistribute it and/or modify
4
+ *  it under the terms of the GNU General Public License as published by
5
+ *  the Free Software Foundation; either version 2 of the License, or
6
+ *  (at your option) any later version.
7
+ *
8
+ *  This program is distributed in the hope that it will be useful,
9
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+ *  GNU General Public License for more details.
12
+ *
13
+ *  You should have received a copy of the GNU General Public License
14
+ *  along with this program; if not, write to the Free Software
15
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ *  MA 02110-1301, USA.
17
+ */
18
+
19
+#ifdef CL_EXPERIMENTAL
20
+
21
+#ifndef _PHISH_CHECK_H
22
+#define _PHISH_CHECK_H
23
+
24
+
25
/* value of the first status code other than CL_PHISH_NODECISION */
#define CL_PHISH_BASE 100
/*
 * Outcome of a phishing check.
 * CL_PHISH_NODECISION is 0, so "no verdict yet" tests as false; all other codes
 * are numbered consecutively starting at CL_PHISH_BASE.
 */
enum phish_status {CL_PHISH_NODECISION=0,CL_PHISH_CLEAN=CL_PHISH_BASE, CL_PHISH_CLEANUP_OK,CL_PHISH_HOST_OK, CL_PHISH_DOMAIN_OK,
	CL_PHISH_HOST_NOT_LISTED,
	CL_PHISH_REDIR_OK, CL_PHISH_HOST_REDIR_OK, CL_PHISH_DOMAIN_REDIR_OK,
	CL_PHISH_HOST_REVERSE_OK,CL_PHISH_DOMAIN_REVERSE_OK,
	CL_PHISH_WHITELISTED,CL_PHISH_HOST_WHITELISTED,
	CL_PHISH_CLEAN_CID,
	CL_PHISH_TEXTURL, CL_PHISH_MAILTO_OK,
	CL_PHISH_CLOAKED_UIU, CL_PHISH_NUMERIC_IP,CL_PHISH_HEX_URL,CL_PHISH_CLOAKED_NULL,CL_PHISH_SSL_SPOOF, CL_PHISH_NOMATCH};
34
+
35
/* Bits for url_check.flags */
#define HOST_SUFFICIENT   1
/* includes the HOST_SUFFICIENT bit, so (flags & DOMAIN_SUFFICIENT) is non-zero for HOST_SUFFICIENT alone */
#define DOMAIN_SUFFICIENT (HOST_SUFFICIENT | 2)
#define DO_REVERSE_LOOKUP 4
#define CHECK_REDIR       8
#define CHECK_SSL         16
#define CHECK_CLOAKING    32
#define CLEANUP_URL       64
#define CHECK_DOMAIN_REVERSE 128
#define CHECK_IMG_URL        256
#define DOMAINLIST_REQUIRED  512
/* img checking disabled by default */
/* NOTE(review): CHECK_IMG_URL is nevertheless part of CL_PHISH_ALL_CHECKS below - confirm which one is intended */


#define CL_PHISH_ALL_CHECKS (CLEANUP_URL|DOMAIN_SUFFICIENT|CHECK_SSL|CHECK_CLOAKING|DOMAINLIST_REQUIRED|CHECK_IMG_URL)
49
+
50
/* String handle used for the links; see the string_*() functions declared below.
 * NOTE(review): refcount/ref look like reference counting towards a parent string - confirm in the implementation */
struct string {
	int refcount;
	struct string* ref;
	char* data;/* the characters; NULL when there is no string */
};

/* A pair of links to compare, plus the checks to perform on them */
struct url_check {
	struct string realLink;/* where the link really points to */
	struct string displayLink;/* what is shown to the user */
	unsigned short       flags;/* combination of the flag bits defined above (HOST_SUFFICIENT, ...) */
};
61
+
62
+int phishingScan(message* m,const char* dir,cli_ctx* ctx,tag_arguments_t* hrefs);
63
+enum phish_status phishingCheck(struct url_check* urls);
64
+
65
+int whitelist_check(struct url_check* urls,int hostOnly);
66
+void url_check_init(struct url_check* urls);
67
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy);
68
+void string_free(struct string* str);
69
+void string_assign(struct string* dest,struct string* src);
70
+void string_assign_c(struct string* dest,char* data);
71
+void string_init_c(struct string* dest,char* data);
72
+void string_assign_dup(struct string* dest,const char* start,const char* end);
73
+void string_assign_null(struct string* dest);
74
+void string_assign_ref(struct string* dest,struct string* ref,char* data);
75
+void free_if_needed(struct url_check* url);
76
+void get_host(struct string* dest,const char* URL,int isReal,int* phishy);
77
+int isCountryCode(const char* str);
78
+int isTLD(const char* str,int len);
79
+char* rfind(char* start,char c,size_t len);
80
+void get_domain(struct string* dest,struct string* host);
81
+int ip_reverse(struct url_check* urls,int isReal);
82
+void reverse_lookup(struct url_check* url,int isReal);
83
+int isNumeric(const char* host);
84
+int isSSL(const char* URL);
85
+void cleanupURL(struct string* URL,int isReal);
86
+void get_redirected_URL(struct string* URL);
87
+int isURL(const char* URL);
88
+enum phish_status cleanupURLs(struct url_check* urls);
89
+int isNumericURL(const char* URL);
90
+enum phish_status url_get_host(struct url_check* url,struct url_check* host_url,int isReal,int* phishy);
91
+void url_get_domain(struct url_check* url,struct url_check* domains);
92
+enum phish_status phishy_map(int phishy,enum phish_status fallback);
93
+int isEncoded(const char* url);
94
+void phishing_done(void);
95
+
96
+static inline int isPhishing(enum phish_status rc)
97
+{
98
+	switch(rc) {
99
+		case CL_PHISH_CLEAN:
100
+		case CL_PHISH_CLEANUP_OK:
101
+		case CL_PHISH_WHITELISTED:
102
+		case CL_PHISH_HOST_WHITELISTED:
103
+		case CL_PHISH_HOST_OK:
104
+		case CL_PHISH_DOMAIN_OK:
105
+		case CL_PHISH_REDIR_OK:
106
+		case CL_PHISH_HOST_REDIR_OK:
107
+		case CL_PHISH_DOMAIN_REDIR_OK:
108
+		case CL_PHISH_HOST_REVERSE_OK:
109
+		case CL_PHISH_DOMAIN_REVERSE_OK:
110
+		case CL_PHISH_MAILTO_OK:
111
+		case CL_PHISH_TEXTURL:
112
+		case CL_PHISH_HOST_NOT_LISTED:
113
+		case CL_PHISH_CLEAN_CID:
114
+			return 0;
115
+		case CL_PHISH_HEX_URL:
116
+		case CL_PHISH_CLOAKED_NULL:
117
+		case CL_PHISH_SSL_SPOOF:
118
+		case CL_PHISH_CLOAKED_UIU:
119
+		case CL_PHISH_NUMERIC_IP:
120
+		case CL_PHISH_NOMATCH:
121
+			return 1;
122
+		default:
123
+			return 1;
124
+	}
125
+}
126
+const char* phishing_ret_toString(enum phish_status rc);
127
+#endif
128
+
129
+#endif
... ...
@@ -42,10 +42,8 @@
42 42
 #include "defaults.h"
43 43
 
44 44
 #ifdef CL_EXPERIMENTAL
45
-/*
46 45
 #include "phish_whitelist.h"
47 46
 #include "phish_domaincheck_db.h"
48
-*/
49 47
 #endif
50 48
 
51 49
 
... ...
@@ -1094,7 +1092,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in
1094 1094
 #endif
1095 1095
 	    skipped = 1;
1096 1096
 #ifdef CL_EXPERIMENTAL
1097
-/*
1098 1097
     } else if(cli_strbcasestr(filename, ".wdb")) {
1099 1098
 	if(!(options & CL_SCAN_NOPHISHING))
1100 1099
 	    ret = cli_loadwdb(fd, options);
... ...
@@ -1105,7 +1102,6 @@ static int cli_load(const char *filename, struct cl_engine **engine, unsigned in
1105 1105
 	    ret = cli_loadpdb(fd, options);
1106 1106
 	else
1107 1107
 	    skipped = 1;
1108
-*/
1109 1108
 #endif
1110 1109
     } else {
1111 1110
 	cli_dbgmsg("cli_load: unknown extension - assuming old database format\n");
... ...
@@ -1172,10 +1168,8 @@ static int cli_loaddbdir(const char *dirname, struct cl_engine **engine, unsigne
1172 1172
 	     cli_strbcasestr(dent->d_name, ".zmd")  ||
1173 1173
 	     cli_strbcasestr(dent->d_name, ".rmd")  ||
1174 1174
 #ifdef CL_EXPERIMENTAL
1175
-/*
1176 1175
 	     cli_strbcasestr(dent->d_name, ".pdb")  ||
1177 1176
 	     cli_strbcasestr(dent->d_name, ".wdb")  ||
1178
-*/
1179 1177
 #endif
1180 1178
 	     cli_strbcasestr(dent->d_name, ".hw")  ||
1181 1179
 	     cli_strbcasestr(dent->d_name, ".inc")  ||
... ...
@@ -1294,10 +1288,8 @@ int cl_statinidir(const char *dirname, struct cl_stat *dbstat)
1294 1294
 	    cli_strbcasestr(dent->d_name, ".zmd")  || 
1295 1295
 	    cli_strbcasestr(dent->d_name, ".rmd")  || 
1296 1296
 #ifdef CL_EXPERIMENTAL
1297
-/*
1298 1297
 	    cli_strbcasestr(dent->d_name, ".pdb")  ||
1299 1298
 	    cli_strbcasestr(dent->d_name, ".wdb")  ||
1300
-*/
1301 1299
 #endif
1302 1300
 	    cli_strbcasestr(dent->d_name, ".hw")   ||
1303 1301
 	    cli_strbcasestr(dent->d_name, ".inc")   ||
... ...
@@ -1374,10 +1366,8 @@ int cl_statchkdir(const struct cl_stat *dbstat)
1374 1374
 	    cli_strbcasestr(dent->d_name, ".zmd")  || 
1375 1375
 	    cli_strbcasestr(dent->d_name, ".rmd")  || 
1376 1376
 #ifdef CL_EXPERIMENTAL
1377
-/*
1378 1377
 	    cli_strbcasestr(dent->d_name, ".pdb")  ||
1379 1378
 	    cli_strbcasestr(dent->d_name, ".wdb")  ||
1380
-*/
1381 1379
 #endif
1382 1380
 	    cli_strbcasestr(dent->d_name, ".hw")   ||
1383 1381
 	    cli_strbcasestr(dent->d_name, ".inc")   ||
1384 1382
new file mode 100644
... ...
@@ -0,0 +1,1521 @@
0
+/*
1
+ *  Match a string against a list of patterns/regexes.
2
+ *
3

                
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ *  $Log: regex_list.c,v $
21
+ *  Revision 1.1  2006/09/12 19:38:39  acab
22
+ *  Phishing module merge - libclamav
23
+ *
24
+ *  Revision 1.13  2006/09/11 19:25:08  edwin
25
+ *  Non-printable characters in regex (although they are invalid inside an url, added some support for it).
26
+ *
27
+ *  Revision 1.12  2006/08/28 08:43:06  edwin
28
+ *  Fixed a few minor leaks.
29
+ *  Valgrind now says:"All heap blocks were freed -- no leaks are possible"
30
+ *
31
+ *  Revision 1.11  2006/08/20 21:18:11  edwin
32
+ *  Added the script used to generate iana_tld.sh
33
+ *  Added checks for phish_domaincheck_db
34
+ *  Added phishing module design document from wiki (as discussed with aCaB).
35
+ *  Updated .wdb/.pdb format documentation (in regex_list.c)
36
+ *  Fixed some memory leaks in regex_list.c
37
+ *  IOW: cleanups before the deadline.
38
+ *  I consider my module to be ready for evaluation now.
39
+ *
40
+ *  Revision 1.10  2006/08/20 19:42:02  edwin
41
+ *  Fix custom character class, and generic regex handling.
42
+ *
43
+ *  Revision 1.9  2006/08/19 21:08:47  edwin
44
+ *  Fixed:Forgot to add form tag handling when it contains images.
45
+ *  Various fixes to get rid of gcc warnings.
46
+ *
47
+ *  Revision 1.8  2006/08/19 09:26:51  edwin
48
+ *  regex_list.c: Fixed regex alternatives handling (bug discovered with autotests).
49
+ *  And forgot to commit manager.c last time.
50
+ *
51
+ *  Revision 1.7  2006/08/17 20:31:43  edwin
52
+ *  Disable extracting hrefs from mails in mbox, if: we aren't scanning for phish, and mailfollowurls is off.
53
+ *  Fix a still reachable leak. Remove unneeded build_regex_list export.
54
+ *
55
+ *  Revision 1.6  2006/08/12 14:35:34  edwin
56
+ *  Fix some compiler warnings.
57
+ *  Fix an assertion failure in regex_list.
58
+ *  Interpret display links that start with http|https|ftp, always as an URL.
59
+ *
60
+ *  Revision 1.5  2006/08/06 20:27:07  edwin
61
+ *  New option to enable phish scan for all domains (disabled by default).
62
+ *  You will now have to run clamscan --phish-scan-alldomains to have any phishes detected.
63
+ *  Updated phishcheck control flow to better incorporate the domainlist.
64
+ *  Updated manpage with new options.
65
+ *
66
+ *  TODO:there is a still-reachable leak in regex_list.c
67
+ *
68
+ *  Revision 1.4  2006/08/01 20:19:15  edwin
69
+ *  Integrate domainlist check into phishcheck. Warning: enabled by default.
70
+ *  Regex bracket handling update.
71
+ *  Better regex paranthesized & alternate expression handling.
72
+ *
73
+ *  Revision 1.3  2006/07/31 20:12:30  edwin
74
+ *  Preliminary support for domain databases (domains to check by phishmodule)
75
+ *  Better memory allocation failure handling in regex_list
76
+ *
77
+ */
78
+
79
+#if HAVE_CONFIG_H
80
+#include "clamav-config.h"
81
+#endif
82
+
83
+#ifdef CL_EXPERIMENTAL
84
+
85
+#ifndef CL_DEBUG
86
+#define NDEBUG
87
+#endif
88
+
89
+#ifdef CL_THREAD_SAFE
90
+#ifndef _REENTRANT
91
+#define _REENTRANT
92
+#endif
93
+#endif
94
+
95
+#include <stdio.h>
96
+#include <stdlib.h>
97
+#include <errno.h>
98
+#include <assert.h>
99
+#include <string.h>
100
+#include <strings.h>
101
+#include <ctype.h>
102
+
103
+#include <limits.h>
104
+#include <sys/types.h>
105
+
106
+/*#define USE_PCRE*/
107
+#include <regex.h>
108
+
109
+#if defined(HAVE_READDIR_R_3) || defined(HAVE_READDIR_R_2)
110
+#include <stddef.h>
111
+#endif
112
+
113
+#include "clamav.h"
114
+#include "others.h"
115
+#include "defaults.h"
116
+#include "str.h"
117
+#include "filetypes.h"
118
+#include "mbox.h"
119
+#include "regex_list.h"
120
+#include "matcher-ac.h"
121
+
122
+
123
+/*Tree*/
124
enum token_op_t {OP_CHAR,OP_STDCLASS,OP_CUSTOMCLASS,OP_DOT,OP_LEAF,OP_ROOT,OP_PARCLOSE};
typedef char* char_bitmap_p;
/*
 *
 * OP_CHAR: 1 character, c = character
 * complex stuff:
 * OP_STDCLASS: standard character class, c = char class, class: 1<<(index into std_class of class name)
 * OP_CUSTOMCLASS: custom character class, first pointer in ptr array is a pointer to the bitmap table for this class
 * OP_DOT: single . matching any character except \n
 * OP_LEAF: this is a leaf node, reinterpret structure
 */
struct tree_node {
	enum token_op_t op;
	unsigned char c;
	char alternatives;/* number of (non-regex) children of node, i.e. sizeof(children)*/
	char listend;/* no more siblings, next pointer is pointer to parent*/
	struct tree_node* next;/* next regex/complex sibling, or parent, if no more siblings , can't be NULL except for root node*/
	union {
		struct tree_node** children;/* alternatives nr. of children, followed by (a null pointer terminated) regex leaf node pointers) */
		char_bitmap_p* bitmap;
		struct leaf_info*  leaf;
	} u;
};

struct leaf_info {
	char* info;/* what does it mean that we reached the leaf...*/
	regex_t* preg;/* this is NULL if leaf node, and non-regex*/
};

/* Character classes */
enum wctype_t {ALNUM,DIGIT,PUNCT,ALPHA,GRAPH,SPACE,BLANK,LOWER,UPPER,CNTRL,PRINT,XDIGIT};
/* maps the name of a character class, as written in a pattern, to its enumerator */
static struct std_classmap {
		const char* classname;
		const enum wctype_t type;
} std_class[] = {
	{"[:alnum:]",ALNUM},
	{"[:digit:]",DIGIT},
	{"[:punct:]",PUNCT},
	{"[:alpha:]",ALPHA},
	{"[:graph:]",GRAPH},
	{"[:space:]",SPACE},
	{"[:blank:]",BLANK},
	{"[:lower:]",LOWER},
	{"[:upper:]",UPPER},
	{"[:cntrl:]",CNTRL},
	{"[:print:]",PRINT},
	{"[:xdigit:]",XDIGIT}
};

static const size_t std_class_cnt =  sizeof(std_class)/sizeof(std_class[0]);
/* Compile-time twin of std_class_cnt, usable as an array size.
 * The parentheses keep the division intact when the macro is used inside a larger expression. */
#define STD_CLASS_CNT (sizeof(std_class)/sizeof(std_class[0]))
typedef char char_bitmap_t[32];/* 32 bytes = one bit for each of the 256 character values */
static char_bitmap_p char_class_bitmap[STD_CLASS_CNT];
static unsigned short int char_class[256];
178
+
179
/* Prototypes of the file-local helpers that are used before their definition */
static void setup_matcher_engine(void);
static void matcher_engine_done(void);
static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info);
static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info);
static void destroy_tree(struct regex_matcher* matcher);


/* result codes of match_node(); regex_list_match() compares its result against MATCH_SUCCESS */
#define MATCH_SUCCESS 0 
#define MATCH_FAILED  -1
189
+
190
+
191
/*
 * Call this function when an unrecoverable error has occurred (instead of exit).
 * It calls regex_list_done() on @matcher and afterwards sets list_inited to -1.
 * The order matters: the -1 marker has to be written after regex_list_done() ran.
 */
static void fatal_error(struct regex_matcher* matcher)
{
	regex_list_done(matcher);
	matcher->list_inited = -1;/* the phishing module will know we tried to load a whitelist, and failed, so it will disable itself too*/
}
199
+
200
+
201
+/*
202
+ * @matcher - matcher structure to use
203
+ * @real_url - href target
204
+ * @display_url - <a> tag contents
205
+ * @hostOnly - if you want to match only the host part
206
+ *
207
+ * @return - CL_SUCCESS - url doesn't match
208
+ *         - CL_VIRUS - url matches list
209
+ *
210
+ * Do not send NULL pointers to this function!!
211
+ *
212
+ */
213
+int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info)
214
+{
215
+	assert(matcher);
216
+	assert(real_url);
217
+	assert(display_url);
218
+	assert(info);
219
+	if(!matcher->list_inited)
220
+		return 0;
221
+	assert(matcher->list_built);
222
+	{
223
+		size_t real_len    = strlen(real_url);
224
+		size_t display_len = strlen(display_url);
225
+		size_t buffer_len  = real_len + display_len + 1;
226
+		char*  buffer = cli_malloc(buffer_len+1);
227
+		int partcnt,rc;
228
+		unsigned long int partoff;
229
+
230
+		if(!buffer)
231
+			return CL_EMEM;
232
+
233
+		strncpy(buffer,real_url,real_len);
234
+		buffer[real_len]=' ';
235
+		strncpy(buffer+real_len+1,display_url,display_len);
236
+		buffer[buffer_len]=0;
237
+		cli_dbgmsg("Looking up in regex_list: %s\n");
238
+
239
+		rc = cli_ac_scanbuff(buffer,buffer_len,info,hostOnly ? matcher->root_hosts : matcher->root_urls,&partcnt,0,0,&partoff,0,-1,NULL);
240
+		if(!rc && !hostOnly) 
241
+			rc = match_node(matcher->root_regex,(unsigned char*)buffer,buffer_len,info) == MATCH_SUCCESS ? CL_VIRUS : CL_SUCCESS;
242
+		free(buffer);
243
+		if(!rc)
244
+			cli_dbgmsg("not in regex list\n");
245
+		return rc;
246
+	}
247
+}
248
+
249
+static struct tree_node* tree_root_alloc(void);
250
+
251
+
252
+/* node stack */
253
+#define NODE_STACK_INITIAL 1024
254
+#define NODE_STACK_GROW    4096
255
+/* Initialize @stack */
256
+static int stack_init(struct node_stack* stack)
257
+{
258
+	assert(stack);
259
+
260
+	stack->cnt = 0;
261
+	stack->capacity = NODE_STACK_INITIAL;
262
+	stack->data = cli_malloc(stack->capacity * sizeof(*stack->data));
263
+	if(!stack->data)
264
+		return CL_EMEM;
265
+	else
266
+		return CL_SUCCESS;
267
+}
268
+
269
/* Empties @stack by resetting its element count; the storage and its capacity are kept as they are */
static void stack_reset(struct node_stack* stack)
{
	assert(stack);

	stack->cnt = 0;
}
276
+
277
+/* Push @node on @stack, growing it if necessarry */
278
+static inline int stack_push(struct node_stack* stack,struct tree_node* node)
279
+{
280
+	assert(stack);
281
+	assert(stack->data);
282
+
283
+	if(stack->cnt == stack->capacity) {
284
+		stack->capacity += NODE_STACK_GROW;
285
+		stack->data = cli_realloc(stack->data,stack->capacity*sizeof(*stack->data));
286
+		if(!stack->data)
287
+			return CL_EMEM;
288
+	}
289
+	stack->data[stack->cnt++] = node;
290
+	return CL_SUCCESS;
291
+}
292
+
293
+/* Pops node from @stack, doesn't realloc */
294
+static inline struct tree_node* stack_pop(struct node_stack* stack)
295
+{
296
+	assert(stack);
297
+	assert(stack->data);
298
+	assert(stack->cnt);/*don't pop from empty stack */
299
+
300
+	return stack->cnt ? stack->data[--stack->cnt] : NULL;
301
+}
302
+
303
+/* Initialization & loading */
304
+
305
+/* Initializes @matcher, allocating necesarry substructures */
306
+int init_regex_list(struct regex_matcher* matcher)
307
+{
308
+	assert(matcher);
309
+	
310
+	setup_matcher_engine();
311
+
312
+	matcher->list_inited = 0;
313
+	matcher->root_hosts = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_hosts));
314
+	if(!matcher->root_hosts)
315
+		return CL_EMEM;
316
+
317
+	matcher->root_hosts->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
318
+	if(!matcher->root_hosts->ac_root) {
319
+		free(matcher->root_hosts);
320
+		return CL_EMEM;
321
+	}
322
+
323
+	matcher->root_urls = (struct cli_matcher*) cli_calloc(1,sizeof(*matcher->root_hosts));
324
+	if(!matcher->root_urls) {
325
+		free(matcher->root_hosts->ac_root);
326
+		free(matcher->root_hosts);
327
+		return CL_EMEM;
328
+	}
329
+
330
+	matcher->root_urls->ac_root =  (struct cli_ac_node *) cli_calloc(1, sizeof(struct cli_ac_node));
331
+	if(!matcher->root_urls->ac_root) {
332
+		free(matcher->root_hosts->ac_root);
333
+		free(matcher->root_hosts);
334
+		free(matcher->root_urls);
335
+		return CL_EMEM;
336
+	}
337
+
338
+	matcher->root_regex = tree_root_alloc();
339
+	if(!matcher->root_regex) {
340
+		free(matcher->root_hosts->ac_root);
341
+		free(matcher->root_hosts);
342
+		free(matcher->root_urls->ac_root);
343
+		free(matcher->root_urls);
344
+		return CL_EMEM;
345
+	}
346
+
347
+	stack_init(&matcher->node_stack);
348
+	stack_init(&matcher->node_stack_alt);
349
+
350
+	matcher->list_inited=1;
351
+	matcher->list_built=0;
352
+	matcher->list_loaded=0;
353
+
354
+	return CL_SUCCESS;
355
+}
356
+
357
+/* inserts @pattern into @root, using ac-matcher 
358
+ * although the name might be confusing, @pattern is not a regex!*/
359
+static int add_regex_list_element(struct cli_matcher* root,const char* pattern,char* info)
360
+{
361
+       int ret;
362
+       struct cli_ac_patt *new = cli_calloc(1,sizeof(*new));
363
+       size_t len;
364
+
365
+       if(!new)
366
+	       return CL_EMEM;
367
+       assert(root);
368
+       assert(pattern);
369
+
370
+       len = strlen(pattern);
371
+       new->type = 0;
372
+       new->sigid = 0;
373
+       new->parts = 0;
374
+       new->partno = 0;
375
+       new->mindist = 0;
376
+       new->maxdist = 0;
377
+       new->offset = 0;
378
+       new->target = 0;
379
+       new->length = len;
380
+       if(new->length > root->maxpatlen)
381
+               root->maxpatlen = new->length;
382
+
383
+       new->pattern = cli_malloc(sizeof(new->pattern[0])*len);
384
+       if(!new->pattern) {
385
+	       free(new);
386
+	       return CL_EMEM;
387
+       }
388
+       strncpy((char*)new->pattern,(const char*)pattern,len);
389
+
390
+       new->virname = info;
391
+       if((ret = cli_ac_addpatt(root,new))) {
392
+	       free(new->virname);
393
+               free(new->pattern);
394
+               free(new);
395
+               return ret;
396
+       }
397
+       return CL_SUCCESS;
398
+}
399
+
400
+
401
+#ifndef NDEBUG
402
+void dump_tree(struct tree_node* root);
403
+#endif
404
+static int matcher_engine_refcount=0;
405
+
406
+static int build_regex_list(struct regex_matcher* matcher);
407
+/* Load patterns/regexes from file */
408
+int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options)
409
+{
410
+	int rc,line=0;
411
+	char buffer[FILEBUFF];
412
+
413
+	assert(matcher);
414
+	assert(fd);
415
+
416
+	if(matcher->list_inited==-1)
417
+		return -1;
418
+	if(matcher->list_loaded) {
419
+		cli_warnmsg("Regex list has already been loaded, ignoring further requests for load\n");
420
+		return -1;/*TODO: better return code*/
421
+	}
422
+	if(!fd) {
423
+		cli_errmsg("Unable to load regex list (null file)\n");
424
+		return -1;/*TODO: return appropiate return code*/
425
+	}
426
+
427
+	cli_dbgmsg("Loading regex_list\n");
428
+	if(!matcher->list_inited) {
429
+		init_regex_list(matcher);
430
+		if (!matcher->list_inited) {
431
+			cli_errmsg("Regex list failed to initialize!\n");
432
+			fatal_error(matcher);
433
+			return -1;
434
+		}
435
+		/*atexit(regex_list_done); TODO: destroy this in manager.c */
436
+	}
437
+	/*
438
+	 * Regexlist db format (common to .wdb(whitelist) and .pdb(domainlist) files:
439
+	 * Multiple lines of form, (empty lines are skipped):
440
+ 	 * Flags RealURL DisplayedURL
441
+	 * Where:
442
+	 * Flags: R - regex, H - host-only, followed by (optional) 3-digit hexnumber representing 
443
+	 * flags that should be filtered.
444
+	 * [i.e. phishcheck urls.flags that we don't want to be done for this particular host]
445
+	 * Note:Flag filtering only makes sense in .pdb files.
446
+	 *
447
+	 * If a line in the file doesn't conform to this format, loading fails
448
+	 * 
449
+	 */
450
+	while(fgets(buffer,FILEBUFF,fd)) {
451
+		char* pattern;
452
+		char* flags;
453
+		line++;
454
+		cli_chomp(buffer);
455
+		if(!*buffer)
456
+			continue;/* skip empty lines */
457
+		pattern = strchr(buffer,' ');
458
+		if(!pattern) {
459
+			cli_errmsg("Malformed regex list line %d\n",line);
460
+			fatal_error(matcher);
461
+			return CL_EMALFDB;
462
+		}
463
+		pattern[0]='\0';
464
+		flags=buffer+1;
465
+		pattern++;
466
+		if(buffer[0] == 'R') {
467
+			if(( rc = add_pattern(matcher,(const unsigned char*)pattern,flags) ))
468
+				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
469
+		}
470
+		else if(buffer[0] == 'H') {
471
+			if(( rc = add_regex_list_element(matcher->root_hosts,pattern,flags) ))
472
+				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
473
+		}
474
+		else {
475
+			if(( rc = add_regex_list_element(matcher->root_urls,pattern,flags) ))
476
+				return rc==CL_EMEM ? CL_EMEM : CL_EMALFDB;
477
+		}
478
+	}
479
+	matcher->list_loaded = 1;
480
+	build_regex_list(matcher);
481
+
482
+#ifndef NDEBUG
483
+/*			dump_tree(matcher->root_regex);*/
484
+#endif
485
+	if(!matcher->list_built) {
486
+		cli_errmsg("Regex list not loaded: build failed!\n");
487
+		fatal_error(matcher);
488
+		return CL_EMALFDB;
489
+	}
490
+	regex_list_cleanup(matcher);
491
+	matcher_engine_refcount++;
492
+	return CL_SUCCESS;
493
+}
494
+
495
+/*
496
+static void tree_node_merge_nonbin(struct tree_node* into,const struct tree_node* node)
497
+{
498
+	assert(into);
499
+	assert(node);
500
+
501
+	if(node->alternatives){
502
+		if(node->u.children[0]->next == node) {
503
+			*no non-bin alternatives here*
504
+		}
505
+		else {
506
+			struct tree_node* p;
507
+			for(p = node->u.children[0]->next; p->next != node; p = p->next)
508
+				tree_node_insert_nonbin(into,p);
509
+		}
510
+	}
511
+	else
512
+		tree_node_insert_nonbin(into,node->u.children[0]);
513
+}
514
+*
515
+static void tree_node_merge_bin(struct tree_node* into,const struct tree_node* node)
516
+{
517
+	if(node->u.children && node->alternatives) {
518
+		if(!into->alternatives) {
519
+			* into has no bin part, just copy+link the node there*
520
+			int i;
521
+			struct tree_node* next = into->u.children[0];
522
+			into->u.children = node->u.children;
523
+			into->alternatives = node->alternatives;
524
+			for(i=0;i < into->alternatives;i++) {
525
+				if(into->u.children[i]->next == node) {
526
+					into->u.children[i]->next = next;
527
+					into->u.children[i]->listend = 0;
528
+				}
529
+				else {
530
+					struct tree_node* p;
531
+					for(p = into->u.children[0]->next; p->next != node; p = p->next);
532
+					p->listend = 0;
533
+					p->next = next;
534
+				}
535
+			}
536
+		}
537
+		const size_t new_size = tree_node_get_array_size(into) + tree_node_get_array_size(node);
538
+		struct tree_node** new_children = cli_malloc(sizeof(
539
+	}
540
+	* else: no bin part to merge *
541
+}
542
+*/
543
+
544
+static struct tree_node ** tree_node_get_children(const struct tree_node* node)
545
+{
546
+	return node->op==OP_CUSTOMCLASS ? (node->u.children[1] ? node->u.children+1 : NULL) :node->u.children;
547
+}
548
+/* don't do this, it wastes too much memory, and has no benefit
549
+static void regex_list_dobuild(struct tree_node* called_from,struct tree_node* node)
550
+{
551
+	struct tree_node **children;
552
+	assert(node);
553
+
554
+	children = tree_node_get_children(node);
555
+	if(node->op!=OP_ROOT)
556
+		assert(called_from);
557
+	if(node->op==OP_TMP_PARCLOSE) {
558
+		const size_t array_size = (node->alternatives +(called_from->op==OP_CUSTOMCLASS ? 1:0))*sizeof(*called_from->u.children);
559
+		if(node->c)
560
+			return;* already processed this common node*
561
+		else
562
+			node->c = 1;
563
+		* copy children to called_from from this node
564
+		 * called_from should have 0 alternatives, and a link to this node via ->u.children[0]
565
+		 * *
566
+		assert(called_from->alternatives == 0);
567
+		assert(called_from->u.children);
568
+		assert(called_from->u.children[0] == node);
569
+		called_from->u.children = cli_realloc(called_from->u.children,array_size);
570
+		called_from->u.children = node->u.children;
571
+		called_from->alternatives = node->alternatives;
572
+		if(called_from->alternatives) {
573
+			* fix parent pointers *
574
+			int i;TODO: do a deep copy of children here
575
+			struct tree_node **from_children = tree_node_get_children(called_from);
576
+                        assert(from_children);
577
+			for(i=0;i < called_from->alternatives;i++) {
578
+				struct tree_node* p;
579
+				for(p=from_children[i];p->next != node; p = p->next);
580
+				p->next = called_from;
581
+			}
582
+		}
583
+	}
584
+
585
+	if(node->op==OP_LEAF) 
586
+	return;
587
+	else if (node->alternatives) {
588
+		int i;
589
+		struct tree_node* p;
590
+		assert(children);
591
+		p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
592
+		for(i=0;i<node->alternatives;i++)
593
+			regex_list_dobuild(node,children[i]);
594
+		if(p && p!=node)
595
+			regex_list_dobuild(node,p);
596
+	} else {
597
+		if(children) 
598
+			if (children[0])
599
+				regex_list_dobuild(node,children[0]);
600
+	}
601
+	if(node->next && !node->listend)
602
+		regex_list_dobuild(node,node->next);
603
+	if(node->op==OP_TMP_PARCLOSE)
604
+		node->c=0;
605
+	*free(node);*
606
+}
607
+*/
608
+/* Build the matcher list */
609
+static int build_regex_list(struct regex_matcher* matcher)
610
+{
611
+	if(!matcher->list_inited || !matcher->list_loaded) {
612
+		cli_errmsg("Regex list not loaded!\n");
613
+		return -1;/*TODO: better error code */
614
+	}
615
+	cli_dbgmsg("Building regex list\n");
616
+	cli_ac_buildtrie(matcher->root_hosts);
617
+	cli_ac_buildtrie(matcher->root_urls);
618
+	matcher->list_built=1;
619
+
620
+	return CL_SUCCESS;
621
+}
622
+
623
+
624
+static void stack_destroy(struct node_stack* stack);
625
+/* Done with this matcher, free resources */
626
+void regex_list_done(struct regex_matcher* matcher)
627
+{
628
+	assert(matcher);
629
+
630
+	regex_list_cleanup(matcher);
631
+	if(matcher->list_loaded) {
632
+		cli_ac_free(matcher->root_hosts);
633
+		free(matcher->root_hosts);
634
+		matcher->root_hosts=NULL;
635
+
636
+		cli_ac_free(matcher->root_urls);
637
+		free(matcher->root_urls);
638
+		matcher->root_urls=NULL;
639
+
640
+		matcher->list_built=0;
641
+		destroy_tree(matcher);
642
+		matcher->list_loaded=0;
643
+	}
644
+	if(matcher->list_inited) {
645
+		matcher_engine_done();
646
+		matcher->list_inited=0;
647
+	}
648
+	stack_destroy(&matcher->node_stack);
649
+	stack_destroy(&matcher->node_stack_alt);
650
+}
651
+
652
+/* Tree matcher algorithm */
653
+
654
+static int cli_iswctype(const char c,const enum wctype_t type)
655
+{
656
+	switch(type) {
657
+		case ALNUM:
658
+			return isalnum(c);
659
+		case DIGIT:
660
+			return isdigit(c);
661
+		case PUNCT:
662
+			return ispunct(c);
663
+		case ALPHA:
664
+			return isalpha(c);
665
+		case GRAPH:
666
+			return isgraph(c);
667
+		case SPACE:
668
+			return isspace(c);
669
+		case BLANK:
670
+			return c=='\t' || c==' ';
671
+		case LOWER:
672
+			return islower(c);
673
+		case UPPER:
674
+			return isupper(c);
675
+		case CNTRL:
676
+			return iscntrl(c);
677
+		case PRINT:
678
+			return isprint(c);
679
+		case XDIGIT:
680
+			return isxdigit(c);
681
+		default: {
682
+				 cli_warnmsg("Unknown char class in iswctype\n");
683
+	 			 return 0;
684
+			 }
685
+	}
686
+}
687
+
688
+static int engine_inited=0;
689
+
690
+static void setup_matcher_engine(void)
691
+{
692
+	/*Set up std character classes*/
693
+	size_t i;
694
+	size_t j;
695
+	if(engine_inited)
696
+		return;
697
+	memset(char_class,0,256);
698
+	for(i=0;i<std_class_cnt;i++) {
699
+		enum wctype_t type = std_class[i].type;
700
+		char_class_bitmap[i]=cli_calloc(256>>3,1);
701
+		for(j=0;j<256;j++)
702
+			if(cli_iswctype(j,type)) {
703
+				char_class[j] |= 1<<i;
704
+				char_class_bitmap[i][j>>3] |= 1<<(j&0x07);
705
+			}
706
+	}	
707
+	engine_inited=1;
708
+}
709
+
710
+static void matcher_engine_done(void)
711
+{
712
+	size_t i;
713
+	matcher_engine_refcount--;
714
+	if(!matcher_engine_refcount) {
715
+		for(i=0;i<std_class_cnt;i++)
716
+			free(char_class_bitmap[i]);
717
+	}
718
+	engine_inited=0;
719
+}
720
+
721
+struct token_t
722
+{
723
+	size_t len;
724
+	char   type;
725
+	union {
726
+		const unsigned char* start;
727
+		char_bitmap_p        bitmap;
728
+	} u;
729
+};
730
+
731
+enum {TOKEN_CHAR,TOKEN_DOT,TOKEN_PAR_OPEN,TOKEN_PAR_CLOSE,TOKEN_BRACKET,TOKEN_ALT,TOKEN_REGEX,TOKEN_DONE};
732
+
733
+static const unsigned char* getNextToken(const unsigned char* pat,struct token_t* token)
734
+{
735
+	assert(pat);
736
+	assert(token);
737
+
738
+	switch(*pat) {
739
+		case '\\':
740
+			token->type=TOKEN_CHAR;
741
+			token->u.start = ++pat;
742
+			if(islower(token->u.start)) {
743
+				/* handle \n, \t, etc. */
744
+				char c;
745
+				if(snprintf(&c,1,"\%c",token->u.start)!=1)
746
+					token->type=TOKEN_REGEX;
747
+				token->u.start=c;
748
+			}
749
+			token->len   = 1;
750
+			break;
751
+		case '|':
752
+			token->type=TOKEN_ALT;
753
+			break;
754
+		case '*':
755
+		case '+':
756
+		case '?':
757
+		case '{':
758
+		case '}':
759
+			token->type=TOKEN_REGEX;
760
+/*			assert(0 && "find_regex_start should have forbidden us from finding regex special chars");*/
761
+			break;
762
+		case '[':
763
+			{
764
+			/*TODO: implement*/
765
+			/*see if it is something simple like a list of characters, a range, or negated ...*/
766
+			const unsigned char* old=pat++;/* save this in case we change our mind and decide this is too complicated for us to handle*/
767
+			unsigned char range_start=0;
768
+			int hasprev = 0;
769
+			char_bitmap_p bitmap = cli_malloc(32);
770
+			if(!bitmap)
771
+				return NULL;
772
+			if (*pat=='^') {
773
+				memset(bitmap,0xFF,32);/*match chars not in brackets*/
774
+				pat++;
775
+			}
776
+			else
777
+				memset(bitmap,0x00,32);
778
+			do {
779
+				/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
780
+				if (*pat=='-' && hasprev) {
781
+					/* it is a range*/
782
+					unsigned char range_end;
783
+					unsigned int c;
784
+					assert(range_start);
785
+					pat++;
786
+					if (pat[0]=='[')
787
+						if (pat[1]=='.') {
788
+							if(pat[2]=='-' && pat[3]=='.' && pat[4]==']')
789
+								range_end = '-';
790
+							else {
791
+								/* this is getting complicated, bail out */
792
+								cli_warnmsg("confused about collating sequences in regex,bailing out");
793
+								pat=old;
794
+								token->type=TOKEN_REGEX;
795
+								break;
796
+							}
797
+						}
798
+						else 
799
+							range_end = *pat;
800
+					else
801
+						range_end = *pat;
802
+					for(c=range_start+1;c<=range_end;c++)
803
+						bitmap[c>>3] ^= 1<<(c&0x7);
804
+					hasprev = 0;
805
+				}
806
+				else if (pat[0]=='[' && pat[1]==':') {
807
+							const unsigned char* end;
808
+							int len,found=-1;
809
+							size_t i;
810
+
811
+							pat+=2;
812
+							end=(unsigned char*)strstr((const char*)pat,":]");
813
+							if(!end) {
814
+								cli_warnmsg("confused about std char class syntax regex,bailing out");
815
+								pat=old;
816
+								token->type=TOKEN_REGEX;
817
+								break;
818
+							}
819
+
820
+							len = end-pat;
821
+							for(i=0;i<std_class_cnt;i++)
822
+								if(!strncmp((const char*)pat,std_class[i].classname,len)) {
823
+									found=i;
824
+									break;
825
+								}
826
+							if(found!=-1) {
827
+								for(i=0;i<256;i++)
828
+									if(char_class[i]&(1<<found))
829
+										bitmap[i>>3] ^= 1<<(i&0x7);
830
+							}
831
+							else {
832
+								/*unknown class*/
833
+								cli_warnmsg("confused about regex bracket expression, bailing out");
834
+								pat=old;
835
+								token->type=TOKEN_REGEX;
836
+								break;
837
+							}
838
+						}
839
+				else {
840
+					bitmap[*pat>>3] ^= 1<<(*pat&0x7);
841
+					pat++;
842
+					range_start = *pat;
843
+					hasprev = 1;
844
+				}
845
+			} while(*pat!=']');
846
+			/*TODO: see if this bitmap already exists, then reuse*/			
847
+			token->type = TOKEN_BRACKET;
848
+			token->u.bitmap = bitmap;
849
+			break;
850
+			}
851
+		case ']':
852
+			assert(0 && "Encountered ] without matching [");
853
+			/* bad state */
854
+			break;
855
+		case '.':
856
+			token->type=TOKEN_DOT;
857
+			break;
858
+		case '(':
859
+			token->type=TOKEN_PAR_OPEN;
860
+			break;
861
+		case ')':
862
+			token->type=TOKEN_PAR_CLOSE;
863
+			break;
864
+		default:
865
+			token->type=TOKEN_CHAR;
866
+			token->u.start = pat;
867
+			token->len=1;
868
+			break;
869
+	}
870
+	return ++pat;
871
+}
872
+
873
+#define INITIAL_ALT_STACK 10
874
+#define ALT_STACK_GROW 20
875
+
876
+static const unsigned char* find_regex_start(const unsigned char* pat)
877
+{
878
+	struct token_t token;
879
+	/*TODO: find where the regex part begins, for ex:
880
+	 * abcd+, regex begins at 'd'
881
+	 * */
882
+	const unsigned char* last=NULL;
883
+	const unsigned char* tmp=NULL;
884
+	const unsigned char** altpositions = cli_malloc(INITIAL_ALT_STACK*sizeof(*altpositions));
885
+	size_t altpositions_capacity = INITIAL_ALT_STACK;
886
+	size_t altpositions_cnt = 0;
887
+	char lasttype = -1;
888
+	if(!altpositions)
889
+		return NULL;
890
+	assert(pat);
891
+
892
+	/* Try to parse pattern till special regex chars are encountered, that the tree-matcher doesn't handle, like: +,*,{}.
893
+	 * The tricky part is that once we encounter these, the previous 'atom' has to be passed on to the regex matcher, so we have to
894
+	 * back up to the last known good position
895
+	 * Example, if we have: abc(defg)+, then only abc can be handled by tree parser, so we have to return the position of (.
896
+	 * Another example: abc(defg|xyz|oz+|pdo), the last known good position is |, after xyz
897
+	 * TODO: what about open parantheses? maybe once we found a special char, we have top back out before the first (?
898
+	 * */
899
+	do {	
900
+		tmp = pat;
901
+		pat = getNextToken(pat,&token);
902
+		if(token.type!=TOKEN_REGEX) {
903
+			last = tmp;
904
+			lasttype = token.type;
905
+			if(token.type==TOKEN_BRACKET)
906
+				free(token.u.bitmap);
907
+			if(token.type==TOKEN_ALT || token.type==TOKEN_PAR_OPEN) {
908
+				/* save this position on stack, succesfully parsed till here*/
909
+				if(altpositions_cnt && altpositions[altpositions_cnt-1][0]=='|')
910
+					/* encountered another alternate (|) operator, override previous | position stored */
911
+					altpositions[altpositions_cnt-1]=last;
912
+				else {
913
+					altpositions[altpositions_cnt++] = last;
914
+					if(altpositions_cnt == altpositions_capacity) {
915
+						altpositions_capacity += ALT_STACK_GROW;
916
+						altpositions = cli_realloc(altpositions,altpositions_capacity*sizeof(*altpositions));
917
+						if(!altpositions)
918
+							return NULL;
919
+					}
920
+				}
921
+			} else if (lasttype==TOKEN_PAR_CLOSE) {
922
+				/* remove last stored position from stack, succesfully this last group */
923
+				altpositions_cnt--;
924
+				assert(altpositions_cnt>0);
925
+			}
926
+		}
927
+		else {
928
+			if(altpositions_cnt)
929
+				last = altpositions[0 /*altpositions_cnt-1*/];/*TODO: which index here?, see above TODO... */
930
+			/*last stored 'safe' position where no special (+,*,{}) regex chars were encountered*/
931
+		}
932
+	} while(*pat && token.type!=TOKEN_REGEX);
933
+	free(altpositions);
934
+	return *pat ? last : last+1;
935
+}
936
+
937
+static struct tree_node* tree_node_alloc(struct tree_node* next,char listend)
938
+{
939
+	struct tree_node* node = cli_malloc(sizeof(*node));
940
+	if(node) {
941
+		node->alternatives=0;
942
+		node->next=next;
943
+		node->listend=listend;
944
+		node->u.children=NULL;
945
+	}
946
+	return node;
947
+}
948
+
949
+static struct tree_node* tree_root_alloc(void)
950
+{
951
+	struct tree_node* root=tree_node_alloc(NULL,1);
952
+	if(root) {
953
+		root->op=OP_ROOT;
954
+		root->c=0;
955
+		root->next=NULL;
956
+		root->listend=1;
957
+	}
958
+	return root;
959
+}
960
+static inline struct tree_node* tree_node_char_binsearch(const struct tree_node* node,const char csearch,int* left)
961
+{
962
+	int right;
963
+	struct tree_node **children;
964
+	assert(node);
965
+	assert(left);
966
+
967
+	children = tree_node_get_children(node);
968
+	right = node->alternatives-1;
969
+	*left = 0;
970
+	if(!node->alternatives)
971
+		return NULL;
972
+	assert(children);
973
+	while(*left<=right) {
974
+		int mid  = *left+(right-*left)/2;
975
+		if(children[mid]->c == csearch)
976
+			return children[mid]; 
977
+		else if(children[mid]->c < csearch)
978
+			*left=mid+1;
979
+		else
980
+			right=mid-1;
981
+	}
982
+	return NULL;
983
+}
984
+
985
+static inline struct tree_node* tree_get_next(struct tree_node* node)
986
+{
987
+	struct tree_node** children;
988
+	assert(node);
989
+	children = tree_node_get_children(node);
990
+
991
+	if(!node->alternatives && children && children[0])
992
+		return children[0];
993
+	else if(node->alternatives<=1)
994
+		return node;
995
+	else
996
+		return children[0]->next;
997
+}
998
+
999
+static inline size_t tree_node_get_array_size(const struct tree_node* node)
1000
+{
1001
+	assert(node);
1002
+	/* if op is CUSTOMCLASS, then first pointer is pointer to bitmap, so array size is +1 */
1003
+	return (node->alternatives + (node->op==OP_CUSTOMCLASS ? 1 : 0)) * sizeof(node->u.children[0]);
1004
+}
1005
+
1006
+static inline struct tree_node* tree_node_char_insert(struct tree_node* node,const char c,int left)
1007
+{
1008
+	struct tree_node* new, *alt = tree_get_next(node);
1009
+	node->alternatives++;
1010
+	node->u.children = cli_realloc(node->u.children,tree_node_get_array_size(node));
1011
+	if(!node->u.children)
1012
+		return NULL;
1013
+
1014
+	new = tree_node_alloc(alt , node == alt );
1015
+	if(new) {
1016
+		new->op=OP_CHAR;
1017
+		new->c=c;
1018
+	}
1019
+
1020
+	if(node->alternatives-left-1>0)
1021
+			memmove(&node->u.children[left+1],&node->u.children[left],(node->alternatives-left-1)*sizeof(node->u.children[0]));
1022
+	node->u.children[left] = new;	
1023
+
1024
+	return new;
1025
+}
1026
+
1027
+static inline void tree_node_insert_nonbin(struct tree_node* node, struct tree_node* new)
1028
+{
1029
+	struct tree_node **children;
1030
+	assert(node);
1031
+	assert(new);
1032
+
1033
+	children = tree_node_get_children(node);
1034
+	if(node->alternatives) {
1035
+		assert(children);
1036
+	       	if(children[0]->next == node) {
1037
+			int i;
1038
+			new->listend = 1;
1039
+			for(i=0;i<node->alternatives;i++) {
1040
+				children[i]->next = new;
1041
+				children[i]->listend = 0;
1042
+			}
1043
+		}
1044
+		else {
1045
+			struct tree_node* p;
1046
+			for(p = children[0]->next ; p->next != node ; p = p->next)
1047
+				assert(!p->listend);
1048
+			new->listend = 1;
1049
+			p->listend = 0;
1050
+			p->next = new;
1051
+		}
1052
+	}
1053
+	else {
1054
+		node->u.children = cli_realloc(node->u.children,sizeof(node->u.children[0])*( node->op==OP_CUSTOMCLASS ? 2 : 1 ));
1055
+		if(node->u.children)
1056
+			node->u.children[ node->op==OP_CUSTOMCLASS ? 1 : 0 ] = new;
1057
+	}
1058
+}
1059
+
1060
+static inline unsigned char char_getclass(const unsigned char* bitmap)
1061
+{
1062
+	size_t i;
1063
+	assert(bitmap);
1064
+
1065
+	for(i=0;i<std_class_cnt;i++)
1066
+		if(!memcmp(bitmap,char_class_bitmap[i],256>>3))
1067
+			return i;
1068
+	return std_class_cnt;
1069
+}
1070
+
1071
+static void stack_destroy(struct node_stack* stack)
1072
+{
1073
+	assert(stack);
1074
+	if(stack->data)
1075
+		free(stack->data);
1076
+	stack->data = NULL;
1077
+	stack->capacity = 0;
1078
+}
1079
+
1080
+
1081
+/* call this after whitelist load is complete, and the tree is no longer going to be modified */
1082
+void regex_list_cleanup(struct regex_matcher* matcher)
1083
+{
1084
+	assert(matcher);
1085
+
1086
+	stack_destroy(&matcher->node_stack);
1087
+	stack_destroy(&matcher->node_stack_alt);
1088
+	stack_init(&matcher->node_stack);
1089
+	stack_init(&matcher->node_stack_alt);
1090
+}
1091
+
1092
+int is_regex_ok(struct regex_matcher* matcher)
1093
+{
1094
+	assert(matcher);
1095
+	return (!matcher->list_inited || matcher->list_inited!=-1);/* either we don't have a regexlist, or we initialized it successfully */
1096
+}
1097
+
1098
+/* returns 0 on success, regexec error code otherwise */						
1099
+static int add_pattern(struct regex_matcher* matcher,const unsigned char* pat,const char* info)
1100
+{
1101
+	int bol=1;
1102
+	const unsigned char* pat_end = find_regex_start(pat);
1103
+	struct token_t token;
1104
+	struct tree_node* node;
1105
+	
1106
+	assert(matcher);
1107
+
1108
+	node = matcher->root_regex;
1109
+
1110
+	stack_reset(&matcher->node_stack);
1111
+	stack_reset(&matcher->node_stack_alt);
1112
+	stack_push(&matcher->node_stack,node);
1113
+
1114
+	for(;node->op!=OP_LEAF;){
1115
+		if(pat<pat_end)
1116
+			pat  = getNextToken(pat,&token);
1117
+		else if(*pat) {
1118
+			token.type = TOKEN_REGEX;
1119
+			token.u.start=pat;
1120
+		}
1121
+		else
1122
+			token.type = TOKEN_DONE;
1123
+
1124
+		switch(token.type) {
1125
+			case TOKEN_CHAR: 
1126
+				{
1127
+					/* search for char in tree */
1128
+					int left;
1129
+					struct tree_node* newnode = tree_node_char_binsearch(node,*token.u.start,&left);
1130
+					if(newnode)
1131
+						node = newnode;
1132
+					else {
1133
+						/* not found, insert it */
1134
+						node = tree_node_char_insert(node,*token.u.start,left);
1135
+					}
1136
+					break;
1137
+				}
1138
+
1139
+			case TOKEN_PAR_OPEN:
1140
+				stack_push(&matcher->node_stack_alt,NULL);/* marker */
1141
+				stack_push(&matcher->node_stack,node);
1142
+				break;
1143
+
1144
+			case TOKEN_PAR_CLOSE: {
1145
+						      /*TODO: test this!!!*/
1146
+						      struct tree_node* node_alt = node;
1147
+						      node = tree_node_alloc(NULL,1);
1148
+						      node->op=OP_PARCLOSE;
1149
+						      node->c=0;
1150
+						      node->listend=1;
1151
+						      tree_node_insert_nonbin(node_alt,node);
1152
+						      while (( node_alt = stack_pop(&matcher->node_stack_alt) )) {
1153
+							      tree_node_insert_nonbin(node_alt,node);
1154
+						      }
1155
+				      		      stack_pop(&matcher->node_stack);					      
1156
+		      				      break;
1157
+					      }
1158
+
1159
+			case TOKEN_ALT:
1160
+				stack_push(&matcher->node_stack_alt,node);
1161
+				node = stack_pop(&matcher->node_stack);
1162
+				stack_push(&matcher->node_stack,node);
1163
+				break;
1164
+
1165
+			case TOKEN_BRACKET:
1166
+				{
1167
+					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1168
+					unsigned char charclass = char_getclass(token.u.start);
1169
+					if(charclass == std_class_cnt) {/*not a std char class*/
1170
+						new->op = OP_CUSTOMCLASS;
1171
+						new->u.children = cli_malloc(sizeof(new->u.children[0])*2);
1172
+						new->u.bitmap[0] = token.u.bitmap;
1173
+						new->u.bitmap[1] = NULL;
1174
+						tree_node_insert_nonbin(node,new);
1175
+						node = new;
1176
+					}
1177
+					else {
1178
+						new->op = OP_STDCLASS;
1179
+						new->c = charclass;
1180
+						tree_node_insert_nonbin(node,new);
1181
+						node=new;
1182
+					}
1183
+					break;
1184
+				}
1185
+
1186
+			case TOKEN_DOT:
1187
+				{
1188
+					struct tree_node* new = tree_node_alloc(tree_get_next(node),1);
1189
+					new->op = OP_DOT;
1190
+					tree_node_insert_nonbin(node,new);
1191
+					node=new;
1192
+					break;
1193
+				}
1194
+
1195
+			case TOKEN_REGEX:
1196
+			case TOKEN_DONE: {
1197
+						 struct leaf_info* leaf=cli_malloc(sizeof(*leaf));
1198
+						 leaf->info=strdup(info);
1199
+						 if(token.type==TOKEN_REGEX) {
1200
+							 int rc;
1201
+							 struct tree_node* new;
1202
+							 regex_t* preg;
1203
+							 preg=cli_malloc(sizeof(*preg));
1204
+							 rc = regcomp(preg,(const char*)token.u.start,bol?0:REG_NOTBOL);
1205
+							 leaf->preg=preg;
1206
+							 if(rc)
1207
+								 return rc;
1208
+							 new=cli_malloc(sizeof(*new));
1209
+							 new->op=OP_LEAF;
1210
+							 new->next=node;
1211
+							 new->alternatives=0;
1212
+							 new->u.leaf=leaf;
1213
+							 new->listend=1;
1214
+							 tree_node_insert_nonbin(node,new);
1215
+						 }
1216
+						 else {
1217
+							 leaf->preg=NULL;
1218
+							 node->alternatives=0;
1219
+							 node->u.leaf=leaf;
1220
+							 node->op=OP_LEAF;
1221
+						 }
1222
+						 return 0;
1223
+					 }
1224
+		}
1225
+
1226
+		bol=0;
1227
+	}
1228
+	return 0;
1229
+}
1230
+
1231
+/* c has to be unsigned char here!! */
1232
+static int match_node(struct tree_node* node,const unsigned char* c,size_t len,const char** info)
1233
+{
1234
+	struct tree_node** children;
1235
+	int rc;
1236
+
1237
+	assert(node);
1238
+	assert(c);
1239
+	assert(info);
1240
+
1241
+	*info = NULL;
1242
+	len++;
1243
+	c--;
1244
+	for(;;) {
1245
+		assert(node);
1246
+		children = node->u.children;
1247
+		switch(node->op) {
1248
+			case OP_ROOT:
1249
+				rc=1;
1250
+				break;
1251
+			case OP_PARCLOSE:
1252
+				/*this isn't a real character, so don't move*/
1253
+				c--;
1254
+				len++;
1255
+				rc=1;
1256
+				break;
1257
+			case OP_CHAR:
1258
+				assert(*c==node->c && "We know this has to match");
1259
+				rc = 1;/* *c==node->c;- we know it has matched */
1260
+				break;
1261
+			case OP_DOT:	
1262
+				rc = *c!='\n';
1263
+				break;
1264
+			case OP_STDCLASS:
1265
+				rc = char_class[*c]&(node->c);
1266
+				break;
1267
+			case OP_CUSTOMCLASS:
1268
+			{
1269
+				char_bitmap_p bitmap;
1270
+				assert(children);
1271
+				bitmap = (char_bitmap_p)node->u.bitmap[0];
1272
+				children++;
1273
+				rc = bitmap[*c>>3]&(1<<(*c&0x7));
1274
+				break;
1275
+			}
1276
+			case OP_LEAF:
1277
+			{
1278
+				const struct leaf_info* leaf = node->u.leaf;
1279
+				/*isleaf = 1;*/
1280
+				if(leaf->preg) {
1281
+					rc = !regexec(leaf->preg,(const char*)c,0,NULL,0);
1282
+				}
1283
+				else  {
1284
+					assert(*c==node->c && "We know this has to match[2]");
1285
+					rc = 1;
1286
+				}
1287
+				if(rc) {
1288
+					*info = leaf->info;
1289
+					return MATCH_SUCCESS;
1290
+				}
1291
+				break;
1292
+			}
1293
+			default:
1294
+				/* impossible */
1295
+				cli_errmsg("Encountered invalid operator in tree:%d\n",node->op);
1296
+				exit(1);
1297
+		}
1298
+		len--;
1299
+		if(!len) rc=0;
1300
+		c++;
1301
+		if(rc) {
1302
+			const char csearch = *c;
1303
+			int left = 0,right = node->alternatives-1;
1304
+			int mid;
1305
+			/*matched so far, go deeper*/
1306
+			/*do a binary search between children */
1307
+			assert(children);
1308
+			while(left<=right) {
1309
+				mid  = left+(right-left)/2;
1310
+				if (children[mid]->c == csearch)
1311
+					break;
1312
+				else if(children[mid]->c < csearch)
1313
+					left=mid+1;
1314
+				else
1315
+					right=mid-1;
1316
+			}
1317
+			if(left<=right) {
1318
+				node = children[mid];
1319
+				assert(node);
1320
+			}
1321
+			else {
1322
+				if(node->alternatives) {
1323
+					if(!children[0]->listend) {
1324
+						node = children[0];
1325
+						c++;
1326
+						len--;
1327
+					}
1328
+					while(node && node->listend) {
1329
+						node = node->next;/* climb up */
1330
+						c--;
1331
+						len++;
1332
+					}
1333
+					if(!node || !node->next) 
1334
+						return MATCH_FAILED;/* reached root node */
1335
+					node=node->next;
1336
+					c--;
1337
+					len++;
1338
+				}
1339
+				else if(node->u.children) {
1340
+					struct tree_node* rewrite_next = NULL;
1341
+					if(node->op==OP_PARCLOSE) 
1342
+						rewrite_next = node;
1343
+					node = children[0];
1344
+					assert(node);
1345
+					assert(node->op!=OP_CHAR);
1346
+					if(rewrite_next)
1347
+						node->next = rewrite_next;/* this node is pointed to by several parent nodes, 
1348
+									     we need to know 
1349
+									     from which one we came, so we can find out way back
1350
+									     should we fail to match somewhere deeper*/
1351
+				}
1352
+			}
1353
+		}
1354
+		else {
1355
+			/* this node didn't match, try sibling, or parent (if no more siblings) */
1356
+			while(node && node->listend) {
1357
+				node = node->next;/* sibling of parent */
1358
+				c--;
1359
+				len++;
1360
+			}
1361
+			if(!node || !node->next) /* reached root node, it has no next */
1362
+				return MATCH_FAILED;
1363
+			else node=node->next;
1364
+		}
1365
+	}
1366
+	return MATCH_FAILED;
1367
+}
1368
+
1369
+/* push node on stack, only if it isn't there already */
1370
+static inline void stack_push_once(struct node_stack* stack,struct tree_node* node)
1371
+{
1372
+	size_t i;
1373
+	assert(stack);
1374
+	assert(node);
1375
+
1376
+	for(i=0;i < stack->cnt;i++)
1377
+		if(stack->data[i]==node)
1378
+			return;
1379
+	stack_push(stack,node);
1380
+}
1381
+
1382
+static void destroy_tree_internal(struct regex_matcher* matcher,struct tree_node* node)
1383
+{
1384
+	struct tree_node **children;
1385
+	assert(matcher);
1386
+	assert(node);
1387
+
1388
+	children = tree_node_get_children(node);
1389
+	if(node->op==OP_LEAF) {
1390
+		struct leaf_info* leaf = node->u.leaf;
1391
+		if(node->next && !node->listend)
1392
+			destroy_tree_internal(matcher,node->next);
1393
+		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.leaf);/* cast to make compiler happy, and to not make another stack implementation for storing void* */
1394
+		stack_push_once(&matcher->node_stack,node);
1395
+		if(leaf->preg) {
1396
+			regfree(leaf->preg);
1397
+			free(leaf->preg);
1398
+			leaf->preg=NULL;
1399
+		}
1400
+		if(leaf->info) {
1401
+			free(leaf->info);
1402
+			leaf->info=NULL;
1403
+		}
1404
+	/*	return;*/
1405
+	}
1406
+	if(node->alternatives) {
1407
+		int i;
1408
+		struct tree_node* p;
1409
+		assert(children);
1410
+		p = children[0]->op==OP_LEAF ? NULL : children[0]->next;
1411
+		for(i=0;i<node->alternatives;i++)
1412
+			destroy_tree_internal(matcher,children[i]);
1413
+		if(p && p!=node)
1414
+			destroy_tree_internal(matcher,p);/*?? is this ok, or without _internal?*/
1415
+	}
1416
+	else {
1417
+		if(children) {
1418
+			if(children[0])
1419
+				destroy_tree_internal(matcher,children[0]);		
1420
+		}
1421
+	}
1422
+	if(node->next && !node->listend)
1423
+		destroy_tree_internal(matcher,node->next);
1424
+	if(node->u.children)
1425
+		stack_push_once(&matcher->node_stack,(struct tree_node*)node->u.children);/* cast to make compiler happy, it isn't really a tree_node* */
1426
+	if(node->op==OP_CUSTOMCLASS && node->u.children[0]) {
1427
+		free(node->u.children[0]);
1428
+		node->u.children[0]=NULL;
1429
+	}
1430
+	stack_push_once(&matcher->node_stack,node);
1431
+}
1432
+
1433
+static void destroy_tree(struct regex_matcher* matcher)
1434
+{
1435
+	/* we might have the same node linked by different nodes, so a recursive walk&free doesn't work in all situations,
1436
+	 * i.e. it might double-free, so instead of freeing, just push the nodes on a stack, and later free the nodes in that stack,
1437
+	 * (and push to stack only if it doesn't contain it already*/
1438
+	assert(matcher);
1439
+
1440
+	stack_reset(&matcher->node_stack);
1441
+	destroy_tree_internal(matcher,matcher->root_regex);
1442
+	while (matcher->node_stack.cnt) {
1443
+		struct tree_node* node = stack_pop(&matcher->node_stack);
1444
+		free(node);
1445
+	}
1446
+}
1447
+#ifndef NDEBUG
1448
+static void dump_node(struct tree_node* node)
1449
+{
1450
+	int i;
1451
+	struct tree_node* p,**children;
1452
+	assert(node);
1453
+	if(node->op==OP_LEAF) {
1454
+		if(node->u.leaf->preg)
1455
+			printf("n%p [label=\"regex\\nleaf\"]",(void*)node);
1456
+		else
1457
+			printf("n%p [label=\"%c\\nleaf\"];\n",(void*)node,node->c);
1458
+		if(node->next && !node->listend) {
1459
+			printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1460
+			dump_node(node->next);
1461
+		}
1462
+		return;
1463
+	}
1464
+	printf("n%p [label=\"%c\\n%d\\nlistend:%d\"];\n",(void*)node,(node->op==OP_ROOT||node->op==OP_PARCLOSE) ?'@' :node->c,node->op,node->listend);
1465
+	if(node->next)
1466
+		printf("n%p -> n%p;\n",(void*)node,(void*)node->next);
1467
+	printf("n%p -> {",(void*)node);/*using address of node as id*/
1468
+	children = tree_node_get_children(node);
1469
+	if(node->alternatives)
1470
+		assert(children);
1471
+	for(i=0;i<node->alternatives;i++)
1472
+		printf("n%p ",(void*)children[i]);
1473
+	if(node->alternatives && children[0]->op!=OP_LEAF)
1474
+		for(p=children[0]->next;p!=node;p=p->next)
1475
+		{
1476
+			assert(p);
1477
+			printf("n%p ",(void*)p);
1478
+			if(p->op==OP_LEAF || p->listend)
1479
+				break;
1480
+		}
1481
+	if(!node->alternatives && children && children[0])
1482
+		printf("n%p ",(void*)children[0]);
1483
+	printf("};\n");
1484
+	printf("{rank=same;");
1485
+	for(i=0;i<node->alternatives;i++)
1486
+		printf("n%p ",(void*)node->u.children[i]);
1487
+	if(node->alternatives && children[0]->op!=OP_LEAF)
1488
+		for(p=children[0]->next;p!=node;p=p->next) 
1489
+		{
1490
+			printf("n%p ",(void*)p);	
1491
+			if(p->op==OP_LEAF || p->listend)
1492
+				break;
1493
+		}
1494
+	if(!node->alternatives && children && children[0])
1495
+		printf("n%p ",(void*)children[0]);
1496
+	printf("};\n");
1497
+	for(i=0;i<node->alternatives;i++)
1498
+		dump_node(children[i]);
1499
+	if(node->alternatives && children[0]->op!=OP_LEAF)
1500
+		for(p=children[0]->next;p!=node;p=p->next)
1501
+		{
1502
+			dump_node(p);
1503
+			if(p->op==OP_LEAF || p->listend)
1504
+				break;
1505
+		}
1506
+	if(!node->alternatives && children && children[0])
1507
+		dump_node(children[0]);
1508
+}
1509
+
1510
/*
 * Debug helper: write the tree rooted at `root` to stdout as a graphviz
 * digraph (view it with dot/dotty). `root` must not be NULL.
 */
void dump_tree(struct tree_node* root)
{
	assert(root);

	fputs("digraph tree {\n",stdout);
	dump_node(root);
	fputs("}\n",stdout);
}
1518
+#endif
1519
+
1520
+#endif
0 1521
new file mode 100644
... ...
@@ -0,0 +1,53 @@
0
+/*
1
+ *  Match a string against a list of patterns/regexes.
2
+ *
3

                
4
+ *
5
+ *  This program is free software; you can redistribute it and/or modify
6
+ *  it under the terms of the GNU General Public License as published by
7
+ *  the Free Software Foundation; either version 2 of the License, or
8
+ *  (at your option) any later version.
9
+ *
10
+ *  This program is distributed in the hope that it will be useful,
11
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
+ *  GNU General Public License for more details.
14
+ *
15
+ *  You should have received a copy of the GNU General Public License
16
+ *  along with this program; if not, write to the Free Software
17
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18
+ *  MA 02110-1301, USA.
19
+ *
20
+ */
21
+
22
+#ifdef CL_EXPERIMENTAL
23
+
24
#ifndef _REGEX_LIST_H
#define _REGEX_LIST_H

/* Make the header self-contained: it uses size_t and FILE below. */
#include <stddef.h>
#include <stdio.h>

/* Only pointers to these are stored here; they are defined elsewhere. */
struct tree_node;
struct cli_matcher;

/* Stack of tree_node pointers. */
struct node_stack {
	struct tree_node** data;	/* stored pointers, entries [0, cnt) are in use */
	size_t capacity;		/* presumably the number of slots allocated in data - TODO confirm */
	size_t cnt;			/* number of pointers currently on the stack */
};

/* State of one regex/pattern list matcher. */
struct regex_matcher {
	struct cli_matcher* root_hosts;	/* presumably the matcher for host patterns - TODO confirm */
	struct cli_matcher* root_urls;	/* presumably the matcher for URL patterns - TODO confirm */
	struct tree_node* root_regex;	/* root of the regex tree */
	int list_inited;		/* NOTE(review): looks like a "list initialised" flag - confirm */
	int list_loaded;		/* NOTE(review): looks like a "list loaded" flag - confirm */
	int list_built;			/* NOTE(review): looks like a "list built" flag - confirm */
	struct node_stack node_stack;	/* scratch stack; collects the pointers to free when the tree is destroyed */
	struct node_stack node_stack_alt;	/* second scratch stack; use not visible from this header */
};

int regex_list_match(struct regex_matcher* matcher,const char* real_url,const char* display_url,int hostOnly,const char** info);
int init_regex_list(struct regex_matcher* matcher);
int load_regex_matcher(struct regex_matcher* matcher,FILE* fd,unsigned int options);
void regex_list_cleanup(struct regex_matcher* matcher);
void regex_list_done(struct regex_matcher* matcher);
int is_regex_ok(struct regex_matcher* matcher);
#endif
51
+
52
+#endif