Browse code

new method of file type detection; HTML normalisation

git-svn: trunk@648

Tomasz Kojm authored on 2004/07/03 08:00:58
Showing 36 changed files
... ...
@@ -1,3 +1,14 @@
1
+Sat Jul  3 00:37:28 CEST 2004 (tk)
2
+----------------------------------
3
+  * libclamav: matcher: add support for file type detection via signature
4
+	       scanning - it's required to detect data for which magic number
5
+	       tests are not possible (eg. HTML). Minor cleanup of signature
6
+	       parser.
7
+  * libclamav: integrate HTML normalizer from Trog
8
+  * clamd: new directive ScanHTML
9
+  * clamscan: new option --no-html
10
+  * docs: update man pages
11
+
1 12
 Thu Jul  1 03:18:04 CEST 2004 (tk)
2 13
 ----------------------------------
3 14
   * clamdscan: fix bug (introduced in -20040622) in stream scanning in TCP
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -389,14 +389,16 @@ TAGS: tags-recursive $(HEADERS) $(SOURCES) clamav-config.h.in $(TAGS_DEPENDENCIE
389 389
 		$(TAGS_FILES) $(LISP)
390 390
 	tags=; \
391 391
 	here=`pwd`; \
392
-	if (etags --etags-include --version) >/dev/null 2>&1; then \
392
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
393 393
 	  include_option=--etags-include; \
394
+	  empty_fix=.; \
394 395
 	else \
395 396
 	  include_option=--include; \
397
+	  empty_fix=; \
396 398
 	fi; \
397 399
 	list='$(SUBDIRS)'; for subdir in $$list; do \
398 400
 	  if test "$$subdir" = .; then :; else \
399
-	    test -f $$subdir/TAGS && \
401
+	    test ! -f $$subdir/TAGS || \
400 402
 	      tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
401 403
 	  fi; \
402 404
 	done; \
... ...
@@ -406,9 +408,11 @@ TAGS: tags-recursive $(HEADERS) $(SOURCES) clamav-config.h.in $(TAGS_DEPENDENCIE
406 406
 	  done | \
407 407
 	  $(AWK) '    { files[$$0] = 1; } \
408 408
 	       END { for (i in files) print i; }'`; \
409
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
410
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
411
-	     $$tags $$unique
409
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
410
+	  test -n "$$unique" || unique=$$empty_fix; \
411
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
412
+	    $$tags $$unique; \
413
+	fi
412 414
 ctags: CTAGS
413 415
 CTAGS: ctags-recursive $(HEADERS) $(SOURCES) clamav-config.h.in $(TAGS_DEPENDENCIES) \
414 416
 		$(TAGS_FILES) $(LISP)
... ...
@@ -517,7 +521,7 @@ distcheck: dist
517 517
 	*.tar.Z*) \
518 518
 	  uncompress -c $(distdir).tar.Z | $(AMTAR) xf - ;;\
519 519
 	*.shar.gz*) \
520
-	  GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | unshar ;;\
520
+	  GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\
521 521
 	*.zip*) \
522 522
 	  unzip $(distdir).zip ;;\
523 523
 	esac
... ...
@@ -1,4 +1,4 @@
1
-# generated automatically by aclocal 1.8.3 -*- Autoconf -*-
1
+# generated automatically by aclocal 1.8.5 -*- Autoconf -*-
2 2
 
3 3
 # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
4 4
 # Free Software Foundation, Inc.
... ...
@@ -40,7 +40,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.8"])
40 40
 # Call AM_AUTOMAKE_VERSION so it can be traced.
41 41
 # This function is AC_REQUIREd by AC_INIT_AUTOMAKE.
42 42
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
43
-	 [AM_AUTOMAKE_VERSION([1.8.3])])
43
+	 [AM_AUTOMAKE_VERSION([1.8.5])])
44 44
 
45 45
 # AM_AUX_DIR_EXPAND
46 46
 
... ...
@@ -266,9 +266,14 @@ AC_CACHE_CHECK([dependency style of $depcc],
266 266
        grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
267 267
        ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
268 268
       # icc doesn't choke on unknown options, it will just issue warnings
269
-      # (even with -Werror).  So we grep stderr for any message
270
-      # that says an option was ignored.
271
-      if grep 'ignoring option' conftest.err >/dev/null 2>&1; then :; else
269
+      # or remarks (even with -Werror).  So we grep stderr for any message
270
+      # that says an option was ignored or not supported.
271
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
272
+      #   icc: Command line warning: ignoring option '-M'; no argument required
273
+      # The diagnosis changed in icc 8.0:
274
+      #   icc: Command line remark: option '-MP' not supported
275
+      if (grep 'ignoring option' conftest.err ||
276
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
272 277
         am_cv_$1_dependencies_compiler_type=$depmode
273 278
         break
274 279
       fi
... ...
@@ -46,7 +46,7 @@ while test $# -gt 0; do
46 46
 	;;
47 47
 
48 48
     --version)
49
-	echo devel-20040627
49
+	echo devel-20040702
50 50
 	exit 0
51 51
 	;;
52 52
 
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -471,9 +471,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
471 471
 	  done | \
472 472
 	  $(AWK) '    { files[$$0] = 1; } \
473 473
 	       END { for (i in files) print i; }'`; \
474
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
475
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
476
-	     $$tags $$unique
474
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
475
+	  test -n "$$unique" || unique=$$empty_fix; \
476
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
477
+	    $$tags $$unique; \
478
+	fi
477 479
 ctags: CTAGS
478 480
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
479 481
 		$(TAGS_FILES) $(LISP)
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -503,9 +503,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
503 503
 	  done | \
504 504
 	  $(AWK) '    { files[$$0] = 1; } \
505 505
 	       END { for (i in files) print i; }'`; \
506
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
507
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
508
-	     $$tags $$unique
506
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
507
+	  test -n "$$unique" || unique=$$empty_fix; \
508
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
509
+	    $$tags $$unique; \
510
+	fi
509 511
 ctags: CTAGS
510 512
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
511 513
 		$(TAGS_FILES) $(LISP)
... ...
@@ -325,6 +325,13 @@ int acceptloop_th(int socketd, struct cl_node *root, const struct cfgstruct *cop
325 325
 	logg("OLE2 support disabled.\n");
326 326
     }
327 327
 
328
+    if(cfgopt(copt, "ScanHTML")) {
329
+	logg("HTML support enabled.\n");
330
+	options |= CL_HTML;
331
+    } else {
332
+	logg("HTML support disabled.\n");
333
+    }
334
+
328 335
     if((cpt = cfgopt(copt, "SelfCheck"))) {
329 336
 	selfchk = cpt->numarg;
330 337
     } else {
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -425,9 +425,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
425 425
 	  done | \
426 426
 	  $(AWK) '    { files[$$0] = 1; } \
427 427
 	       END { for (i in files) print i; }'`; \
428
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
429
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
430
-	     $$tags $$unique
428
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
429
+	  test -n "$$unique" || unique=$$empty_fix; \
430
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
431
+	    $$tags $$unique; \
432
+	fi
431 433
 ctags: CTAGS
432 434
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
433 435
 		$(TAGS_FILES) $(LISP)
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -458,9 +458,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
458 458
 	  done | \
459 459
 	  $(AWK) '    { files[$$0] = 1; } \
460 460
 	       END { for (i in files) print i; }'`; \
461
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
462
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
463
-	     $$tags $$unique
461
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
462
+	  test -n "$$unique" || unique=$$empty_fix; \
463
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
464
+	    $$tags $$unique; \
465
+	fi
464 466
 ctags: CTAGS
465 467
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
466 468
 		$(TAGS_FILES) $(LISP)
... ...
@@ -220,6 +220,7 @@ void help(void)
220 220
     mprintf("    --mbox                -m             Treat stdin as a mailbox\n");
221 221
     mprintf("\n");
222 222
     mprintf("    --no-ole2                            Disable OLE2 support\n");
223
+    mprintf("    --no-html                            Disable HTML support\n");
223 224
     mprintf("    --no-archive                         Disable libclamav archive support\n");
224 225
     mprintf("    --block-encrypted                    Block encrypted archives.\n");
225 226
     mprintf("    --max-space=#n                       Extract first #n kilobytes only\n");
... ...
@@ -427,6 +427,11 @@ int scanfile(const char *filename, struct cl_node *root, const struct passwd *us
427 427
     else
428 428
 	options |= CL_OLE2;
429 429
 
430
+    if(optl(opt, "no-html"))
431
+	options &= ~CL_HTML;
432
+    else
433
+	options |= CL_HTML;
434
+
430 435
     if(optc(opt, 'm'))
431 436
 	options |= CL_MAIL;
432 437
 
... ...
@@ -82,6 +82,7 @@ int main(int argc, char **argv)
82 82
 	    {"no-archive", 0, 0, 0},
83 83
 	    {"block-encrypted", 0, 0, 0},
84 84
 	    {"no-ole2", 0, 0, 0},
85
+	    {"no-html", 0, 0, 0},
85 86
 	    {"mbox", 0, 0, 'm'},
86 87
 	    {"stdout", 0, 0, 0},
87 88
 	    {"unzip", 2, 0, 0},
... ...
@@ -3290,9 +3290,14 @@ else
3290 3290
        grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
3291 3291
        ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
3292 3292
       # icc doesn't choke on unknown options, it will just issue warnings
3293
-      # (even with -Werror).  So we grep stderr for any message
3294
-      # that says an option was ignored.
3295
-      if grep 'ignoring option' conftest.err >/dev/null 2>&1; then :; else
3293
+      # or remarks (even with -Werror).  So we grep stderr for any message
3294
+      # that says an option was ignored or not supported.
3295
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
3296
+      #   icc: Command line warning: ignoring option '-M'; no argument required
3297
+      # The diagnosis changed in icc 8.0:
3298
+      #   icc: Command line remark: option '-MP' not supported
3299
+      if (grep 'ignoring option' conftest.err ||
3300
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
3296 3301
         am_cv_CC_dependencies_compiler_type=$depmode
3297 3302
         break
3298 3303
       fi
... ...
@@ -5009,7 +5014,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
5009 5009
 case $host in
5010 5010
 *-*-irix6*)
5011 5011
   # Find out which ABI we are using.
5012
-  echo '#line 5012 "configure"' > conftest.$ac_ext
5012
+  echo '#line 5017 "configure"' > conftest.$ac_ext
5013 5013
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
5014 5014
   (eval $ac_compile) 2>&5
5015 5015
   ac_status=$?
... ...
@@ -5582,7 +5587,7 @@ chmod -w .
5582 5582
 save_CFLAGS="$CFLAGS"
5583 5583
 CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
5584 5584
 compiler_c_o=no
5585
-if { (eval echo configure:5585: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
5585
+if { (eval echo configure:5590: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
5586 5586
   # The compiler can only warn and ignore the option if not recognized
5587 5587
   # So say no if there are warnings
5588 5588
   if test -s out/conftest.err; then
... ...
@@ -7509,7 +7514,7 @@ else
7509 7509
     lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
7510 7510
   lt_status=$lt_dlunknown
7511 7511
   cat > conftest.$ac_ext <<EOF
7512
-#line 7512 "configure"
7512
+#line 7517 "configure"
7513 7513
 #include "confdefs.h"
7514 7514
 
7515 7515
 #if HAVE_DLFCN_H
... ...
@@ -7607,7 +7612,7 @@ else
7607 7607
     lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
7608 7608
   lt_status=$lt_dlunknown
7609 7609
   cat > conftest.$ac_ext <<EOF
7610
-#line 7610 "configure"
7610
+#line 7615 "configure"
7611 7611
 #include "confdefs.h"
7612 7612
 
7613 7613
 #if HAVE_DLFCN_H
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -164,6 +164,11 @@ Enables scanning of Microsoft Office document macros.
164 164
 .br 
165 165
 Default: enabled.
166 166
 .TP 
167
+\fBScanHTML\fR
168
+Enables HTML detection and normalisation.
169
+.br 
170
+Default: enabled.
171
+.TP 
167 172
 \fBScanMail\fR
168 173
 Enable scanning of Mbox, Maildir and raw mail files.
169 174
 .br 
... ...
@@ -72,6 +72,9 @@ EXTRACTION OPTIONS:
72 72
 \fB\-\-no\-ole2\fR
73 73
 Disable support for Microsoft Office document files.
74 74
 .TP 
75
+\fB\-\-no\-html\fR
76
+Disable support for HTML detection and normalisation.
77
+.TP 
75 78
 \fB\-\-no\-archive\fR
76 79
 Disable archive support built in libclamav.
77 80
 .TP 
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -144,10 +144,18 @@ ScanOLE2
144 144
 ## Mail support
145 145
 ##
146 146
 
147
-# Uncomment this option if you are planning to scan mail files.
147
+# Uncomment this option if you are going to scan mail files.
148 148
 #ScanMail
149 149
 
150 150
 ##
151
+## HTML support
152
+##
153
+
154
+# This option enables HTML detection and normalisation. It's highly
155
+# recommended and required to detect popular exploits.
156
+ScanHTML
157
+
158
+##
151 159
 ## Archive support
152 160
 ##
153 161
 
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -454,9 +454,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
454 454
 	  done | \
455 455
 	  $(AWK) '    { files[$$0] = 1; } \
456 456
 	       END { for (i in files) print i; }'`; \
457
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
458
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
459
-	     $$tags $$unique
457
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
458
+	  test -n "$$unique" || unique=$$empty_fix; \
459
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
460
+	    $$tags $$unique; \
461
+	fi
460 462
 ctags: CTAGS
461 463
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
462 464
 		$(TAGS_FILES) $(LISP)
... ...
@@ -41,7 +41,8 @@ libclamav_la_SOURCES = \
41 41
 	str.h \
42 42
 	defaults.h \
43 43
 	scanners.c \
44
-	scanners.h \
44
+	filetypes.c \
45
+	filetypes.h \
45 46
 	unrarlib.c \
46 47
 	unrarlib.h \
47 48
 	zziplib/zzip-conf.h \
... ...
@@ -95,6 +96,8 @@ libclamav_la_SOURCES = \
95 95
 	mspack/system.c \
96 96
 	mspack/system.h \
97 97
 	upx.c \
98
-	upx.h
98
+	upx.h \
99
+	htmlnorm.c \
100
+	htmlnorm.h
99 101
 
100 102
 lib_LTLIBRARIES = libclamav.la
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -73,17 +73,19 @@ libLTLIBRARIES_INSTALL = $(INSTALL)
73 73
 LTLIBRARIES = $(lib_LTLIBRARIES)
74 74
 libclamav_la_DEPENDENCIES =
75 75
 am_libclamav_la_OBJECTS = matcher.lo md5.lo others.lo readdb.lo cvd.lo \
76
-	dsig.lo str.lo scanners.lo unrarlib.lo zzip-dir.lo zzip-err.lo \
77
-	zzip-file.lo zzip-info.lo zzip-io.lo zzip-stat.lo zzip-zip.lo \
78
-	strc.lo blob.lo mbox.lo message.lo snprintf.lo strrcpy.lo \
79
-	table.lo text.lo ole2_extract.lo vba_extract.lo msexpand.lo \
80
-	pe.lo cabd.lo lzxd.lo mszipd.lo qtmd.lo system.lo upx.lo
76
+	dsig.lo str.lo scanners.lo filetypes.lo unrarlib.lo \
77
+	zzip-dir.lo zzip-err.lo zzip-file.lo zzip-info.lo zzip-io.lo \
78
+	zzip-stat.lo zzip-zip.lo strc.lo blob.lo mbox.lo message.lo \
79
+	snprintf.lo strrcpy.lo table.lo text.lo ole2_extract.lo \
80
+	vba_extract.lo msexpand.lo pe.lo cabd.lo lzxd.lo mszipd.lo \
81
+	qtmd.lo system.lo upx.lo htmlnorm.lo
81 82
 libclamav_la_OBJECTS = $(am_libclamav_la_OBJECTS)
82 83
 DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
83 84
 depcomp = $(SHELL) $(top_srcdir)/depcomp
84 85
 am__depfiles_maybe = depfiles
85 86
 @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/blob.Plo ./$(DEPDIR)/cabd.Plo \
86 87
 @AMDEP_TRUE@	./$(DEPDIR)/cvd.Plo ./$(DEPDIR)/dsig.Plo \
88
+@AMDEP_TRUE@	./$(DEPDIR)/filetypes.Plo ./$(DEPDIR)/htmlnorm.Plo \
87 89
 @AMDEP_TRUE@	./$(DEPDIR)/lzxd.Plo ./$(DEPDIR)/matcher.Plo \
88 90
 @AMDEP_TRUE@	./$(DEPDIR)/mbox.Plo ./$(DEPDIR)/md5.Plo \
89 91
 @AMDEP_TRUE@	./$(DEPDIR)/message.Plo ./$(DEPDIR)/msexpand.Plo \
... ...
@@ -244,7 +246,8 @@ libclamav_la_SOURCES = \
244 244
 	str.h \
245 245
 	defaults.h \
246 246
 	scanners.c \
247
-	scanners.h \
247
+	filetypes.c \
248
+	filetypes.h \
248 249
 	unrarlib.c \
249 250
 	unrarlib.h \
250 251
 	zziplib/zzip-conf.h \
... ...
@@ -298,7 +301,9 @@ libclamav_la_SOURCES = \
298 298
 	mspack/system.c \
299 299
 	mspack/system.h \
300 300
 	upx.c \
301
-	upx.h
301
+	upx.h \
302
+	htmlnorm.c \
303
+	htmlnorm.h
302 304
 
303 305
 lib_LTLIBRARIES = libclamav.la
304 306
 all: all-am
... ...
@@ -357,7 +362,7 @@ clean-libLTLIBRARIES:
357 357
 	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
358 358
 	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
359 359
 	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
360
-	  test "$$dir" = "$$p" && dir=.; \
360
+	  test "$$dir" != "$$p" || dir=.; \
361 361
 	  echo "rm -f \"$${dir}/so_locations\""; \
362 362
 	  rm -f "$${dir}/so_locations"; \
363 363
 	done
... ...
@@ -374,6 +379,8 @@ distclean-compile:
374 374
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cabd.Plo@am__quote@
375 375
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cvd.Plo@am__quote@
376 376
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dsig.Plo@am__quote@
377
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/filetypes.Plo@am__quote@
378
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htmlnorm.Plo@am__quote@
377 379
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lzxd.Plo@am__quote@
378 380
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matcher.Plo@am__quote@
379 381
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mbox.Plo@am__quote@
... ...
@@ -788,9 +795,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
788 788
 	  done | \
789 789
 	  $(AWK) '    { files[$$0] = 1; } \
790 790
 	       END { for (i in files) print i; }'`; \
791
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
792
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
793
-	     $$tags $$unique
791
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
792
+	  test -n "$$unique" || unique=$$empty_fix; \
793
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
794
+	    $$tags $$unique; \
795
+	fi
794 796
 ctags: CTAGS
795 797
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
796 798
 		$(TAGS_FILES) $(LISP)
... ...
@@ -73,12 +73,13 @@ extern "C"
73 73
 #define CL_DISABLERAR	4
74 74
 #define CL_OLE2		8
75 75
 #define CL_ENCRYPTED    16
76
+#define CL_HTML		32
76 77
 
77 78
 struct cli_patt {
78 79
     short int *pattern;
79 80
     unsigned int length;
80 81
     char *virname;
81
-    unsigned short int sigid, parts, partno;
82
+    unsigned short int sigid, parts, partno, type;
82 83
     struct cli_patt *next;
83 84
 };
84 85
 
85 86
new file mode 100644
... ...
@@ -0,0 +1,149 @@
0
+/*
1
+ *  Copyright (C) 2002 - 2004 Tomasz Kojm <tkojm@clamav.net>
2
+ *  With enhancements from Thomas Lamy <Thomas.Lamy@in-online.net>
3
+ *
4
+ *  This program is free software; you can redistribute it and/or modify
5
+ *  it under the terms of the GNU General Public License as published by
6
+ *  the Free Software Foundation; either version 2 of the License, or
7
+ *  (at your option) any later version.
8
+ *
9
+ *  This program is distributed in the hope that it will be useful,
10
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
+ *  GNU General Public License for more details.
13
+ *
14
+ *  You should have received a copy of the GNU General Public License
15
+ *  along with this program; if not, write to the Free Software
16
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ */
18
+
19
+#if HAVE_CONFIG_H
20
+#include "clamav-config.h"
21
+#endif
22
+
23
+#include <stdio.h>
24
+#include <string.h>
25
+#include <stdlib.h>
26
+
27
+#include "clamav.h"
28
+#include "filetypes.h"
29
+
30
+struct cli_magic_s {
31
+    int offset;
32
+    const char *magic;
33
+    size_t length;
34
+    const char *descr;
35
+    cli_file_t type;
36
+};
37
+
38
+struct cli_smagic_s {
39
+    const char *sig;
40
+    const char *descr;
41
+    cli_file_t type;
42
+};
43
+
44
+static const struct cli_magic_s cli_magic[] = {
45
+
46
+    /* Executables */
47
+
48
+/*  {0,  "MZ",				2,  "DOS/W32 executable", CL_DOSEXE},*/
49
+
50
+    /* Archives */
51
+
52
+    {0,  "Rar!",			4,  "RAR",		  CL_RARFILE},
53
+    {0,  "PK\003\004",			4,  "ZIP",		  CL_ZIPFILE},
54
+    {0,  "\037\213",			2,  "GZip",		  CL_GZFILE},
55
+    {0,  "BZh",				3,  "BZip",		  CL_BZFILE},
56
+    {0,  "SZDD",			4,  "compress.exe'd",	  CL_MSCFILE},
57
+    {0,  "MSCF",			4,  "MS CAB",		  CL_MSCABFILE},
58
+
59
+    /* Mail */
60
+
61
+    {0,  "From ",			 5, "MBox",		  CL_MAILFILE},
62
+    {0,  "Received",			 8, "Raw mail",		  CL_MAILFILE},
63
+    {0,  "Return-Path: ",		13, "Maildir",		  CL_MAILFILE},
64
+    {0,  "Return-path: ",		13, "Maildir",		  CL_MAILFILE},
65
+    {0,  "Delivered-To: ",		14, "Mail",		  CL_MAILFILE},
66
+    {0,  "X-UIDL: ",			 8, "Mail",		  CL_MAILFILE},
67
+    {0,  "X-Apparently-To: ",		17, "Mail",		  CL_MAILFILE},
68
+    {0,  "X-Envelope-From: ",		17, "Mail",		  CL_MAILFILE},
69
+    {0,  "X-Original-To: ",		15, "Mail",		  CL_MAILFILE},
70
+    {0,  "X-Symantec-",			11, "Symantec",		  CL_MAILFILE},
71
+    {0,  "X-EVS",			 5, "EVS mail",		  CL_MAILFILE},
72
+    {0,  "X-Real-To: ",                 11, "Mail",               CL_MAILFILE},
73
+    {0,  ">From ",			 6, "Mail",		  CL_MAILFILE},
74
+    {0,  "Date: ",			 6, "Mail",		  CL_MAILFILE},
75
+    {0,  "Message-Id: ",		12, "Mail",		  CL_MAILFILE},
76
+    {0,  "Message-ID: ",		12, "Mail",		  CL_MAILFILE},
77
+    {0,  "Envelope-to: ",		13, "Mail",		  CL_MAILFILE},
78
+    {0,  "Delivery-date: ",		15, "Mail",		  CL_MAILFILE},
79
+    {0,  "To: ",			 4, "Mail",		  CL_MAILFILE},
80
+    {0,  "Subject: ",			 9, "Mail",		  CL_MAILFILE},
81
+    {0,  "For: ",			 5, "Eserv mail",	  CL_MAILFILE},
82
+    {0,  "From: ",			 6, "Exim mail",	  CL_MAILFILE},
83
+    {0,  "v:\015\012Received: ",	14, "VPOP3 Mail (DOS)",	  CL_MAILFILE},
84
+    {0,  "v:\012Received: ",		13, "VPOP3 Mail (UNIX)",  CL_MAILFILE},
85
+    {0,  "Hi. This is the qmail-send",  26, "Qmail bounce",	  CL_MAILFILE},
86
+
87
+    /* Others */
88
+
89
+    {0,  "\320\317\021\340\241\261\032\341",
90
+	                    8, "OLE2 container",  CL_OLE2FILE},
91
+
92
+    /* Ignored types */
93
+
94
+    {0,  "\000\000\001\263",             4, "MPEG video stream",  CL_DATAFILE},
95
+    {0,  "\000\000\001\272",             4, "MPEG sys stream",    CL_DATAFILE},
96
+    {0,  "RIFF",                         4, "RIFF",		  CL_DATAFILE},
97
+    {0,  "GIF",				 3, "GIF",		  CL_DATAFILE},
98
+    {0,  "\x89PNG",			 4, "PNG",                CL_DATAFILE},
99
+    {0,  "\377\330\377",		 4, "JPEG",               CL_DATAFILE},
100
+    {0,  "BM",				 2, "BMP",                CL_DATAFILE},
101
+    {0,  "OggS",                         4, "Ogg Stream",         CL_DATAFILE},
102
+    {0,  "ID3",				 3, "MP3",		  CL_DATAFILE},
103
+    {0,  "\377\373\220",		 3, "MP3",		  CL_DATAFILE},
104
+    {0,  "\%PDF-",			 5, "PDF document",	  CL_DATAFILE},
105
+    {0,  "\%!PS-Adobe-",		11, "PostScript",	  CL_DATAFILE},
106
+    {0,  "\060\046\262\165\216\146\317", 7, "WMA/WMV/ASF",	  CL_DATAFILE},
107
+    {0,  ".RMF" ,			 4, "Real Media File",	  CL_DATAFILE},
108
+
109
+    {-1, NULL,				 0, NULL,              CL_UNKNOWN_TYPE}
110
+};
111
+
112
+static const struct cli_smagic_s cli_smagic[] = {
113
+
114
+    /* <html>*<body> */
115
+    {"3c68746d6c3e*3c626f64793e",    "HTML data", CL_HTMLFILE},
116
+
117
+    {NULL,  NULL,   CL_UNKNOWN_TYPE}
118
+};
119
+
120
+cli_file_t cli_filetype(const char *buf, size_t buflen)
121
+{
122
+	int i;
123
+
124
+    for(i = 0; cli_magic[i].magic; i++) {
125
+	if(buflen >= cli_magic[i].offset+cli_magic[i].length) {
126
+	    if(memcmp(buf+cli_magic[i].offset, cli_magic[i].magic, cli_magic[i].length) == 0) {
127
+		cli_dbgmsg("Recognized %s file\n", cli_magic[i].descr);
128
+		return cli_magic[i].type;
129
+	    }
130
+	}
131
+    }
132
+
133
+    return CL_UNKNOWN_TYPE;
134
+}
135
+
136
+int cli_addtypesigs(struct cl_node *root)
137
+{
138
+	int i, ret;
139
+
140
+    for(i = 0; cli_smagic[i].sig; i++) {
141
+	if((ret = cli_parse_add(root, cli_smagic[i].descr, cli_smagic[i].sig, cli_smagic[i].type))) {
142
+	    cli_errmsg("cli_addtypesigs(): Problem adding signature for %s\n", cli_smagic[i].descr);
143
+	    return ret;
144
+	}
145
+    }
146
+
147
+    return 0;
148
+}
0 149
new file mode 100644
... ...
@@ -0,0 +1,47 @@
0
+/*
1
+ *  Copyright (C) 2002 - 2004 Tomasz Kojm <tkojm@clamav.net>
2
+ *  With enhancements from Thomas Lamy <Thomas.Lamy@in-online.net>
3
+ *
4
+ *  This program is free software; you can redistribute it and/or modify
5
+ *  it under the terms of the GNU General Public License as published by
6
+ *  the Free Software Foundation; either version 2 of the License, or
7
+ *  (at your option) any later version.
8
+ *
9
+ *  This program is distributed in the hope that it will be useful,
10
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
+ *  GNU General Public License for more details.
13
+ *
14
+ *  You should have received a copy of the GNU General Public License
15
+ *  along with this program; if not, write to the Free Software
16
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ */
18
+
19
+#ifndef __FILETYPES_H
20
+#define __FILETYPES_H
21
+
22
/* NOTE(review): presumably the number of leading bytes callers pass to
 * cli_filetype() - confirm against the call sites. */
#define MAGIC_BUFFER_SIZE 26

/* Numeric value of the first cli_file_t enumerator. */
#define CL_TYPENO 500

/*
 * File types reported by the type detection code. Enumerators are numbered
 * consecutively starting at CL_TYPENO.
 */
typedef enum {
    CL_UNKNOWN_TYPE = CL_TYPENO,
    CL_DOSEXE,
    CL_DATAFILE,
    CL_MAILFILE,
    CL_GZFILE,
    CL_ZIPFILE,
    CL_BZFILE,
    CL_RARFILE,
    CL_MSCFILE,
    CL_OLE2FILE,
    CL_MSCABFILE,

    /* types recognized on-the-fly: a larger value means a higher priority */
    CL_HTMLFILE

} cli_file_t;
42
+
43
+cli_file_t cli_filetype(const char *buf, size_t buflen);
44
+int cli_addtypesigs(struct cl_node *root);
45
+
46
+#endif
0 47
new file mode 100644
... ...
@@ -0,0 +1,293 @@
0
+/*
1
+ *  Copyright (C) 2004 Trog <trog@clamav.net>
2
+ *
3
+ *  This program is free software; you can redistribute it and/or modify
4
+ *  it under the terms of the GNU General Public License as published by
5
+ *  the Free Software Foundation; either version 2 of the License, or
6
+ *  (at your option) any later version.
7
+ *
8
+ *  This program is distributed in the hope that it will be useful,
9
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+ *  GNU General Public License for more details.
12
+ *
13
+ *  You should have received a copy of the GNU General Public License
14
+ *  along with this program; if not, write to the Free Software
15
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16
+ */
17
+
18
+#if HAVE_CONFIG_H
19
+#include "clamav-config.h"
20
+#endif
21
+
22
+#include <stdio.h>
23
+#include <unistd.h>
24
+#include <sys/types.h>
25
+#include <sys/stat.h>
26
+#include <fcntl.h>
27
+
28
+#include "others.h"
29
+
30
+#define FALSE (0)
31
+#define TRUE (1)
32
+
33
+/* Normalize an HTML buffer using the following rules:
34
+	o Remove multiple contiguous spaces
35
+	o Remove spaces around '<' and '>' in tags
36
+	o Remove spaces around '=' in tags
37
+	o Replace single quote with double quote in tags
38
+	o Convert to lowercase
39
+	o Convert all white space to a space character
40
+*/
41
+
42
+unsigned char *html_normalize(unsigned char *in_buff, off_t in_size)
43
+{
44
+	unsigned char *out_buff;
45
+	off_t out_size=0, i;
46
+	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
47
+
48
+	out_buff = (unsigned char *) cli_malloc(in_size+1);
49
+	if (!out_buff) {
50
+		cli_dbgmsg("html_normalize(): malloc failed\n");
51
+		return NULL;
52
+	}
53
+
54
+	for (i=0 ; i < in_size ; i++) {
55
+		if (in_buff[i] == '<') {
56
+			out_buff[out_size++] = '<';
57
+			tag_depth++;
58
+			if (tag_depth == 1) {
59
+				had_space=TRUE; /* consume spaces */
60
+			}
61
+		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
62
+			/* Remove preceeding spaces */
63
+			while ((out_size > 0) &&
64
+				(out_buff[out_size-1] == ' ')) {
65
+				out_size--;
66
+			}
67
+			out_buff[out_size++] = '=';
68
+			had_space=TRUE;
69
+		} else if (isspace(in_buff[i])) {
70
+			if (!had_space) {
71
+				out_buff[out_size++] = ' ';
72
+				had_space=TRUE;
73
+			}
74
+		} else if (in_buff[i] == '>') {
75
+			/* Remove preceeding spaces */
76
+			if (tag_depth == 1) {
77
+				while ((out_size > 0) &&
78
+					(out_buff[out_size-1] == ' ')) {
79
+					out_size--;
80
+				}
81
+			}
82
+			out_buff[out_size++] = '>';
83
+			tag_depth--;	
84
+		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
85
+			/* Convert single quotes to double quotes */
86
+			if (in_quote || out_buff[out_size-1] == '=') {
87
+				out_buff[out_size++] = '\"';
88
+				in_quote = !in_quote;
89
+			} else {
90
+				out_buff[out_size++] = '\'';
91
+			}
92
+		} else {
93
+			out_buff[out_size++] = tolower(in_buff[i]);
94
+			had_space=FALSE;
95
+		}
96
+	}
97
+	out_buff[out_size] = '\0';
98
+	return out_buff;
99
+}
100
+
101
/* Strip HTML comments from a NUL-terminated buffer.
 *
 * Anything from "<!" up to and including the next '>' is dropped; an
 * unterminated "<!" discards the rest of the input. All other bytes are
 * copied unchanged.
 *
 * Returns a new NUL-terminated buffer from cli_malloc(), or NULL when
 * line is NULL or the allocation fails.
 */
unsigned char *remove_html_comments(unsigned char *line)
{
	unsigned char *result, *dst;
	const unsigned char *src = line;

	if (src == NULL) {
		return NULL;
	}

	result = (unsigned char *) cli_malloc(strlen((const char *) line) + 1);
	if (result == NULL) {
		return NULL;
	}
	dst = result;

	while (*src != '\0') {
		if ((src[0] == '<') && (src[1] == '!')) {
			/* Inside a comment: skip forward to just past '>' */
			src++;
			while ((*src != '\0') && (*src != '>')) {
				src++;
			}
			if (*src == '>') {
				src++;
			}
			continue;
		}
		/* Ordinary byte, including a '<' that does not open a comment */
		*dst++ = *src++;
	}

	*dst = '\0';
	return result;
}
154
+
155
/* Decode an HTML numeric character reference into its character value.
 *
 * cref points just past the "&#" introducer. Accepted forms are decimal
 * digits, or 'x'/'X' followed by hexadecimal digits, each optionally
 * terminated by ';'. At least two bytes must remain at cref (as in the
 * original implementation).
 *
 * On success the value, truncated to one byte, is stored in *dest and the
 * number of bytes consumed from cref (prefix, digits and ';') is returned.
 * Returns 0, leaving *dest untouched, when no digit is present.
 */
unsigned int decode_html_char_ref(unsigned char *cref,
                                    unsigned char *dest)
{

	unsigned int hex=0, value=0, count=0, digits=0;
	unsigned int c;

	if (!cref[0] || !cref[1]) {
		return 0;
	}

	if (((*cref == 'x') || (*cref == 'X')) && isxdigit(cref[1])) {
		hex=1;
		cref++;
		count++;
	}

	while (isdigit(*cref) || (hex && isxdigit(*cref))) {
		c = *cref;
		if (hex) {
			value *= 16;
			/* letters need their own offset: 'a' - '0' is not 10 */
			if (c >= 'a') {
				value += c - 'a' + 10;
			} else if (c >= 'A') {
				value += c - 'A' + 10;
			} else {
				value += c - '0';
			}
		} else {
			value *= 10;
			value += c - '0';
		}
		cref++;
		count++;
		digits++;
	}

	/* A reference without digits (e.g. "&#;") is not a character
	 * reference; reporting it as decoded would emit a NUL byte.
	 */
	if (!digits) {
		return 0;
	}

	if (*cref == ';') {
		count++;
	}

	*dest = value;

	return count;
}
191
+
192
/* Replace HTML numeric character references ("&#NNN;", "&#xHH;") in a
 * NUL-terminated buffer with the byte they encode.
 *
 * Named entities (&amp; etc.) are not decoded and are copied through
 * unchanged. The output is never longer than the input.
 *
 * Returns a new NUL-terminated buffer from cli_malloc(), or NULL when
 * line is NULL or the allocation fails.
 *
 * NOTE(review): a reference that decodes to 0 (e.g. "&#0;") stores a NUL
 * byte in the output, which truncates it for strlen()-based consumers —
 * confirm whether that is acceptable.
 */
unsigned char *remove_html_char_ref(unsigned char *line)
{
	unsigned char *newline, *newcurrent;
	unsigned char *linepos;
	/* same width as decode_html_char_ref()'s return value, so long
	 * references are not truncated modulo 256
	 */
	unsigned int count;

	if (!line) {
		return NULL;
	}

	newcurrent = newline = (unsigned char *) cli_malloc(strlen((const char *) line) + 1);
	if (!newline) {
		return NULL;
	}

	/* The loop is left only through the return taken when no further
	 * '&' exists in the remaining input.
	 */
	for (;;) {
		linepos = (unsigned char *) strchr((const char *) line, '&');
		if (!linepos) {
			/* copies the tail together with its terminating NUL */
			strcpy((char *) newcurrent, (const char *) line);
			return newline;
		}
		strncpy((char *) newcurrent, (const char *) line, linepos-line);
		newcurrent += linepos-line;

		if (!linepos[1] || !linepos[2]) {
			/* too short to be a reference: keep the '&' literally */
			*newcurrent = '&';
			newcurrent++;
			line = linepos+1;
			continue;
		}
		switch (linepos[1]) {
		case '#':
			count = decode_html_char_ref(linepos+2,
					newcurrent);
			if (count > 0) {
				newcurrent++;
				linepos += count+2;	/* skip "&#" plus the reference */
			} else {
				*newcurrent = '&';
				newcurrent++;
				linepos++;
			}
			break;
		/* TODO: character entities, &amp; etc. */
		default:
			*newcurrent = '&';
			newcurrent++;
			linepos++;
		}
		line = linepos;
	}
}
245
+
246
/* Convert one ASCII hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f') to its
 * numeric value 0..15. Returns -1 for any other character.
 */
int char2hex(unsigned char c)
{
	if ((c >= '0') && (c <= '9')) {
		return (c-'0');
	} else if ((c >= 'A') && (c <= 'F')) {
		return (c-'A'+10);
	} else if ((c >= 'a') && (c <= 'f')) {
		return (c-'a'+10);
	}
	return -1;
}
255
+
256
+char *quoted_decode(unsigned char *line, off_t in_size)
257
+{
258
+	unsigned char *newline, *newcurrent, *line_end;
259
+	
260
+	newcurrent = newline = (unsigned char *) cli_malloc(in_size + 1);
261
+	if (!newline) {
262
+		return NULL;
263
+	}
264
+	
265
+	line_end = line+in_size;
266
+	while (line <= line_end) {
267
+		while ((line < line_end) && *line != '=') {
268
+			*newcurrent = *line;
269
+			line++;
270
+			newcurrent++;
271
+		}
272
+		if ((line < line_end) && isspace(line[1])) {
273
+			line++;
274
+			while ((line < line_end) && isspace(*line)) {
275
+				line++;
276
+			}
277
+			continue;
278
+		}
279
+		if ((line+2) <= line_end) {
280
+			if (isxdigit(line[1]) && isxdigit(line[2])) {
281
+				*newcurrent = 	(char2hex(line[1]) * 16) +
282
+						char2hex(line[2]);
283
+				newcurrent++;
284
+				line += 3;
285
+				continue;
286
+			}
287
+		}
288
+		line++;	
289
+	}
290
+	*newcurrent = '\0';
291
+	return newline;
292
+}
0 293
new file mode 100644
... ...
@@ -0,0 +1,29 @@
0
+/*
1
+ *  Copyright (C) 2004 Trog <trog@clamav.net>
2
+ *
3
+ *  This program is free software; you can redistribute it and/or modify
4
+ *  it under the terms of the GNU General Public License as published by
5
+ *  the Free Software Foundation; either version 2 of the License, or
6
+ *  (at your option) any later version.
7
+ *
8
+ *  This program is distributed in the hope that it will be useful,
9
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+ *  GNU General Public License for more details.
12
+ *
13
+ *  You should have received a copy of the GNU General Public License
14
+ *  along with this program; if not, write to the Free Software
15
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16
+ */
17
+
18
+#ifndef __HTMLNORM_H
19
+#define __HTMLNORM_H
20
+
21
+#include <sys/types.h>
22
+
23
+unsigned char *html_normalize(unsigned char *in_buff, off_t in_size);
24
+unsigned char *remove_html_comments(unsigned char *line);
25
+unsigned char *remove_html_char_ref(unsigned char *line);
26
+char *quoted_decode(unsigned char *line, off_t in_size);
27
+
28
+#endif
... ...
@@ -35,6 +35,7 @@
35 35
 #include "matcher.h"
36 36
 #include "unrarlib.h"
37 37
 #include "defaults.h"
38
+#include "filetypes.h"
38 39
 
39 40
 int cli_addpatt(struct cl_node *root, struct cli_patt *pattern)
40 41
 {
... ...
@@ -161,6 +162,14 @@ static int cli_maketrans(struct cl_node *root)
161 161
 
162 162
 int cl_buildtrie(struct cl_node *root)
163 163
 {
164
+	int ret;
165
+
166
+    if(!root)
167
+	return CL_EMALFDB;
168
+
169
+    if((ret = cli_addtypesigs(root)))
170
+	return ret;
171
+
164 172
     return cli_maketrans(root);
165 173
 }
166 174
 
... ...
@@ -219,11 +228,11 @@ int inline cli_findpos(const char *buffer, int offset, int length, const struct
219 219
     return 1;
220 220
 }
221 221
 
222
-int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root, int *partcnt)
222
+int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root, int *partcnt, int typerec)
223 223
 {
224 224
 	struct cl_node *current;
225 225
 	struct cli_patt *pt;
226
-	int position;
226
+	int position, type = CL_CLEAN;
227 227
         unsigned int i;
228 228
 
229 229
 
... ...
@@ -246,15 +255,31 @@ int cli_scanbuff(const char *buffer, unsigned int length, const char **virname,
246 246
 		    if(pt->sigid) { /* it's a partial signature */
247 247
 			if(partcnt[pt->sigid] + 1 == pt->partno) {
248 248
 			    if(++partcnt[pt->sigid] == pt->parts) { /* the last one */
249
-				if(virname)
250
-				    *virname = pt->virname;
251
-				return CL_VIRUS;
249
+				if(pt->type) {
250
+				    if(typerec) {
251
+					cli_dbgmsg("Matched signature for file type: %s\n", pt->virname);
252
+					type = pt->type;
253
+				    }
254
+				} else {
255
+				    if(virname)
256
+					*virname = pt->virname;
257
+
258
+				    return CL_VIRUS;
259
+				}
252 260
 			    }
253 261
 			}
254 262
 		    } else { /* old type signature */
255
-			if(virname)
256
-			    *virname = pt->virname;
257
-			return CL_VIRUS;
263
+			if(pt->type) {
264
+			    if(typerec) {
265
+				cli_dbgmsg("Matched signature for file type: %s\n", pt->virname);
266
+				type = pt->type;
267
+			    }
268
+			} else {
269
+			    if(virname)
270
+				*virname = pt->virname;
271
+
272
+			    return CL_VIRUS;
273
+			}
258 274
 		    }
259 275
 		}
260 276
 
... ...
@@ -265,7 +290,7 @@ int cli_scanbuff(const char *buffer, unsigned int length, const char **virname,
265 265
 	}
266 266
     }
267 267
 
268
-    return CL_CLEAN;
268
+    return typerec ? type : CL_CLEAN;
269 269
 }
270 270
 
271 271
 int cl_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root)
... ...
@@ -273,12 +298,13 @@ int cl_scanbuff(const char *buffer, unsigned int length, const char **virname, c
273 273
 {
274 274
 	int ret, *partcnt;
275 275
 
276
+
276 277
     if((partcnt = (int *) cli_calloc(root->partsigs + 1, sizeof(int))) == NULL) {
277 278
 	cli_dbgmsg("cli_scanbuff(): unable to cli_calloc(%d, %d)\n", root->partsigs + 1, sizeof(int));
278 279
 	return CL_EMEM;
279 280
     }
280 281
 
281
-    ret = cli_scanbuff(buffer, length, virname, root, partcnt);
282
+    ret = cli_scanbuff(buffer, length, virname, root, partcnt, 0);
282 283
 
283 284
     free(partcnt);
284 285
     return ret;
... ...
@@ -31,6 +31,6 @@ struct nodelist *cli_bfsadd(struct nodelist *bfs, struct cl_node *n);
31 31
 void cli_failtrans(struct cl_node *root);
32 32
 void cli_fasttrie(struct cl_node *n, struct cl_node *root);
33 33
 int cli_findpos(const char *buffer, int offset, int length, const struct cli_patt *pattern);
34
-int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root, int *partcnt);
34
+int cli_scanbuff(const char *buffer, unsigned int length, const char **virname, const struct cl_node *root, int *partcnt, int typerec);
35 35
 
36 36
 #endif
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: message.c,v $
20
+ * Revision 1.64  2004/07/02 23:00:57  kojm
21
+ * new method of file type detection; HTML normalisation
22
+ *
20 23
  * Revision 1.63  2004/06/26 13:16:25  nigelhorne
21 24
  * Added newline to end of warning message
22 25
  *
... ...
@@ -186,7 +189,7 @@
186 186
  * uuencodebegin() no longer static
187 187
  *
188 188
  */
189
-static	char	const	rcsid[] = "$Id: message.c,v 1.63 2004/06/26 13:16:25 nigelhorne Exp $";
189
+static	char	const	rcsid[] = "$Id: message.c,v 1.64 2004/07/02 23:00:57 kojm Exp $";
190 190
 
191 191
 #if HAVE_CONFIG_H
192 192
 #include "clamav-config.h"
... ...
@@ -223,7 +226,7 @@ static	char	const	rcsid[] = "$Id: message.c,v 1.63 2004/06/26 13:16:25 nigelhorn
223 223
 #include "strrcpy.h"
224 224
 #include "others.h"
225 225
 #include "str.h"
226
-#include "scanners.h"
226
+#include "filetypes.h"
227 227
 
228 228
 /* required for AIX and Tru64 */
229 229
 #ifdef TRUE
... ...
@@ -38,27 +38,28 @@
38 38
 #include "str.h"
39 39
 #include "defaults.h"
40 40
 
41
-static int cli_parse_add(struct cl_node *root, const char *virname, const char *hexstr, int sigid, int parts, int partno)
41
+
42
+static int cli_addsig(struct cl_node *root, const char *virname, const char *hexsig, int sigid, int parts, int partno, int type)
42 43
 {
43 44
 	struct cli_patt *new;
44
-	const char *pt;
45
-	int ret, virlen;
45
+	char *pt;
46
+	int virlen, ret;
46 47
 
47
-    /* decode a hexstring and prepare a new entry */
48 48
 
49 49
     if((new = (struct cli_patt *) cli_calloc(1, sizeof(struct cli_patt))) == NULL)
50 50
 	return CL_EMEM;
51 51
 
52
+    new->type = type;
52 53
     new->sigid = sigid;
53 54
     new->parts = parts;
54 55
     new->partno = partno;
55 56
 
56
-    new->length = strlen(hexstr) / 2;
57
+    new->length = strlen(hexsig) / 2;
57 58
 
58 59
     if(new->length > root->maxpatlen)
59 60
 	root->maxpatlen = new->length;
60 61
 
61
-    if((new->pattern = cl_hex2str(hexstr)) == NULL) {
62
+    if((new->pattern = cl_hex2str(hexsig)) == NULL) {
62 63
 	free(new);
63 64
 	return CL_EMALFDB;
64 65
     }
... ...
@@ -89,13 +90,54 @@ static int cli_parse_add(struct cl_node *root, const char *virname, const char *
89 89
     return 0;
90 90
 }
91 91
 
92
-/* this functions returns a pointer to the root of trie */
92
+int cli_parse_add(struct cl_node *root, char *virname, const char *hexsig, int type)
93
+{
94
+	struct cli_patt *new;
95
+	char *pt;
96
+	int ret, virlen, parts = 0, i, len;
97
+
98
+
99
+    if(strchr(hexsig, '*')) {
100
+	root->partsigs++;
101
+
102
+	len = strlen(hexsig);
103
+	for(i = 0; i < len; i++)
104
+	    if(hexsig[i] == '*')
105
+		parts++;
106
+
107
+	if(parts) /* there's always one part more */
108
+	    parts++;
109
+
110
+	for(i = 1; i <= parts; i++) {
111
+	    if((pt = cli_strtok(hexsig, i - 1, "*")) == NULL) {
112
+		cli_errmsg("Can't extract part %d of partial signature.\n", i + 1);
113
+		return CL_EMALFDB;
114
+	    }
115
+
116
+	    if((ret = cli_addsig(root, virname, pt, root->partsigs, parts, i, type))) {
117
+		cli_errmsg("cli_parse_add(): Problem adding signature.\n");
118
+		free(pt);
119
+		return ret;
120
+	    }
121
+
122
+	    free(pt);
123
+	}
124
+
125
+    } else { /* static */
126
+	if((ret = cli_addsig(root, virname, hexsig, 0, 0, 0, type))) {
127
+	    cli_errmsg("cli_parse_add(): Problem adding signature.\n");
128
+	    return ret;
129
+	}
130
+    }
131
+
132
+    return 0;
133
+}
93 134
 
94 135
 int cl_loaddb(const char *filename, struct cl_node **root, int *virnum)
95 136
 {
96 137
 	FILE *fd;
97
-	char *buffer, *pt, *start, *pt2;
98
-	int line = 0, ret, parts, i, sigid = 0;
138
+	char *buffer, *pt, *start;
139
+	int line = 0, ret;
99 140
 
100 141
 
101 142
     if((fd = fopen(filename, "rb")) == NULL) {
... ...
@@ -166,46 +208,11 @@ int cl_loaddb(const char *filename, struct cl_node **root, int *virnum)
166 166
 	    (*root)->maxpatlen = 0;
167 167
 	}
168 168
 
169
-	if(strchr(pt, '*')) { /* new type signature */
170
-	    (*root)->partsigs++;
171
-	    sigid++;
172
-	    parts = 0;
173
-	    for(i = 0; i < (int) strlen(pt); i++)
174
-		if(pt[i] == '*')
175
-		    parts++;
176
-
177
-	    if(parts) /* there's always one part more */
178
-		parts++;
179
-	    for(i = 1; i <= parts; i++) {
180
-		if((pt2 = cli_strtok(pt, i - 1, "*")) == NULL) {
181
-		    cli_errmsg("Can't extract part %d of partial signature in line %d\n", i + 1, line);
182
-		    free(buffer);
183
-		    fclose(fd);
184
-		    return CL_EMALFDB;
185
-		}
186
-
187
-		if((ret = cli_parse_add(*root, start, pt2, sigid, parts, i))) {
188
-		    cli_dbgmsg("parse_add() return code: %d\n", ret);
189
-		    cli_errmsg("readdb(): Malformed pattern line %d (file %s).\n", line, filename);
190
-		    free(pt2);
191
-		    free(buffer);
192
-		    fclose(fd);
193
-		    return ret;
194
-		}
195
-/*
196
-		cli_dbgmsg("Added part %d of partial signature (id %d)\n", i, sigid);
197
-*/
198
-		free(pt2);
199
-	    }
200
-
201
-	} else { /* old type */
202
-	    if((ret = cli_parse_add(*root, start, pt, 0, 0, 0))) {
203
-		cli_dbgmsg("parse_add() return code: %d\n", ret);
204
-		cli_errmsg("readdb(): Malformed pattern line %d (file %s).\n", line, filename);
205
-		free(buffer);
206
-		fclose(fd);
207
-		return ret;
208
-	    }
169
+	if((ret = cli_parse_add(*root, start, pt, 0))) {
170
+	    cli_errmsg("readdb(): Problem parsing signature at line %d (file %s).\n", line, filename);
171
+	    free(buffer);
172
+	    fclose(fd);
173
+	    return ret;
209 174
 	}
210 175
     }
211 176
 
212 177
new file mode 100644
... ...
@@ -0,0 +1,24 @@
0
+/*
1
+ *  Copyright (C) 2004 Tomasz Kojm <tkojm@clamav.net>
2
+ *
3
+ *  This program is free software; you can redistribute it and/or modify
4
+ *  it under the terms of the GNU General Public License as published by
5
+ *  the Free Software Foundation; either version 2 of the License, or
6
+ *  (at your option) any later version.
7
+ *
8
+ *  This program is distributed in the hope that it will be useful,
9
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+ *  GNU General Public License for more details.
12
+ *
13
+ *  You should have received a copy of the GNU General Public License
14
+ *  along with this program; if not, write to the Free Software
15
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16
+ */
17
+
18
+#ifndef __READDB_H
19
+#define __READDB_H
20
+
21
+int cli_parse_add(struct cl_node *root, char *virname, const char *hexsig, int type);
22
+
23
+#endif
... ...
@@ -1,6 +1,5 @@
1 1
 /*
2 2
  *  Copyright (C) 2002 - 2004 Tomasz Kojm <tkojm@clamav.net>
3
- *  With enhancements from Thomas Lamy <Thomas.Lamy@in-online.net>
4 3
  *
5 4
  *  This program is free software; you can redistribute it and/or modify
6 5
  *  it under the terms of the GNU General Public License as published by
... ...
@@ -30,6 +29,14 @@
30 30
 #include <fcntl.h>
31 31
 #include <dirent.h>
32 32
 
33
+#if HAVE_MMAP
34
+#if HAVE_SYS_MMAN_H
35
+#include <sys/mman.h>
36
+#else /* HAVE_SYS_MMAN_H */
37
+#undef HAVE_MMAP
38
+#endif
39
+#endif
40
+
33 41
 #include <mspack.h>
34 42
 
35 43
 #ifdef CL_THREAD_SAFE
... ...
@@ -47,7 +54,8 @@ extern short cli_leavetemps_flag;
47 47
 #include "ole2_extract.h"
48 48
 #include "vba_extract.h"
49 49
 #include "msexpand.h"
50
-#include "scanners.h"
50
+#include "filetypes.h"
51
+#include "htmlnorm.h"
51 52
 
52 53
 #ifdef HAVE_ZLIB_H
53 54
 #include <zlib.h>
... ...
@@ -61,109 +69,19 @@ extern short cli_leavetemps_flag;
61 61
 #define SCAN_ARCHIVE	    (options & CL_ARCHIVE)
62 62
 #define SCAN_MAIL	    (options & CL_MAIL)
63 63
 #define SCAN_OLE2	    (options & CL_OLE2)
64
+#define SCAN_HTML	    (options & CL_HTML)
64 65
 #define DISABLE_RAR	    (options & CL_DISABLERAR)
65 66
 #define DETECT_ENCRYPTED    (options & CL_ENCRYPTED)
66 67
 
67
-struct cli_magic_s {
68
-    int offset;
69
-    const char *magic;
70
-    size_t length;
71
-    const char *descr;
72
-    cli_file_t type;
73
-};
74
-
75
-#define MAGIC_BUFFER_SIZE 26
76
-static const struct cli_magic_s cli_magic[] = {
77
-
78
-    /* Executables */
79
-
80
-/*  {0,  "MZ",				2,  "DOS/W32 executable", CL_DOSEXE},*/
81
-
82
-    /* Archives */
83
-
84
-    {0,  "Rar!",			4,  "RAR",		  CL_RARFILE},
85
-    {0,  "PK\003\004",			4,  "ZIP",		  CL_ZIPFILE},
86
-    {0,  "\037\213",			2,  "GZip",		  CL_GZFILE},
87
-    {0,  "BZh",				3,  "BZip",		  CL_BZFILE},
88
-    {0,  "SZDD",			4,  "compress.exe'd",	  CL_MSCFILE},
89
-    {0,  "MSCF",			4,  "MS CAB",		  CL_MSCABFILE},
90
-
91
-    /* Mail */
92
-
93
-    {0,  "From ",			 5, "MBox",		  CL_MAILFILE},
94
-    {0,  "Received",			 8, "Raw mail",		  CL_MAILFILE},
95
-    {0,  "Return-Path: ",		13, "Maildir",		  CL_MAILFILE},
96
-    {0,  "Return-path: ",		13, "Maildir",		  CL_MAILFILE},
97
-    {0,  "Delivered-To: ",		14, "Mail",		  CL_MAILFILE},
98
-    {0,  "X-UIDL: ",			 8, "Mail",		  CL_MAILFILE},
99
-    {0,  "X-Apparently-To: ",		17, "Mail",		  CL_MAILFILE},
100
-    {0,  "X-Envelope-From: ",		17, "Mail",		  CL_MAILFILE},
101
-    {0,  "X-Original-To: ",		15, "Mail",		  CL_MAILFILE},
102
-    {0,  "X-Symantec-",			11, "Symantec",		  CL_MAILFILE},
103
-    {0,  "X-EVS",			 5, "EVS mail",		  CL_MAILFILE},
104
-    {0,  "X-Real-To: ",                 11, "Mail",               CL_MAILFILE},
105
-    {0,  ">From ",			 6, "Mail",		  CL_MAILFILE},
106
-    {0,  "Date: ",			 6, "Mail",		  CL_MAILFILE},
107
-    {0,  "Message-Id: ",		12, "Mail",		  CL_MAILFILE},
108
-    {0,  "Message-ID: ",		12, "Mail",		  CL_MAILFILE},
109
-    {0,  "Envelope-to: ",		13, "Mail",		  CL_MAILFILE},
110
-    {0,  "Delivery-date: ",		15, "Mail",		  CL_MAILFILE},
111
-    {0,  "To: ",			 4, "Mail",		  CL_MAILFILE},
112
-    {0,  "Subject: ",			 9, "Mail",		  CL_MAILFILE},
113
-    {0,  "For: ",			 5, "Eserv mail",	  CL_MAILFILE},
114
-    {0,  "From: ",			 6, "Exim mail",	  CL_MAILFILE},
115
-    {0,  "v:\015\012Received: ",	14, "VPOP3 Mail (DOS)",	  CL_MAILFILE},
116
-    {0,  "v:\012Received: ",		13, "VPOP3 Mail (UNIX)",  CL_MAILFILE},
117
-    {0,  "Hi. This is the qmail-send",  26, "Qmail bounce",	  CL_MAILFILE},
118
-
119
-    /* Others */
120
-
121
-    {0,  "\320\317\021\340\241\261\032\341",
122
-	                    8, "OLE2 container",  CL_OLE2FILE},
123
-
124
-    /* Ignored types */
125
-
126
-    {0,  "\000\000\001\263",             4, "MPEG video stream",  CL_DATAFILE},
127
-    {0,  "\000\000\001\272",             4, "MPEG sys stream",    CL_DATAFILE},
128
-    {0,  "RIFF",                         4, "RIFF",		  CL_DATAFILE},
129
-    {0,  "GIF",				 3, "GIF",		  CL_DATAFILE},
130
-    {0,  "\x89PNG",			 4, "PNG",                CL_DATAFILE},
131
-    {0,  "\377\330\377",		 4, "JPEG",               CL_DATAFILE},
132
-    {0,  "BM",				 2, "BMP",                CL_DATAFILE},
133
-    {0,  "OggS",                         4, "Ogg Stream",         CL_DATAFILE},
134
-    {0,  "ID3",				 3, "MP3",		  CL_DATAFILE},
135
-    {0,  "\377\373\220",		 3, "MP3",		  CL_DATAFILE},
136
-    {0,  "\%PDF-",			 5, "PDF document",	  CL_DATAFILE},
137
-    {0,  "\%!PS-Adobe-",		11, "PostScript",	  CL_DATAFILE},
138
-    {0,  "\060\046\262\165\216\146\317", 7, "WMA/WMV/ASF",	  CL_DATAFILE},
139
-    {0,  ".RMF" ,			 4, "Real Media File",	  CL_DATAFILE},
140
-
141
-    {-1, NULL,				 0, NULL,              CL_UNKNOWN_TYPE}
142
-};
143
-
144
-cli_file_t cli_filetype(const char *buf, size_t buflen)
145
-{
146
-	int i;
147
-
148
-    for (i = 0; cli_magic[i].magic; i++) {
149
-	if (buflen >= cli_magic[i].offset+cli_magic[i].length) {
150
-	    if (memcmp(buf+cli_magic[i].offset, cli_magic[i].magic, cli_magic[i].length) == 0) {
151
-		cli_dbgmsg("Recognized %s file\n", cli_magic[i].descr);
152
-		return cli_magic[i].type;
153
-	    }
154
-	}
155
-    }
156
-
157
-    return CL_UNKNOWN_TYPE;
158
-}
159 68
 
160 69
 static int cli_magic_scandesc(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *reclev);
161 70
 static int cli_scanfile(const char *filename, const char **virname, unsigned long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *reclev);
162 71
 
163
-static int cli_scandesc(int desc, const char **virname, long int *scanned, const struct cl_node *root)
72
+static int cli_scandesc(int desc, const char **virname, long int *scanned, const struct cl_node *root, int typerec)
164 73
 {
165 74
  	char *buffer, *buff, *endbl, *pt;
166
-	int bytes, buffsize, length, ret, *partcnt;
75
+	int bytes, buffsize, length, ret, *partcnt, type = CL_CLEAN;
76
+
167 77
 
168 78
     /* prepare the buffer */
169 79
     buffsize = root->maxpatlen + SCANBUFF;
... ...
@@ -194,10 +112,14 @@ static int cli_scandesc(int desc, const char **virname, long int *scanned, const
194 194
 	if(bytes < SCANBUFF)
195 195
 	    length -= SCANBUFF - bytes;
196 196
 
197
-	if((ret = cli_scanbuff(pt, length, virname, root, partcnt)) != CL_CLEAN) {
197
+	if((ret = cli_scanbuff(pt, length, virname, root, partcnt, typerec)) == CL_VIRUS) {
198 198
 	    free(buffer);
199 199
 	    free(partcnt);
200 200
 	    return ret;
201
+
202
+	} else if(typerec && ret >= CL_TYPENO) {
203
+	    if(ret >= type)
204
+		type = ret;
201 205
 	}
202 206
 
203 207
 	if(bytes == SCANBUFF)
... ...
@@ -210,7 +132,8 @@ static int cli_scandesc(int desc, const char **virname, long int *scanned, const
210 210
 
211 211
     free(buffer);
212 212
     free(partcnt);
213
-    return CL_CLEAN;
213
+
214
+    return typerec ? type : CL_CLEAN;
214 215
 }
215 216
 
216 217
 #ifdef CL_THREAD_SAFE
... ...
@@ -256,7 +179,7 @@ static int cli_scanrar(int desc, const char **virname, long int *scanned, const
256 256
 	    files++;
257 257
 	    cli_dbgmsg("Rar -> Encrypted files found in archive.\n");
258 258
 	    lseek(desc, 0, SEEK_SET);
259
-	    if(cli_scandesc(desc, virname, scanned, root) != CL_VIRUS)
259
+	    if(cli_scandesc(desc, virname, scanned, root, 0) != CL_VIRUS)
260 260
 		*virname = "Encrypted.RAR";
261 261
 	    ret = CL_VIRUS;
262 262
 	    break;
... ...
@@ -278,7 +201,7 @@ static int cli_scanrar(int desc, const char **virname, long int *scanned, const
278 278
 	    }
279 279
 	}
280 280
 
281
-        if(!!( rarlist->item.FileAttr & RAR_FENTRY_ATTR_DIRECTORY)) {
281
+        if(!(rarlist->item.FileAttr & RAR_FENTRY_ATTR_DIRECTORY)) {
282 282
             rarlist = rarlist->next;
283 283
             files++;
284 284
             continue;
... ...
@@ -428,7 +351,7 @@ static int cli_scanzip(int desc, const char **virname, long int *scanned, const
428 428
 	    files++;
429 429
 	    cli_dbgmsg("Zip -> Encrypted files found in archive.\n");
430 430
 	    lseek(desc, 0, SEEK_SET);
431
-	    if(cli_scandesc(desc, virname, scanned, root) != CL_VIRUS)
431
+	    if(cli_scandesc(desc, virname, scanned, root, 0) != CL_VIRUS)
432 432
 		*virname = "Encrypted.Zip";
433 433
 	    ret = CL_VIRUS;
434 434
 	    break;
... ...
@@ -771,6 +694,53 @@ static int cli_scanmscab(int desc, const char **virname, long int *scanned, cons
771 771
     return ret;
772 772
 }
773 773
 
774
+static int cli_scanhtml(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *reclev)
775
+{
776
+	unsigned char *membuff, *newbuff, *newbuff2;
777
+	struct stat statbuf;
778
+	int ret;
779
+
780
+    cli_dbgmsg("in cli_scanhtml()\n");
781
+
782
+    if(fstat(desc, &statbuf) != 0) {
783
+	cli_dbgmsg("fstat failed\n");
784
+        return CL_EIO;
785
+    }
786
+
787
+#ifdef HAVE_MMAP
788
+    membuff = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, desc, 0);
789
+#else /* FIXME */
790
+    return CL_CLEAN;
791
+#endif
792
+
793
+    /* TODO: do file operations if mmap fails */
794
+    if(membuff == MAP_FAILED) {
795
+	cli_dbgmsg("mmap failed\n");
796
+        return CL_EMEM;
797
+    }
798
+
799
+    newbuff2 = quoted_decode(membuff, statbuf.st_size);
800
+    newbuff = html_normalize(newbuff2, strlen(newbuff2));
801
+    free(newbuff2);
802
+
803
+    if(newbuff) {
804
+	newbuff2 = remove_html_comments(newbuff);
805
+	free(newbuff);
806
+	newbuff = remove_html_char_ref(newbuff2);
807
+	free(newbuff2);
808
+	/* Normalise a second time as the above can leave inconsistent white
809
+	 * space
810
+	 */
811
+	newbuff2 = html_normalize(newbuff, strlen(newbuff));
812
+	free(newbuff);
813
+    }
814
+
815
+    ret = cl_scanbuff(newbuff2, strlen(newbuff2), virname, root);
816
+
817
+    free(newbuff2);
818
+    return ret;
819
+}
820
+
774 821
 static int cli_scandir(const char *dirname, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *reclev)
775 822
 {
776 823
 	DIR *dd;
... ...
@@ -1029,7 +999,7 @@ static int cli_scanmail(int desc, const char **virname, long int *scanned, const
1029 1029
 static int cli_magic_scandesc(int desc, const char **virname, long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, int *reclev)
1030 1030
 {
1031 1031
 	char magic[MAGIC_BUFFER_SIZE+1];
1032
-	int ret = CL_CLEAN;
1032
+	int ret = CL_CLEAN, nret;
1033 1033
 	int bread = 0;
1034 1034
 	cli_file_t type;
1035 1035
 
... ...
@@ -1040,7 +1010,7 @@ static int cli_magic_scandesc(int desc, const char **virname, long int *scanned,
1040 1040
     }
1041 1041
 
1042 1042
     if(!options) { /* raw mode (stdin, etc.) */
1043
-	if((ret = cli_scandesc(desc, virname, scanned, root) == CL_VIRUS))
1043
+	if((ret = cli_scandesc(desc, virname, scanned, root, 0) == CL_VIRUS))
1044 1044
 	    cli_dbgmsg("%s virus found in descriptor %d.\n", *virname, desc);
1045 1045
 	return ret;
1046 1046
     }
... ...
@@ -1051,20 +1021,22 @@ static int cli_magic_scandesc(int desc, const char **virname, long int *scanned,
1051 1051
 	    return CL_CLEAN;
1052 1052
 
1053 1053
 
1054
-    (*reclev)++;
1055 1054
     lseek(desc, 0, SEEK_SET);
1056 1055
     bread = read(desc, magic, MAGIC_BUFFER_SIZE);
1057 1056
     magic[MAGIC_BUFFER_SIZE] = '\0';
1058 1057
     lseek(desc, 0, SEEK_SET);
1059 1058
 
1060
-    if (bread != MAGIC_BUFFER_SIZE) {
1059
+    if(bread != MAGIC_BUFFER_SIZE) {
1061 1060
 	/* short read: No need to do magic */
1062
-	(*reclev)--;
1061
+	if((ret = cli_scandesc(desc, virname, scanned, root, 0) == CL_VIRUS))
1062
+	    cli_dbgmsg("%s virus found in descriptor %d.\n", *virname, desc);
1063 1063
 	return ret;
1064 1064
     }
1065 1065
 
1066 1066
     type = cli_filetype(magic, bread);
1067 1067
 
1068
+    (*reclev)++;
1069
+
1068 1070
     switch(type) {
1069 1071
 	case CL_DOSEXE:
1070 1072
 	    /* temporarily the return code is ignored */
... ...
@@ -1128,10 +1100,22 @@ static int cli_magic_scandesc(int desc, const char **virname, long int *scanned,
1128 1128
     (*reclev)--;
1129 1129
 
1130 1130
     if(type != CL_DATAFILE && ret != CL_VIRUS) { /* scan the raw file */
1131
-	lseek(desc, 0, SEEK_SET); /* If archive scan didn't rewind desc */
1132
-	if(cli_scandesc(desc, virname, scanned, root) == CL_VIRUS) {
1131
+	lseek(desc, 0, SEEK_SET);
1132
+
1133
+	if((nret = cli_scandesc(desc, virname, scanned, root, 1)) == CL_VIRUS) {
1133 1134
 	    cli_dbgmsg("%s virus found in descriptor %d.\n", *virname, desc);
1134 1135
 	    return CL_VIRUS;
1136
+
1137
+	} else if(nret >= CL_TYPENO) {
1138
+	    lseek(desc, 0, SEEK_SET);
1139
+
1140
+	    switch(nret) {
1141
+		case CL_HTMLFILE:
1142
+		    if(SCAN_HTML)
1143
+			if(cli_scanhtml(desc, virname, scanned, root, limits, options, reclev) == CL_VIRUS)
1144
+			    return CL_VIRUS;
1145
+		    break;
1146
+	    }
1135 1147
 	}
1136 1148
     }
1137 1149
 
1138 1150
deleted file mode 100644
... ...
@@ -1,39 +0,0 @@
1
-/*
2
- *  Copyright (C) 2002 - 2004 Tomasz Kojm <tkojm@clamav.net>
3
- *  With enhancements from Thomas Lamy <Thomas.Lamy@in-online.net>
4
- *
5
- *  This program is free software; you can redistribute it and/or modify
6
- *  it under the terms of the GNU General Public License as published by
7
- *  the Free Software Foundation; either version 2 of the License, or
8
- *  (at your option) any later version.
9
- *
10
- *  This program is distributed in the hope that it will be useful,
11
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
- *  GNU General Public License for more details.
14
- *
15
- *  You should have received a copy of the GNU General Public License
16
- *  along with this program; if not, write to the Free Software
17
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
- */
19
-
20
-#ifndef __SCANNERS_H
21
-#define __SCANNERS_H
22
-
23
-typedef enum {
24
-    CL_UNKNOWN_TYPE = 0,
25
-    CL_DOSEXE,
26
-    CL_DATAFILE,
27
-    CL_MAILFILE,
28
-    CL_GZFILE,
29
-    CL_ZIPFILE,
30
-    CL_BZFILE,
31
-    CL_RARFILE,
32
-    CL_MSCFILE,
33
-    CL_OLE2FILE,
34
-    CL_MSCABFILE
35
-} cli_file_t;
36
-
37
-cli_file_t cli_filetype(const char *buf, size_t buflen);
38
-
39
-#endif
... ...
@@ -63,6 +63,7 @@ struct cfgstruct *parsecfg(const char *cfgfile, int messages)
63 63
 	    {"TemporaryDirectory", OPT_STR},
64 64
 	    {"MaxFileSize", OPT_COMPSIZE},
65 65
 	    {"ScanMail", OPT_NOARG},
66
+	    {"ScanHTML", OPT_NOARG},
66 67
 	    {"ScanOLE2", OPT_NOARG},
67 68
 	    {"ScanArchive", OPT_NOARG},
68 69
 	    {"ScanRAR", OPT_NOARG},
... ...
@@ -1,4 +1,4 @@
1
-# Makefile.in generated by automake 1.8.3 from Makefile.am.
1
+# Makefile.in generated by automake 1.8.5 from Makefile.am.
2 2
 # @configure_input@
3 3
 
4 4
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
... ...
@@ -444,9 +444,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
444 444
 	  done | \
445 445
 	  $(AWK) '    { files[$$0] = 1; } \
446 446
 	       END { for (i in files) print i; }'`; \
447
-	test -z "$(ETAGS_ARGS)$$tags$$unique" \
448
-	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
449
-	     $$tags $$unique
447
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
448
+	  test -n "$$unique" || unique=$$empty_fix; \
449
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
450
+	    $$tags $$unique; \
451
+	fi
450 452
 ctags: CTAGS
451 453
 CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
452 454
 		$(TAGS_FILES) $(LISP)