Browse code

Normalise HTML before scanning for URLs to download

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@834 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/09/06 20:05:44
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Mon Sep  6 12:04:08 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	Normalise the HTML before looking for URLs to scan
4
+
1 5
 Sun Sep  5 18:16:13 CEST 2004 (tk)
2 6
 ----------------------------------
3 7
   * libclamav/pe.c: fix pointer arithmetic in FSG block (bug reported by Nigel)
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.115  2004/09/06 11:02:08  nigelhorne
21
+ * Normalise HTML before scanning for URLs to download
22
+ *
20 23
  * Revision 1.114  2004/09/03 15:59:00  nigelhorne
21 24
  * Handle boundary= "foo"
22 25
  *
... ...
@@ -330,7 +333,7 @@
330 330
  * Compilable under SCO; removed duplicate code with message.c
331 331
  *
332 332
  */
333
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.114 2004/09/03 15:59:00 nigelhorne Exp $";
333
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.115 2004/09/06 11:02:08 nigelhorne Exp $";
334 334
 
335 335
 #if HAVE_CONFIG_H
336 336
 #include "clamav-config.h"
... ...
@@ -542,16 +545,14 @@ static	pthread_mutex_t	tables_mutex = PTHREAD_MUTEX_INITIALIZER;
542 542
 
543 543
 /*
544 544
  * TODO: when signal handling is added, need to remove temp files when a
545
- * signal is received
545
+ *	signal is received
546 546
  * TODO: add option to scan in memory not via temp files, perhaps with a
547 547
  * named pipe or memory mapped file, though this won't work on big e-mails
548 548
  * containing many levels of encapsulated messages - it'd just take too much
549 549
  * RAM
550
- * TODO: if debug is enabled, catch a segfault and dump the current e-mail
551
- * in its entirety, then call abort()
552 550
  * TODO: parse .msg format files
553 551
  * TODO: fully handle AppleDouble format, see
554
- * http://www.lazerware.com/formats/Specs/AppleSingle_AppleDouble.pdf
552
+ *	http://www.lazerware.com/formats/Specs/AppleSingle_AppleDouble.pdf
555 553
  * TODO: ensure parseEmailHeaders is always called before parseEmailBody
556 554
  * TODO: create parseEmail which calls parseEmailHeaders then parseEmailBody
557 555
  * TODO: Look into TNEF. Is there anything that needs to be done here?
... ...
@@ -574,17 +575,17 @@ cli_mbox(const char *dir, int desc, unsigned int options)
574 574
 	if((fd = fdopen(i, "rb")) == NULL) {
575 575
 		cli_errmsg("Can't open descriptor %d\n", desc);
576 576
 		close(i);
577
-		return -1;
577
+		return CL_EOPEN;
578 578
 	}
579 579
 	if(fgets(buffer, sizeof(buffer), fd) == NULL) {
580 580
 		/* empty message */
581 581
 		fclose(fd);
582
-		return 0;
582
+		return CL_CLEAN;
583 583
 	}
584 584
 	m = messageCreate();
585 585
 	if(m == NULL) {
586 586
 		fclose(fd);
587
-		return -1;
587
+		return CL_EMEM;
588 588
 	}
589 589
 
590 590
 #ifdef	CL_THREAD_SAFE
... ...
@@ -601,7 +602,7 @@ cli_mbox(const char *dir, int desc, unsigned int options)
601 601
 #endif
602 602
 			messageDestroy(m);
603 603
 			fclose(fd);
604
-			return -1;
604
+			return CL_EMEM;
605 605
 		}
606 606
 	}
607 607
 #ifdef	CL_THREAD_SAFE
... ...
@@ -700,7 +701,12 @@ cli_mbox(const char *dir, int desc, unsigned int options)
700 700
 
701 701
 	fclose(fd);
702 702
 
703
-	retcode = 0;
703
+	/*
704
+	 * This is not necessarily true, but since the only options are
705
+	 * CL_CLEAN and CL_VIRUS this is the better choice. It would be
706
+	 * nice to have CL_CONTINUESCANNING or something like that
707
+	 */
708
+	retcode = CL_CLEAN;
704 709
 
705 710
 	body = parseEmailHeaders(m, rfc821);
706 711
 	messageDestroy(m);
... ...
@@ -710,6 +716,9 @@ cli_mbox(const char *dir, int desc, unsigned int options)
710 710
 		 */
711 711
 		if(messageGetBody(body))
712 712
 			if(!parseEmailBody(body, NULL, dir, rfc821, subtype, options))
713
+				/*
714
+				 * There is no malformed e-mail return code
715
+				 */
713 716
 				retcode = -1;
714 717
 
715 718
 		/*
... ...
@@ -2098,7 +2107,7 @@ static void
2098 2098
 checkURLs(message *m, const char *dir)
2099 2099
 {
2100 2100
 	blob *b = messageToBlob(m);
2101
-	char *ptr;
2101
+	char *ptr, *normalised;
2102 2102
 	size_t len;
2103 2103
 	table_t *t;
2104 2104
 	int n;
... ...
@@ -2124,7 +2133,13 @@ checkURLs(message *m, const char *dir)
2124 2124
 	t = tableCreate();
2125 2125
 
2126 2126
 	n = 0;
2127
-	ptr = (char *)blobGetData(b);
2127
+	normalised = ptr = html_normalize(blobGetData(b), len);
2128
+
2129
+	if(normalised == NULL) {
2130
+		blobDestroy(b);
2131
+		tableDestroy(t);
2132
+		return;
2133
+	}
2128 2134
 
2129 2135
 	/*
2130 2136
 	 * cli_memstr(ptr, len, "<a href=", 8)
... ...
@@ -2132,7 +2147,6 @@ checkURLs(message *m, const char *dir)
2132 2132
 	 * and it returns the place that the 'needle' was found
2133 2133
 	 */
2134 2134
 	while(len >= 8) {
2135
-		/* FIXME: allow any number of white space */
2136 2135
 		if(strncasecmp(ptr, "<a href=", 8) == 0) {
2137 2136
 #ifdef	WITH_CURL
2138 2137
 #ifndef	CL_THREAD_SAFE
... ...
@@ -2227,6 +2241,7 @@ checkURLs(message *m, const char *dir)
2227 2227
 	}
2228 2228
 	blobDestroy(b);
2229 2229
 	tableDestroy(t);
2230
+	free(normalised);
2230 2231
 
2231 2232
 #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
2232 2233
 	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);
... ...
@@ -2390,6 +2405,8 @@ print_trace(int use_syslog)
2390 2390
 		else
2391 2391
 			cli_dbgmsg("%s\n", strings[i]);
2392 2392
 
2393
+	/* TODO: dump the current email */
2394
+
2393 2395
 	free(strings);
2394 2396
 }
2395 2397
 #endif