GitList

@@ -17,6 +17,9 @@
+                      *
                       * Change History:
                       * $Log: mbox.c,v $
                     + * Revision 1.119  2004/09/14 20:47:28  nigelhorne
                     + * Use new normalise code
                     + *
                       * Revision 1.118  2004/09/14 12:09:37  nigelhorne
                       * Include old normalise code
+                      *
@@ -342,7 +345,7 @@
                       * Compilable under SCO; removed duplicate code with message.c
+                      *
                       */
                     -static	char	const	rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $";
                     +static	char	const	rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $";
                      #if HAVE_CONFIG_H
                      #include "clamav-config.h"
@@ -427,6 +430,8 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
                      #ifdef	FOLLOWURLS
                     +#include "htmlnorm.h"
+                    +
                      #define	MAX_URLS	5	/*
                      				 * Maximum number of URLs scanned in a message
                      				 * part
@@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir)
+                     }
                      #ifdef	FOLLOWURLS
+                    -
                     -/*
                     - * TODO: Use the newer normalise code
                     - * This is the old normalise code which normalises in memory. The new
                     - * code uses temporary files and has a different API.
                     - *
                     -* Normalize an HTML buffer using the following rules:
                     -	o Remove multiple contiguous spaces
                     -	o Remove spaces around '<' and '>' in tags
                     -	o Remove spaces around '=' in tags
                     -	o Replace single quote with double quote in tags
                     -	o Convert to lowercase
                     -	o Convert all white space to a space character
                     -*/
                     -static unsigned char *
                     -mbox_html_normalize(unsigned char *in_buff, off_t in_size)
                     -{
                     -	unsigned char *out_buff;
                     -	off_t out_size=0, i;
                     -	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
+                    -
                     -	out_buff = (unsigned char *)cli_malloc(in_size+1);
                     -	if (!out_buff) {
                     -		cli_errmsg("malloc failed");
                     -		return NULL;
                     -	}
+                    -
                     -	for (i=0 ; i < in_size ; i++) {
                     -		if (in_buff[i] == '<') {
                     -			out_buff[out_size++] = '<';
                     -			tag_depth++;
                     -			if (tag_depth == 1) {
                     -				had_space=TRUE; /* consume spaces */
                     -			}
                     -		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
                     -			/* Remove preceeding spaces */
                     -			while ((out_size > 0) &&
                     -				(out_buff[out_size-1] == ' ')) {
                     -				out_size--;
                     -			}
                     -			out_buff[out_size++] = '=';
                     -			had_space=TRUE;
                     -		} else if (isspace(in_buff[i])) {
                     -			if (!had_space) {
                     -				out_buff[out_size++] = ' ';
                     -				had_space=TRUE;
                     -			}
                     -		} else if (in_buff[i] == '>') {
                     -			/* Remove preceeding spaces */
                     -			if (tag_depth == 1) {
                     -				while ((out_size > 0) &&
                     -					(out_buff[out_size-1] == ' ')) {
                     -					out_size--;
                     -				}
                     -			}
                     -			out_buff[out_size++] = '>';
                     -			tag_depth--;
                     -		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
                     -			/* Convert single quotes to double quotes */
                     -			if (in_quote || out_buff[out_size-1] == '=') {
                     -				out_buff[out_size++] = '\"';
                     -				in_quote = !in_quote;
                     -			} else {
                     -				out_buff[out_size++] = '\'';
                     -			}
                     -		} else {
                     -			out_buff[out_size++] = tolower(in_buff[i]);
                     -			had_space=FALSE;
                     -		}
                     -	}
                     -	out_buff[out_size] = '\0';
                     -	return out_buff;
                     -}
+                    -
                      static void
                      checkURLs(message *m, const char *dir)
+                     {
                      	blob *b = messageToBlob(m);
                     -	char *ptr, *normalised;
                      	size_t len;
                      	table_t *t;
                     -	int n;
                     +	int i, n;
                      #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
                      	pthread_t tid[MAX_URLS];
                      	struct arg args[MAX_URLS];
                      #endif
                     +	tag_arguments_t hrefs;
                      	if(b == NULL)
                      		return;
@@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir)
                      	t = tableCreate();
                     -	n = 0;
                     -	normalised = ptr = mbox_html_normalize(blobGetData(b), len);
                     +	memset(&hrefs, '\0', sizeof(hrefs));
+                    +
                     +	cli_dbgmsg("checkURLs: calling html_normalise_mem\n");
                     +	html_normalise_mem(blobGetData(b), len, NULL, &hrefs);
                     +	cli_dbgmsg("checkURLs: html_normalise_mem returned\n");
                     -	if(normalised == NULL) {
                     +	/*if(href == NULL) {
                      		blobDestroy(b);
                      		tableDestroy(t);
                      		return;
                     -	}
                     +	}*/
                      	/* TODO: Do we need to call remove_html_comments? */
                     -	/*
                     -	 * cli_memstr(ptr, len, "<a href=", 8)
                     -	 * Don't use cli_memstr() until bounds problem sorted
                     -	 * and it returns the place that the 'needle' was found
                     -	 */
                     -	while(len >= 8) {
                     -		if(strncasecmp(ptr, "<a href=", 8) == 0) {
                     +	n = 0;
+                    +
                     +	for(i = 0; i < hrefs.count; i++) {
                     +		const char *url = hrefs.value[i];
+                    +
                     +		if(strncasecmp("http://", url, 7) == 0) {
                     +			char *ptr;
                      #ifdef	WITH_CURL
                      #ifndef	CL_THREAD_SAFE
                      			struct arg arg;
@@ -2242,52 +2176,31 @@ checkURLs(message *m, const char *dir)
                      			struct stat statb;
                      			char cmd[512];
                      #endif	/*WITH_CURL*/
                     -			char *p2 = &ptr[8];
                     -			char *p3;
                     -			char name[512];
+                    -
                     -			len -= 8;
                     -			while((len > 0) && ((*p2 == '\"') || isspace(*p2))) {
                     -				len--;
                     -				p2++;
                     -			}
                     -			if(len == 0)
                     -				break;
                     -			ptr = p2;
                     -			while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) {
                     -				ptr++;
                     -				len--;
                     -			}
                     -			if(len == 0)
                     -				break;
                     -			*ptr = '\0';
                     -			if(strncasecmp(p2, "mailto:", 7) == 0)
                     -				continue;
                     -			if(*p2 == '\0')
                     -				continue;
                     -			if(tableFind(t, p2) == 1) {
                     -				cli_dbgmsg("URL %s already downloaded\n", p2);
                     +			char name[NAME_MAX];
+                    +
                     +			if(tableFind(t, url) == 1) {
                     +				cli_dbgmsg("URL %s already downloaded\n", url);
                      				continue;
+                     			}
                      			if(n == MAX_URLS) {
                      				cli_warnmsg("Not all URLs will be scanned\n");
                      				break;
+                     			}
                     -			(void)tableInsert(t, p2, 1);
                     -			cli_dbgmsg("Downloading URL %s to be scanned\n", p2);
                     -			strncpy(name, p2, sizeof(name));
                     -			for(p3 = name; *p3; p3++)
                     -				if(*p3 == '/')
                     -					*p3 = '_';
                     +			(void)tableInsert(t, url, 1);
                     +			cli_dbgmsg("Downloading URL %s to be scanned\n", url);
                     +			strncpy(name, url, sizeof(name));
                     +			for(ptr = name; *ptr; ptr++)
                     +				if(*ptr == '/')
                     +					*ptr = '_';
                      #ifdef	WITH_CURL
                      #ifdef	CL_THREAD_SAFE
                     -			args[n].url = strdup(p2);
                     +			args[n].url = strdup(url);
                      			args[n].dir = strdup(dir);
                      			args[n].filename = strdup(name);
                      			pthread_create(&tid[n], NULL, getURL, &args[n]);
                      #else
                     -			arg.url = p2;
                     +			arg.url = url;
                      			arg.dir = dir;
                      			arg.filename = name;
                      			getURL(&arg);
@@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir)
                      			/*
                      			 * TODO: maximum size and timeouts
                      			 */
                     -			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name);
                     +			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name);
                      			cli_dbgmsg("%s\n", cmd);
                      #ifdef	CL_THREAD_SAFE
                      			pthread_mutex_lock(&system_mutex);
@@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir)
                      			snprintf(cmd, sizeof(cmd), "%s/%s", dir, name);
                      			if(stat(cmd, &statb) >= 0)
                      				if(statb.st_size == 0) {
                     -					cli_warnmsg("URL %s failed to download\n", p2);
                     +					cli_warnmsg("URL %s failed to download\n", url);
                      					/*
                      					 * Don't bother scanning an empty file
                      					 */
@@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir)
                      #endif
                      			++n;
+                     		}
                     -		ptr++;
                     -		len--;
+                     	}
                     +	html_tag_arg_free(&hrefs);
                      	blobDestroy(b);
                      	tableDestroy(t);
                     -	free(normalised);
                      #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
                      	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);

@@ -1,3 +1,10 @@
                     +Tue Sep 14 21:48:36 BST 2004 (njh)
                     +----------------------------------
                     +  * libclamav/mbox.c:	FOLLOWURLS: now uses the new normalisation code to
                     +				find URLs to scan for trojans. This means
                     +				better scanning of HTML than the old FOLLOWURLS
                     +				code, and all is now done in RAM
+                    +
                      Tue Sep 14 22:32:50 CEST 2004 (tk)
                      ----------------------------------
                        * libclamav: do not print outdate warning for main.cvd