Browse code

Use new normalise code

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@865 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/09/15 05:51:01
Showing 2 changed files
... ...
@@ -1,3 +1,10 @@
1
+Tue Sep 14 21:48:36 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	FOLLOWURLS: now uses the new normalisation code to
4
+				find URLs to scan for trojans. This gives
5
+				better scanning of HTML than the old FOLLOWURLS
6
+				code, and all of it is now done in RAM
7
+
1 8
 Tue Sep 14 22:32:50 CEST 2004 (tk)
2 9
 ----------------------------------
3 10
   * libclamav: do not print outdate warning for main.cvd
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.119  2004/09/14 20:47:28  nigelhorne
21
+ * Use new normalise code
22
+ *
20 23
  * Revision 1.118  2004/09/14 12:09:37  nigelhorne
21 24
  * Include old normalise code
22 25
  *
... ...
@@ -342,7 +345,7 @@
342 342
  * Compilable under SCO; removed duplicate code with message.c
343 343
  *
344 344
  */
345
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $";
345
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $";
346 346
 
347 347
 #if HAVE_CONFIG_H
348 348
 #include "clamav-config.h"
... ...
@@ -427,6 +430,8 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
427 427
 
428 428
 #ifdef	FOLLOWURLS
429 429
 
430
+#include "htmlnorm.h"
431
+
430 432
 #define	MAX_URLS	5	/*
431 433
 				 * Maximum number of URLs scanned in a message
432 434
 				 * part
... ...
@@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir)
2110 2110
 }
2111 2111
 
2112 2112
 #ifdef	FOLLOWURLS
2113
-
2114
-/*
2115
- * TODO: Use the newer normalise code
2116
- * This is the old normalise code which normalises in memory. The new
2117
- * code uses temporary files and has a different API.
2118
- * 
2119
-* Normalize an HTML buffer using the following rules:
2120
-	o Remove multiple contiguous spaces
2121
-	o Remove spaces around '<' and '>' in tags
2122
-	o Remove spaces around '=' in tags
2123
-	o Replace single quote with double quote in tags
2124
-	o Convert to lowercase
2125
-	o Convert all white space to a space character
2126
-*/
2127
-static unsigned char *
2128
-mbox_html_normalize(unsigned char *in_buff, off_t in_size)
2129
-{
2130
-	unsigned char *out_buff;
2131
-	off_t out_size=0, i;
2132
-	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
2133
-	
2134
-	out_buff = (unsigned char *)cli_malloc(in_size+1);
2135
-	if (!out_buff) {
2136
-		cli_errmsg("malloc failed");
2137
-		return NULL;
2138
-	}
2139
-	
2140
-	for (i=0 ; i < in_size ; i++) {
2141
-		if (in_buff[i] == '<') {
2142
-			out_buff[out_size++] = '<';
2143
-			tag_depth++;
2144
-			if (tag_depth == 1) {
2145
-				had_space=TRUE; /* consume spaces */
2146
-			}
2147
-		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
2148
-			/* Remove preceeding spaces */
2149
-			while ((out_size > 0) &&
2150
-				(out_buff[out_size-1] == ' ')) {
2151
-				out_size--;
2152
-			}
2153
-			out_buff[out_size++] = '=';
2154
-			had_space=TRUE;
2155
-		} else if (isspace(in_buff[i])) {
2156
-			if (!had_space) {
2157
-				out_buff[out_size++] = ' ';
2158
-				had_space=TRUE;
2159
-			}
2160
-		} else if (in_buff[i] == '>') {
2161
-			/* Remove preceeding spaces */
2162
-			if (tag_depth == 1) {
2163
-				while ((out_size > 0) &&
2164
-					(out_buff[out_size-1] == ' ')) {
2165
-					out_size--;
2166
-				}
2167
-			}
2168
-			out_buff[out_size++] = '>';
2169
-			tag_depth--;	
2170
-		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
2171
-			/* Convert single quotes to double quotes */
2172
-			if (in_quote || out_buff[out_size-1] == '=') {
2173
-				out_buff[out_size++] = '\"';
2174
-				in_quote = !in_quote;
2175
-			} else {
2176
-				out_buff[out_size++] = '\'';
2177
-			}
2178
-		} else {
2179
-			out_buff[out_size++] = tolower(in_buff[i]);
2180
-			had_space=FALSE;
2181
-		}
2182
-	}
2183
-	out_buff[out_size] = '\0';
2184
-	return out_buff;
2185
-}
2186
-
2187 2113
 static void
2188 2114
 checkURLs(message *m, const char *dir)
2189 2115
 {
2190 2116
 	blob *b = messageToBlob(m);
2191
-	char *ptr, *normalised;
2192 2117
 	size_t len;
2193 2118
 	table_t *t;
2194
-	int n;
2119
+	int i, n;
2195 2120
 #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
2196 2121
 	pthread_t tid[MAX_URLS];
2197 2122
 	struct arg args[MAX_URLS];
2198 2123
 #endif
2124
+	tag_arguments_t hrefs;
2199 2125
 
2200 2126
 	if(b == NULL)
2201 2127
 		return;
... ...
@@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir)
2213 2213
 
2214 2214
 	t = tableCreate();
2215 2215
 
2216
-	n = 0;
2217
-	normalised = ptr = mbox_html_normalize(blobGetData(b), len);
2216
+	memset(&hrefs, '\0', sizeof(hrefs));
2217
+
2218
+	cli_dbgmsg("checkURLs: calling html_normalise_mem\n");
2219
+	html_normalise_mem(blobGetData(b), len, NULL, &hrefs);
2220
+	cli_dbgmsg("checkURLs: html_normalise_mem returned\n");
2218 2221
 
2219
-	if(normalised == NULL) {
2222
+	/*if(href == NULL) {
2220 2223
 		blobDestroy(b);
2221 2224
 		tableDestroy(t);
2222 2225
 		return;
2223
-	}
2226
+	}*/
2224 2227
 	/* TODO: Do we need to call remove_html_comments? */
2225 2228
 
2226
-	/*
2227
-	 * cli_memstr(ptr, len, "<a href=", 8)
2228
-	 * Don't use cli_memstr() until bounds problem sorted
2229
-	 * and it returns the place that the 'needle' was found
2230
-	 */
2231
-	while(len >= 8) {
2232
-		if(strncasecmp(ptr, "<a href=", 8) == 0) {
2229
+	n = 0;
2230
+
2231
+	for(i = 0; i < hrefs.count; i++) {
2232
+		const char *url = hrefs.value[i];
2233
+
2234
+		if(strncasecmp("http://", url, 7) == 0) {
2235
+			char *ptr;
2233 2236
 #ifdef	WITH_CURL
2234 2237
 #ifndef	CL_THREAD_SAFE
2235 2238
 			struct arg arg;
... ...
@@ -2242,52 +2176,31 @@ checkURLs(message *m, const char *dir)
2242 2242
 			struct stat statb;
2243 2243
 			char cmd[512];
2244 2244
 #endif	/*WITH_CURL*/
2245
-			char *p2 = &ptr[8];
2246
-			char *p3;
2247
-			char name[512];
2248
-
2249
-			len -= 8;
2250
-			while((len > 0) && ((*p2 == '\"') || isspace(*p2))) {
2251
-				len--;
2252
-				p2++;
2253
-			}
2254
-			if(len == 0)
2255
-				break;
2256
-			ptr = p2;
2257
-			while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) {
2258
-				ptr++;
2259
-				len--;
2260
-			}
2261
-			if(len == 0)
2262
-				break;
2263
-			*ptr = '\0';
2264
-			if(strncasecmp(p2, "mailto:", 7) == 0)
2265
-				continue;
2266
-			if(*p2 == '\0')
2267
-				continue;
2268
-			if(tableFind(t, p2) == 1) {
2269
-				cli_dbgmsg("URL %s already downloaded\n", p2);
2245
+			char name[NAME_MAX];
2246
+
2247
+			if(tableFind(t, url) == 1) {
2248
+				cli_dbgmsg("URL %s already downloaded\n", url);
2270 2249
 				continue;
2271 2250
 			}
2272 2251
 			if(n == MAX_URLS) {
2273 2252
 				cli_warnmsg("Not all URLs will be scanned\n");
2274 2253
 				break;
2275 2254
 			}
2276
-			(void)tableInsert(t, p2, 1);
2277
-			cli_dbgmsg("Downloading URL %s to be scanned\n", p2);
2278
-			strncpy(name, p2, sizeof(name));
2279
-			for(p3 = name; *p3; p3++)
2280
-				if(*p3 == '/')
2281
-					*p3 = '_';
2255
+			(void)tableInsert(t, url, 1);
2256
+			cli_dbgmsg("Downloading URL %s to be scanned\n", url);
2257
+			strncpy(name, url, sizeof(name));
2258
+			for(ptr = name; *ptr; ptr++)
2259
+				if(*ptr == '/')
2260
+					*ptr = '_';
2282 2261
 
2283 2262
 #ifdef	WITH_CURL
2284 2263
 #ifdef	CL_THREAD_SAFE
2285
-			args[n].url = strdup(p2);
2264
+			args[n].url = strdup(url);
2286 2265
 			args[n].dir = strdup(dir);
2287 2266
 			args[n].filename = strdup(name);
2288 2267
 			pthread_create(&tid[n], NULL, getURL, &args[n]);
2289 2268
 #else
2290
-			arg.url = p2;
2269
+			arg.url = url;
2291 2270
 			arg.dir = dir;
2292 2271
 			arg.filename = name;
2293 2272
 			getURL(&arg);
... ...
@@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir)
2297 2297
 			/*
2298 2298
 			 * TODO: maximum size and timeouts
2299 2299
 			 */
2300
-			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name);
2300
+			snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name);
2301 2301
 			cli_dbgmsg("%s\n", cmd);
2302 2302
 #ifdef	CL_THREAD_SAFE
2303 2303
 			pthread_mutex_lock(&system_mutex);
... ...
@@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir)
2309 2309
 			snprintf(cmd, sizeof(cmd), "%s/%s", dir, name);
2310 2310
 			if(stat(cmd, &statb) >= 0)
2311 2311
 				if(statb.st_size == 0) {
2312
-					cli_warnmsg("URL %s failed to download\n", p2);
2312
+					cli_warnmsg("URL %s failed to download\n", url);
2313 2313
 					/*
2314 2314
 					 * Don't bother scanning an empty file
2315 2315
 					 */
... ...
@@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir)
2318 2318
 #endif
2319 2319
 			++n;
2320 2320
 		}
2321
-		ptr++;
2322
-		len--;
2323 2321
 	}
2322
+	html_tag_arg_free(&hrefs);
2324 2323
 	blobDestroy(b);
2325 2324
 	tableDestroy(t);
2326
-	free(normalised);
2327 2325
 
2328 2326
 #if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
2329 2327
 	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);