git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@865 77e5149b-7576-45b1-b177-96237e5ba77b
Nigel Horne authored on 2004/09/15 05:51:01... | ... |
@@ -1,3 +1,10 @@ |
1 |
+Tue Sep 14 21:48:36 BST 2004 (njh) |
|
2 |
+---------------------------------- |
|
3 |
+ * libclamav/mbox.c: FOLLOWURLS: now uses the new normalisation code to |
|
4 |
+ find URLs to scan for trojans. This means |
|
5 |
+ better scanning of HTML than the old FOLLOWURLS |
|
6 |
+ code and all is now done in RAM |
|
7 |
+ |
|
1 | 8 |
Tue Sep 14 22:32:50 CEST 2004 (tk) |
2 | 9 |
---------------------------------- |
3 | 10 |
* libclamav: do not print outdate warning for main.cvd |
... | ... |
@@ -17,6 +17,9 @@ |
17 | 17 |
* |
18 | 18 |
* Change History: |
19 | 19 |
* $Log: mbox.c,v $ |
20 |
+ * Revision 1.119 2004/09/14 20:47:28 nigelhorne |
|
21 |
+ * Use new normalise code |
|
22 |
+ * |
|
20 | 23 |
* Revision 1.118 2004/09/14 12:09:37 nigelhorne |
21 | 24 |
* Include old normalise code |
22 | 25 |
* |
... | ... |
@@ -342,7 +345,7 @@ |
342 | 342 |
* Compilable under SCO; removed duplicate code with message.c |
343 | 343 |
* |
344 | 344 |
*/ |
345 |
-static char const rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $"; |
|
345 |
+static char const rcsid[] = "$Id: mbox.c,v 1.119 2004/09/14 20:47:28 nigelhorne Exp $"; |
|
346 | 346 |
|
347 | 347 |
#if HAVE_CONFIG_H |
348 | 348 |
#include "clamav-config.h" |
... | ... |
@@ -427,6 +430,8 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; |
427 | 427 |
|
428 | 428 |
#ifdef FOLLOWURLS |
429 | 429 |
|
430 |
+#include "htmlnorm.h" |
|
431 |
+ |
|
430 | 432 |
#define MAX_URLS 5 /* |
431 | 433 |
* Maximum number of URLs scanned in a message |
432 | 434 |
* part |
... | ... |
@@ -2110,92 +2115,18 @@ saveTextPart(message *m, const char *dir) |
2110 | 2110 |
} |
2111 | 2111 |
|
2112 | 2112 |
#ifdef FOLLOWURLS |
2113 |
- |
|
2114 |
-/* |
|
2115 |
- * TODO: Use the newer normalise code |
|
2116 |
- * This is the old normalise code which normalises in memory. The new |
|
2117 |
- * code uses temporary files and has a different API. |
|
2118 |
- * |
|
2119 |
-* Normalize an HTML buffer using the following rules: |
|
2120 |
- o Remove multiple contiguous spaces |
|
2121 |
- o Remove spaces around '<' and '>' in tags |
|
2122 |
- o Remove spaces around '=' in tags |
|
2123 |
- o Replace single quote with double quote in tags |
|
2124 |
- o Convert to lowercase |
|
2125 |
- o Convert all white space to a space character |
|
2126 |
-*/ |
|
2127 |
-static unsigned char * |
|
2128 |
-mbox_html_normalize(unsigned char *in_buff, off_t in_size) |
|
2129 |
-{ |
|
2130 |
- unsigned char *out_buff; |
|
2131 |
- off_t out_size=0, i; |
|
2132 |
- int had_space=FALSE, tag_depth=0, in_quote=FALSE; |
|
2133 |
- |
|
2134 |
- out_buff = (unsigned char *)cli_malloc(in_size+1); |
|
2135 |
- if (!out_buff) { |
|
2136 |
- cli_errmsg("malloc failed"); |
|
2137 |
- return NULL; |
|
2138 |
- } |
|
2139 |
- |
|
2140 |
- for (i=0 ; i < in_size ; i++) { |
|
2141 |
- if (in_buff[i] == '<') { |
|
2142 |
- out_buff[out_size++] = '<'; |
|
2143 |
- tag_depth++; |
|
2144 |
- if (tag_depth == 1) { |
|
2145 |
- had_space=TRUE; /* consume spaces */ |
|
2146 |
- } |
|
2147 |
- } else if ((in_buff[i] == '=') && (tag_depth == 1)) { |
|
2148 |
- /* Remove preceding spaces */ |
|
2149 |
- while ((out_size > 0) && |
|
2150 |
- (out_buff[out_size-1] == ' ')) { |
|
2151 |
- out_size--; |
|
2152 |
- } |
|
2153 |
- out_buff[out_size++] = '='; |
|
2154 |
- had_space=TRUE; |
|
2155 |
- } else if (isspace(in_buff[i])) { |
|
2156 |
- if (!had_space) { |
|
2157 |
- out_buff[out_size++] = ' '; |
|
2158 |
- had_space=TRUE; |
|
2159 |
- } |
|
2160 |
- } else if (in_buff[i] == '>') { |
|
2161 |
- /* Remove preceding spaces */ |
|
2162 |
- if (tag_depth == 1) { |
|
2163 |
- while ((out_size > 0) && |
|
2164 |
- (out_buff[out_size-1] == ' ')) { |
|
2165 |
- out_size--; |
|
2166 |
- } |
|
2167 |
- } |
|
2168 |
- out_buff[out_size++] = '>'; |
|
2169 |
- tag_depth--; |
|
2170 |
- } else if ((in_buff[i] == '\'') && (tag_depth==1)) { |
|
2171 |
- /* Convert single quotes to double quotes */ |
|
2172 |
- if (in_quote || out_buff[out_size-1] == '=') { |
|
2173 |
- out_buff[out_size++] = '\"'; |
|
2174 |
- in_quote = !in_quote; |
|
2175 |
- } else { |
|
2176 |
- out_buff[out_size++] = '\''; |
|
2177 |
- } |
|
2178 |
- } else { |
|
2179 |
- out_buff[out_size++] = tolower(in_buff[i]); |
|
2180 |
- had_space=FALSE; |
|
2181 |
- } |
|
2182 |
- } |
|
2183 |
- out_buff[out_size] = '\0'; |
|
2184 |
- return out_buff; |
|
2185 |
-} |
|
2186 |
- |
|
2187 | 2113 |
static void |
2188 | 2114 |
checkURLs(message *m, const char *dir) |
2189 | 2115 |
{ |
2190 | 2116 |
blob *b = messageToBlob(m); |
2191 |
- char *ptr, *normalised; |
|
2192 | 2117 |
size_t len; |
2193 | 2118 |
table_t *t; |
2194 |
- int n; |
|
2119 |
+ int i, n; |
|
2195 | 2120 |
#if defined(WITH_CURL) && defined(CL_THREAD_SAFE) |
2196 | 2121 |
pthread_t tid[MAX_URLS]; |
2197 | 2122 |
struct arg args[MAX_URLS]; |
2198 | 2123 |
#endif |
2124 |
+ tag_arguments_t hrefs; |
|
2199 | 2125 |
|
2200 | 2126 |
if(b == NULL) |
2201 | 2127 |
return; |
... | ... |
@@ -2213,23 +2144,26 @@ checkURLs(message *m, const char *dir) |
2213 | 2213 |
|
2214 | 2214 |
t = tableCreate(); |
2215 | 2215 |
|
2216 |
- n = 0; |
|
2217 |
- normalised = ptr = mbox_html_normalize(blobGetData(b), len); |
|
2216 |
+ memset(&hrefs, '\0', sizeof(hrefs)); |
|
2217 |
+ |
|
2218 |
+ cli_dbgmsg("checkURLs: calling html_normalise_mem\n"); |
|
2219 |
+ html_normalise_mem(blobGetData(b), len, NULL, &hrefs); |
|
2220 |
+ cli_dbgmsg("checkURLs: html_normalise_mem returned\n"); |
|
2218 | 2221 |
|
2219 |
- if(normalised == NULL) { |
|
2222 |
+ /*if(href == NULL) { |
|
2220 | 2223 |
blobDestroy(b); |
2221 | 2224 |
tableDestroy(t); |
2222 | 2225 |
return; |
2223 |
- } |
|
2226 |
+ }*/ |
|
2224 | 2227 |
/* TODO: Do we need to call remove_html_comments? */ |
2225 | 2228 |
|
2226 |
- /* |
|
2227 |
- * cli_memstr(ptr, len, "<a href=", 8) |
|
2228 |
- * Don't use cli_memstr() until bounds problem sorted |
|
2229 |
- * and it returns the place that the 'needle' was found |
|
2230 |
- */ |
|
2231 |
- while(len >= 8) { |
|
2232 |
- if(strncasecmp(ptr, "<a href=", 8) == 0) { |
|
2229 |
+ n = 0; |
|
2230 |
+ |
|
2231 |
+ for(i = 0; i < hrefs.count; i++) { |
|
2232 |
+ const char *url = hrefs.value[i]; |
|
2233 |
+ |
|
2234 |
+ if(strncasecmp("http://", url, 7) == 0) { |
|
2235 |
+ char *ptr; |
|
2233 | 2236 |
#ifdef WITH_CURL |
2234 | 2237 |
#ifndef CL_THREAD_SAFE |
2235 | 2238 |
struct arg arg; |
... | ... |
@@ -2242,52 +2176,31 @@ checkURLs(message *m, const char *dir) |
2242 | 2242 |
struct stat statb; |
2243 | 2243 |
char cmd[512]; |
2244 | 2244 |
#endif /*WITH_CURL*/ |
2245 |
- char *p2 = &ptr[8]; |
|
2246 |
- char *p3; |
|
2247 |
- char name[512]; |
|
2248 |
- |
|
2249 |
- len -= 8; |
|
2250 |
- while((len > 0) && ((*p2 == '\"') || isspace(*p2))) { |
|
2251 |
- len--; |
|
2252 |
- p2++; |
|
2253 |
- } |
|
2254 |
- if(len == 0) |
|
2255 |
- break; |
|
2256 |
- ptr = p2; |
|
2257 |
- while((len > 0) && (isalnum(*ptr) || strchr("./?:%", *ptr))) { |
|
2258 |
- ptr++; |
|
2259 |
- len--; |
|
2260 |
- } |
|
2261 |
- if(len == 0) |
|
2262 |
- break; |
|
2263 |
- *ptr = '\0'; |
|
2264 |
- if(strncasecmp(p2, "mailto:", 7) == 0) |
|
2265 |
- continue; |
|
2266 |
- if(*p2 == '\0') |
|
2267 |
- continue; |
|
2268 |
- if(tableFind(t, p2) == 1) { |
|
2269 |
- cli_dbgmsg("URL %s already downloaded\n", p2); |
|
2245 |
+ char name[NAME_MAX]; |
|
2246 |
+ |
|
2247 |
+ if(tableFind(t, url) == 1) { |
|
2248 |
+ cli_dbgmsg("URL %s already downloaded\n", url); |
|
2270 | 2249 |
continue; |
2271 | 2250 |
} |
2272 | 2251 |
if(n == MAX_URLS) { |
2273 | 2252 |
cli_warnmsg("Not all URLs will be scanned\n"); |
2274 | 2253 |
break; |
2275 | 2254 |
} |
2276 |
- (void)tableInsert(t, p2, 1); |
|
2277 |
- cli_dbgmsg("Downloading URL %s to be scanned\n", p2); |
|
2278 |
- strncpy(name, p2, sizeof(name)); |
|
2279 |
- for(p3 = name; *p3; p3++) |
|
2280 |
- if(*p3 == '/') |
|
2281 |
- *p3 = '_'; |
|
2255 |
+ (void)tableInsert(t, url, 1); |
|
2256 |
+ cli_dbgmsg("Downloading URL %s to be scanned\n", url); |
|
2257 |
+ strncpy(name, url, sizeof(name)); |
|
2258 |
+ for(ptr = name; *ptr; ptr++) |
|
2259 |
+ if(*ptr == '/') |
|
2260 |
+ *ptr = '_'; |
|
2282 | 2261 |
|
2283 | 2262 |
#ifdef WITH_CURL |
2284 | 2263 |
#ifdef CL_THREAD_SAFE |
2285 |
- args[n].url = strdup(p2); |
|
2264 |
+ args[n].url = strdup(url); |
|
2286 | 2265 |
args[n].dir = strdup(dir); |
2287 | 2266 |
args[n].filename = strdup(name); |
2288 | 2267 |
pthread_create(&tid[n], NULL, getURL, &args[n]); |
2289 | 2268 |
#else |
2290 |
- arg.url = p2; |
|
2269 |
+ arg.url = url; |
|
2291 | 2270 |
arg.dir = dir; |
2292 | 2271 |
arg.filename = name; |
2293 | 2272 |
getURL(&arg); |
... | ... |
@@ -2297,7 +2210,7 @@ checkURLs(message *m, const char *dir) |
2297 | 2297 |
/* |
2298 | 2298 |
* TODO: maximum size and timeouts |
2299 | 2299 |
*/ |
2300 |
- snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", p2, dir, name); |
|
2300 |
+ snprintf(cmd, sizeof(cmd), "GET -t10 %s > %s/%s 2>/dev/null", url, dir, name); |
|
2301 | 2301 |
cli_dbgmsg("%s\n", cmd); |
2302 | 2302 |
#ifdef CL_THREAD_SAFE |
2303 | 2303 |
pthread_mutex_lock(&system_mutex); |
... | ... |
@@ -2309,7 +2222,7 @@ checkURLs(message *m, const char *dir) |
2309 | 2309 |
snprintf(cmd, sizeof(cmd), "%s/%s", dir, name); |
2310 | 2310 |
if(stat(cmd, &statb) >= 0) |
2311 | 2311 |
if(statb.st_size == 0) { |
2312 |
- cli_warnmsg("URL %s failed to download\n", p2); |
|
2312 |
+ cli_warnmsg("URL %s failed to download\n", url); |
|
2313 | 2313 |
/* |
2314 | 2314 |
* Don't bother scanning an empty file |
2315 | 2315 |
*/ |
... | ... |
@@ -2318,12 +2231,10 @@ checkURLs(message *m, const char *dir) |
2318 | 2318 |
#endif |
2319 | 2319 |
++n; |
2320 | 2320 |
} |
2321 |
- ptr++; |
|
2322 |
- len--; |
|
2323 | 2321 |
} |
2322 |
+ html_tag_arg_free(&hrefs); |
|
2324 | 2323 |
blobDestroy(b); |
2325 | 2324 |
tableDestroy(t); |
2326 |
- free(normalised); |
|
2327 | 2325 |
|
2328 | 2326 |
#if defined(WITH_CURL) && defined(CL_THREAD_SAFE) |
2329 | 2327 |
cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n); |