Browse code

Multithread the FollowURL calls

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@768 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/08/19 06:39:36
Showing 2 changed files
... ...
@@ -1,3 +1,7 @@
1
+Wed Aug 18 22:38:39 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	Multithread the followURL code - URLs are now followed in parallel
4
+
1 5
 Wed Aug 18 20:37:42 CEST 2004 (tk)
2 6
 ----------------------------------
3 7
   * libclamav/contrib: Include database optimisation tool (optimize/optimize.c).
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.105  2004/08/18 21:35:08  nigelhorne
21
+ * Multithread the FollowURL calls
22
+ *
20 23
  * Revision 1.104  2004/08/18 15:53:43  nigelhorne
21 24
  * Honour CL_MAILURL
22 25
  *
... ...
@@ -300,7 +303,7 @@
300 300
  * Compilable under SCO; removed duplicate code with message.c
301 301
  *
302 302
  */
303
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.104 2004/08/18 15:53:43 nigelhorne Exp $";
303
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.105 2004/08/18 21:35:08 nigelhorne Exp $";
304 304
 
305 305
 #if HAVE_CONFIG_H
306 306
 #include "clamav-config.h"
... ...
@@ -415,7 +418,16 @@ static	bool	saveFile(const blob *b, const char *dir);
415 415
 
416 416
 static	void	checkURLs(message *m, const char *dir);
417 417
 #ifdef	WITH_CURL
418
-static	void	getURL(const char *url, const char *dir, const char *filename);
418
+struct arg {
419
+	char *url;
420
+	char *dir;
421
+	char *filename;
422
+};
423
+#ifdef	CL_THREAD_SAFE
424
+static	void	*getURL(void *a);
425
+#else
426
+static	void	*getURL(struct arg *arg);
427
+#endif
419 428
 #endif
420 429
 
421 430
 
... ...
@@ -2307,6 +2319,10 @@ checkURLs(message *m, const char *dir)
2307 2307
 	size_t len;
2308 2308
 	table_t *t;
2309 2309
 	int n;
2310
+#if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
2311
+	pthread_t tid[MAX_URLS];
2312
+	struct arg args[MAX_URLS];
2313
+#endif
2310 2314
 
2311 2315
 	if(b == NULL)
2312 2316
 		return;
... ...
@@ -2335,13 +2351,18 @@ checkURLs(message *m, const char *dir)
2335 2335
 	while(len >= 8) {
2336 2336
 		/* FIXME: allow any number of white space */
2337 2337
 		if(strncasecmp(ptr, "<a href=", 8) == 0) {
2338
-#ifndef	WITH_CURL
2338
+#ifdef	WITH_CURL
2339
+#ifndef	CL_THREAD_SAFE
2340
+			struct arg arg;
2341
+#endif
2342
+
2343
+#else	/*!WITH_CURL*/
2339 2344
 #ifdef	CL_THREAD_SAFE
2340 2345
 			static pthread_mutex_t system_mutex = PTHREAD_MUTEX_INITIALIZER;
2341 2346
 #endif
2342 2347
 			struct stat statb;
2343 2348
 			char cmd[512];
2344
-#endif
2349
+#endif	/*WITH_CURL*/
2345 2350
 			char *p2 = &ptr[8];
2346 2351
 			char *p3;
2347 2352
 			char name[512];
... ...
@@ -2365,6 +2386,10 @@ checkURLs(message *m, const char *dir)
2365 2365
 				continue;
2366 2366
 			if(*p2 == '\0')
2367 2367
 				continue;
2368
+			if(n == MAX_URLS) {
2369
+				cli_warnmsg("Not all URLs will be scanned\n");
2370
+				break;
2371
+			}
2368 2372
 			if(tableFind(t, p2) == 1) {
2369 2373
 				cli_dbgmsg("URL %s already downloaded\n", p2);
2370 2374
 				continue;
... ...
@@ -2377,7 +2402,18 @@ checkURLs(message *m, const char *dir)
2377 2377
 					*p3 = '_';
2378 2378
 
2379 2379
 #ifdef	WITH_CURL
2380
-			getURL(p2, dir, name);
2380
+#ifdef	CL_THREAD_SAFE
2381
+			args[n].url = strdup(p2);
2382
+			args[n].dir = strdup(dir);
2383
+			args[n].filename = strdup(name);
2384
+			pthread_create(&tid[n], NULL, getURL, &args[n]);
2385
+#else
2386
+			arg.url = p2;
2387
+			arg.dir = dir;
2388
+			arg.filename = name;
2389
+			getURL(&arg);
2390
+#endif
2391
+
2381 2392
 #else
2382 2393
 			/*
2383 2394
 			 * TODO: maximum size and timeouts
... ...
@@ -2401,48 +2437,78 @@ checkURLs(message *m, const char *dir)
2401 2401
 					(void)unlink(cmd);
2402 2402
 				}
2403 2403
 #endif
2404
-			if(++n > MAX_URLS) {
2405
-				cli_warnmsg("Not all URLs will be scanned\n");
2406
-				break;
2407
-			}
2404
+			++n;
2408 2405
 		}
2409 2406
 		ptr++;
2410 2407
 		len--;
2411 2408
 	}
2412 2409
 	blobDestroy(b);
2413 2410
 	tableDestroy(t);
2411
+
2412
+#if	defined(WITH_CURL) && defined(CL_THREAD_SAFE)
2413
+	cli_dbgmsg("checkURLs: waiting for %d thread(s) to finish\n", n);
2414
+	while(--n >= 0) {
2415
+		pthread_join(tid[n], NULL);
2416
+		free(args[n].url);
2417
+		free(args[n].dir);
2418
+		free(args[n].filename);
2419
+	}
2420
+#endif
2414 2421
 }
2415 2422
 
2416 2423
 #ifdef	WITH_CURL
2424
+static void *
2425
+#ifdef	CL_THREAD_SAFE
2426
+getURL(void *a)
2427
+#else
2417 2428
 static void
2418
-getURL(const char *url, const char *dir, const char *filename)
2429
+getURL(struct arg *arg)
2430
+#endif
2419 2431
 {
2420 2432
 	char *fout;
2421 2433
 	CURL *curl;
2422 2434
 	FILE *fp;
2423 2435
 	struct curl_slist *headers;
2424 2436
 	static int initialised = 0;
2437
+#ifdef	CL_THREAD_SAFE
2438
+	static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
2439
+	struct arg *arg = (struct arg *)a;
2440
+#endif
2441
+	const char *url = arg->url;
2442
+	const char *dir = arg->dir;
2443
+	const char *filename = arg->filename;
2425 2444
 
2445
+#ifdef	CL_THREAD_SAFE
2446
+	pthread_mutex_lock(&init_mutex);
2447
+#endif
2426 2448
 	if(!initialised) {
2427
-		if(curl_global_init(CURL_GLOBAL_NOTHING) != 0)
2428
-			return;
2449
+		if(curl_global_init(CURL_GLOBAL_NOTHING) != 0) {
2450
+#ifdef	CL_THREAD_SAFE
2451
+			pthread_mutex_unlock(&init_mutex);
2452
+#endif
2453
+			return NULL;
2454
+		}
2429 2455
 		initialised = 1;
2430 2456
 	}
2457
+#ifdef	CL_THREAD_SAFE
2458
+	pthread_mutex_unlock(&init_mutex);
2459
+#endif
2460
+
2431 2461
 	/* easy isn't the word I'd use... */
2432 2462
 	curl = curl_easy_init();
2433 2463
 	if(curl == NULL)
2434
-		return;
2464
+		return NULL;
2435 2465
 
2436 2466
 	(void)curl_easy_setopt(curl, CURLOPT_USERAGENT, "www.clamav.net");
2437 2467
 
2438 2468
 	if(curl_easy_setopt(curl, CURLOPT_URL, url) != 0)
2439
-		return;
2469
+		return NULL;
2440 2470
 
2441 2471
 	fout = cli_malloc(strlen(dir) + strlen(filename) + 2);
2442 2472
 
2443 2473
 	if(fout == NULL) {
2444 2474
 		curl_easy_cleanup(curl);
2445
-		return;
2475
+		return NULL;
2446 2476
 	}
2447 2477
 
2448 2478
 	sprintf(fout, "%s/%s", dir, filename);
... ...
@@ -2453,8 +2519,15 @@ getURL(const char *url, const char *dir, const char *filename)
2453 2453
 		perror(fout);
2454 2454
 		free(fout);
2455 2455
 		curl_easy_cleanup(curl);
2456
-		return;
2456
+		return NULL;
2457 2457
 	}
2458
+	if(curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp) != 0) {
2459
+		fclose(fp);
2460
+		free(fout);
2461
+		curl_easy_cleanup(curl);
2462
+		return NULL;
2463
+	}
2464
+
2458 2465
 	/*
2459 2466
 	 * If an item is in squid's cache get it from there (TCP_HIT/200)
2460 2467
 	 * by default curl doesn't (TCP_CLIENT_REFRESH_MISS/200)
... ...
@@ -2462,13 +2535,6 @@ getURL(const char *url, const char *dir, const char *filename)
2462 2462
 	headers = curl_slist_append(NULL, "Pragma:");
2463 2463
 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
2464 2464
 
2465
-	if(curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp) != 0) {
2466
-		fclose(fp);
2467
-		free(fout);
2468
-		curl_easy_cleanup(curl);
2469
-		return;
2470
-	}
2471
-
2472 2465
 	/* These should be customisable */
2473 2466
 	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30);
2474 2467
 	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10);
... ...
@@ -2476,14 +2542,29 @@ getURL(const char *url, const char *dir, const char *filename)
2476 2476
 	curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 50*1024);
2477 2477
 #endif
2478 2478
 
2479
+#ifdef  CL_THREAD_SAFE
2480
+	curl_easy_setopt(curl, CURLOPT_DNS_USE_GLOBAL_CACHE, 0);
2481
+#endif
2482
+	/*
2483
+	 * FIXME: valgrind reports "pthread_mutex_unlock: mutex is not locked"
2484
+	 * from gethostbyaddr_r within this. It may be a bug in libcurl
2485
+	 * rather than this code, but I need to check, see Curl_resolv()
2486
+	 * If pushed really hard it will sometimes say
2487
+	 * Conditional jump or move depends on uninitialised value(s) and
2488
+	 * quit. But the program seems to work OK without valgrind...
2489
+	 * Perhaps Curl_resolv() isn't thread safe?
2490
+	 */
2479 2491
 	if(curl_easy_perform(curl) != CURLE_OK) {
2480 2492
 		cli_warnmsg("URL %s failed to download\n", url);
2481 2493
 		unlink(fout);
2482 2494
 	}
2483 2495
 
2484 2496
 	fclose(fp);
2497
+	curl_slist_free_all(headers);
2485 2498
 	curl_easy_cleanup(curl);
2486 2499
 	free(fout);
2500
+
2501
+	return NULL;
2487 2502
 }
2488 2503
 #endif
2489 2504