Browse code

Added parseEmailFile

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@1195 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/12/19 01:34:31
Showing 2 changed files
... ...
@@ -1,7 +1,13 @@
1
+Sat Dec 18 16:32:51 GMT 2004 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	Removed the need for an extra (short-lived) parse tree
4
+		on some mail formats. This will help memory and performance
5
+		in some scenarios
6
+
1 7
 Thu Dec 16 15:31:45 GMT 2004 (njh)
2 8
 ----------------------------------
3 9
   * libclamav:	Added some test software that decodes emails without parsing
4
-  			them first. It is not enabled by default, use at your
10
+			them first. It is not enabled by default, use at your
5 11
 			own risk and look at the comments first.
6 12
 		Cleared a few problems in the decoding algorithms found when
7 13
 			testing the above code
... ...
@@ -50,7 +56,7 @@ Sun Dec 12 20:34:03 GMT 2004 (njh)
50 50
 Sun Dec 12 19:40:10 UTC 2004 (acab)
51 51
 -----------------------------------
52 52
   * libclamav: upx:  improved PE rebuild - debug info on failure
53
- 
53
+
54 54
 Fri Dec 10 15:21:48 GMT 2004 (njh)
55 55
 ----------------------------------
56 56
   * libclamav/message.c:	Warn if the content-type contains a blank entry
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.202  2004/12/18 16:32:10  nigelhorne
21
+ * Added parseEmailFile
22
+ *
20 23
  * Revision 1.201  2004/12/16 15:29:08  nigelhorne
21 24
  * Tidy and add mmap test code
22 25
  *
... ...
@@ -591,7 +594,7 @@
591 591
  * Compilable under SCO; removed duplicate code with message.c
592 592
  *
593 593
  */
594
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.201 2004/12/16 15:29:08 nigelhorne Exp $";
594
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.202 2004/12/18 16:32:10 nigelhorne Exp $";
595 595
 
596 596
 #if HAVE_CONFIG_H
597 597
 #include "clamav-config.h"
... ...
@@ -731,6 +734,7 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
731 731
 #define	PARTIAL_DIR
732 732
 
733 733
 static	int	cli_parse_mbox(const char *dir, int desc, unsigned int options);
734
+static	message	*parseEmailFile(FILE *fin, const table_t *rfc821Table, const char *firstLine);
734 735
 static	message	*parseEmailHeaders(const message *m, const table_t *rfc821Table);
735 736
 static	int	parseEmailHeader(message *m, const char *line, const table_t *rfc821Table);
736 737
 static	int	parseEmailBody(message *messageIn, text *textIn, const char *dir, const table_t *rfc821Table, const table_t *subtypeTable, unsigned int options);
... ...
@@ -747,6 +751,7 @@ static	char	*rfc822comments(const char *in);
747 747
 #ifdef	PARTIAL_DIR
748 748
 static	int	rfc1341(message *m, const char *dir);
749 749
 #endif
750
+static	bool	usefulHeader(int commandNumber, const char *cmd);
750 751
 #ifdef	notdef
751 752
 static	const	char	*cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns);
752 753
 #endif
... ...
@@ -1093,7 +1098,7 @@ static int
1093 1093
 cli_parse_mbox(const char *dir, int desc, unsigned int options)
1094 1094
 {
1095 1095
 	int retcode, i;
1096
-	message *m, *body;
1096
+	message *body;
1097 1097
 	FILE *fd;
1098 1098
 	char buffer[LINE_LENGTH + 1];
1099 1099
 #ifdef HAVE_BACKTRACE
... ...
@@ -1114,12 +1119,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1114 1114
 		fclose(fd);
1115 1115
 		return CL_CLEAN;
1116 1116
 	}
1117
-	m = messageCreate();
1118
-	if(m == NULL) {
1119
-		fclose(fd);
1120
-		return CL_EMEM;
1121
-	}
1122
-
1123 1117
 #ifdef	CL_THREAD_SAFE
1124 1118
 	pthread_mutex_lock(&tables_mutex);
1125 1119
 #endif
... ...
@@ -1132,7 +1131,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1132 1132
 #ifdef	CL_THREAD_SAFE
1133 1133
 			pthread_mutex_unlock(&tables_mutex);
1134 1134
 #endif
1135
-			messageDestroy(m);
1136 1135
 			fclose(fd);
1137 1136
 			return CL_EMEM;
1138 1137
 		}
... ...
@@ -1154,8 +1152,20 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1154 1154
 		 * Have been asked to check a UNIX style mbox file, which
1155 1155
 		 * may contain more than one e-mail message to decode
1156 1156
 		 */
1157
-		bool lastLineWasEmpty = FALSE;
1158
-		int messagenumber = 1;
1157
+		bool lastLineWasEmpty;
1158
+		int messagenumber;
1159
+		message *m = messageCreate();
1160
+
1161
+		if(m == NULL) {
1162
+			fclose(fd);
1163
+#ifdef HAVE_BACKTRACE
1164
+			signal(SIGSEGV, segv);
1165
+#endif
1166
+			return CL_EMEM;
1167
+		}
1168
+
1169
+		lastLineWasEmpty = FALSE;
1170
+		messagenumber = 1;
1159 1171
 
1160 1172
 		do {
1161 1173
 			/*cli_dbgmsg("read: %s", buffer);*/
... ...
@@ -1196,11 +1206,14 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1196 1196
 				break;
1197 1197
 		} while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL);
1198 1198
 
1199
+		fclose(fd);
1200
+
1199 1201
 		cli_dbgmsg("Extract attachments from email %d\n", messagenumber);
1202
+		body = parseEmailHeaders(m, rfc821);
1203
+		messageDestroy(m);
1200 1204
 	} else {
1201 1205
 		/*
1202 1206
 		 * It's a single message, parse the headers then the body
1203
-		 * Ignore blank lines at the start of the message
1204 1207
 		 */
1205 1208
 		if(strncmp(buffer, "P I ", 4) == 0)
1206 1209
 			/*
... ...
@@ -1219,40 +1232,10 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1219 1219
 
1220 1220
 		buffer[sizeof(buffer) - 1] = '\0';
1221 1221
 
1222
-		/*
1223
-		 * FIXME: files full of new lines and nothing else are
1224
-		 * handled ungracefully...
1225
-		 */
1226
-		do {
1227
-			const char *ptr;
1228
-
1229
-			/*
1230
-			 * TODO: this needlessly creates a message object,
1231
-			 * it'd be better if parseEmailHeaders could also
1232
-			 * read in from a file. I do not want to lump the
1233
-			 * parseEmailHeaders code here, that'd be a duplication
1234
-			 * of code I want to avoid
1235
-			 */
1236
-			(void)cli_chomp(buffer);
1237
-
1238
-			/*
1239
-			 * Ignore leading CR, e.g. if newlines are LFCR instead
1240
-			 * or CRLF
1241
-			 */
1242
-			for(ptr = buffer; *ptr == '\r'; ptr++)
1243
-				;
1244
-			/*
1245
-			 * Don't blank lines which are only spaces from
1246
-			 * headers, otherwise they'll be treated as the end of
1247
-			 * header marker
1248
-			 */
1249
-			if(messageAddStr(m, ptr) < 0)
1250
-				break;
1251
-		} while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL);
1222
+		body = parseEmailFile(fd, rfc821, buffer);
1223
+		fclose(fd);
1252 1224
 	}
1253 1225
 
1254
-	fclose(fd);
1255
-
1256 1226
 	/*
1257 1227
 	 * This is not necessarily true, but since the only options are
1258 1228
 	 * CL_CLEAN and CL_VIRUS this is the better choice. It would be
... ...
@@ -1260,8 +1243,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1260 1260
 	 */
1261 1261
 	retcode = CL_CLEAN;
1262 1262
 
1263
-	body = parseEmailHeaders(m, rfc821);
1264
-	messageDestroy(m);
1265 1263
 	if(body) {
1266 1264
 		/*
1267 1265
 		 * Write out the last entry in the mailbox
... ...
@@ -1286,9 +1267,186 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1286 1286
 }
1287 1287
 
1288 1288
 /*
1289
- * The given message contains a raw e-mail.
1289
+ * Read in an email message from fin, parse it, and return the message
1290 1290
  *
1291
- * This function parses the headers of m and sets the message's arguments
1291
+ * FIXME: files full of new lines and nothing else are
1292
+ * handled ungracefully...
1293
+ */
1294
+static message *
1295
+parseEmailFile(FILE *fin, const table_t *rfc821, const char *firstLine)
1296
+{
1297
+	bool inHeader = TRUE;
1298
+	bool contMarker = FALSE;
1299
+	message *ret;
1300
+	bool anyHeadersFound = FALSE;
1301
+	int commandNumber = -1;
1302
+	char *fullline = NULL;
1303
+	size_t fulllinelength = 0;
1304
+	char buffer[LINE_LENGTH+1];
1305
+
1306
+	cli_dbgmsg("parseEmailFile\n");
1307
+
1308
+	ret = messageCreate();
1309
+	if(ret == NULL)
1310
+		return NULL;
1311
+
1312
+	strcpy(buffer, firstLine);
1313
+	do {
1314
+		const char *start;
1315
+
1316
+		(void)cli_chomp(buffer);
1317
+		/*
1318
+		 * Ignore leading CR, e.g. if newlines are LFCR instead
1319
+		 * or CRLF
1320
+		 */
1321
+		for(start = buffer; *start == '\r'; start++)
1322
+			;
1323
+
1324
+		if(start[0] == '\0')
1325
+			start = NULL;
1326
+
1327
+		/*
1328
+		 * Don't blank lines which are only spaces from headers,
1329
+		 * otherwise they'll be treated as the end of header marker
1330
+		 */
1331
+		if(inHeader) {
1332
+			cli_dbgmsg("parseEmailFile: check '%s'\n", start ? start : "");
1333
+			if(start == NULL) {	/* empty line */
1334
+				if(!contMarker) {
1335
+					/*
1336
+					 * A blank line signifies the end of
1337
+					 * the header and the start of the text
1338
+					 */
1339
+					cli_dbgmsg("End of header information\n");
1340
+					inHeader = FALSE;
1341
+				} else
1342
+					contMarker = FALSE;
1343
+			} else {
1344
+				char *ptr;
1345
+				const char *qptr;
1346
+				int quotes, lookahead;
1347
+
1348
+				if(fullline == NULL) {
1349
+					char cmd[LINE_LENGTH + 1];
1350
+
1351
+					/*
1352
+					 * Continuation of line we're ignoring?
1353
+					 */
1354
+					if((start[0] == '\t') || (start[0] == ' ') || contMarker) {
1355
+						contMarker = continuationMarker(start);
1356
+						continue;
1357
+					}
1358
+
1359
+					/*
1360
+					 * Is this a header we're interested in?
1361
+					 */
1362
+					if((strchr(start, ':') == NULL) ||
1363
+					   (cli_strtokbuf(start, 0, ":", cmd) == NULL)) {
1364
+						if(strncmp(start, "From ", 5) == 0)
1365
+							anyHeadersFound = TRUE;
1366
+						continue;
1367
+					}
1368
+
1369
+					ptr = rfc822comments(cmd);
1370
+					commandNumber = tableFind(rfc821, ptr ? ptr : cmd);
1371
+					if(ptr)
1372
+						free(ptr);
1373
+
1374
+					switch(commandNumber) {
1375
+						case CONTENT_TRANSFER_ENCODING:
1376
+						case CONTENT_DISPOSITION:
1377
+						case CONTENT_TYPE:
1378
+							anyHeadersFound = TRUE;
1379
+							break;
1380
+						default:
1381
+							if(!anyHeadersFound)
1382
+								anyHeadersFound = usefulHeader(commandNumber, cmd);
1383
+							continue;
1384
+					}
1385
+					fullline = strdup(start);
1386
+					fulllinelength = strlen(start) + 1;
1387
+				} else if(start != NULL) {
1388
+					fulllinelength += strlen(start);
1389
+					fullline = cli_realloc(fullline, fulllinelength);
1390
+					strcat(fullline, start);
1391
+				}
1392
+
1393
+				contMarker = continuationMarker(start);
1394
+
1395
+				if(contMarker)
1396
+					continue;
1397
+
1398
+				assert(fullline != NULL);
1399
+
1400
+				lookahead = getc(fin);
1401
+				if(lookahead != EOF) {
1402
+					ungetc(lookahead, fin);
1403
+
1404
+					/*
1405
+					 * Section B.2 of RFC822 says TAB or
1406
+					 * SPACE means a continuation of the
1407
+					 * previous entry.
1408
+					 *
1409
+					 * Add all the arguments on the line
1410
+					 */
1411
+					if((lookahead == '\t') || (lookahead == ' '))
1412
+						continue;
1413
+				}
1414
+
1415
+				quotes = 0;
1416
+				for(qptr = start; *qptr; qptr++)
1417
+					if(*qptr == '\"')
1418
+						quotes++;
1419
+
1420
+				if(quotes & 1)
1421
+					continue;
1422
+
1423
+				ptr = rfc822comments(fullline);
1424
+				if(ptr) {
1425
+					free(fullline);
1426
+					fullline = ptr;
1427
+				}
1428
+
1429
+				if(parseEmailHeader(ret, fullline, rfc821) < 0)
1430
+					continue;
1431
+
1432
+				free(fullline);
1433
+				fullline = NULL;
1434
+			}
1435
+		} else
1436
+			/*cli_dbgmsg("Add line to body '%s'\n", start);*/
1437
+			if(messageAddStr(ret, start) < 0)
1438
+				break;
1439
+	} while(fgets(buffer, sizeof(buffer) - 1, fin) != NULL);
1440
+
1441
+	if(fullline) {
1442
+		if(*fullline) switch(commandNumber) {
1443
+			case CONTENT_TRANSFER_ENCODING:
1444
+			case CONTENT_DISPOSITION:
1445
+			case CONTENT_TYPE:
1446
+				cli_warnmsg("parseEmailHeaders: Fullline set '%s' - report to bugs@clamav.net\n", fullline);
1447
+		}
1448
+		free(fullline);
1449
+	}
1450
+
1451
+	if(!anyHeadersFound) {
1452
+		/*
1453
+		 * False positive in believing we have an e-mail when we don't
1454
+		 */
1455
+		messageDestroy(ret);
1456
+		cli_dbgmsg("parseEmailFile: no headers found, assuming it isn't an email\n");
1457
+		return NULL;
1458
+	}
1459
+
1460
+	messageClean(ret);
1461
+
1462
+	cli_dbgmsg("parseEmailFile: return\n");
1463
+
1464
+	return ret;
1465
+}
1466
+
1467
+/*
1468
+ * The given message contains a raw e-mail.
1292 1469
  *
1293 1470
  * Returns the message's body with the correct arguments set
1294 1471
  *
... ...
@@ -1296,6 +1454,8 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options)
1296 1296
  * of the message in memory, the upside is that it makes for easier parsing
1297 1297
  * of encapsulated messages, and in the long run uses less memory in those
1298 1298
  * scenarios
1299
+ *
1300
+ * TODO: remove the duplication with parseEmailFile
1299 1301
  */
1300 1302
 static message *
1301 1303
 parseEmailHeaders(const message *m, const table_t *rfc821)
... ...
@@ -1326,23 +1486,21 @@ parseEmailHeaders(const message *m, const table_t *rfc821)
1326 1326
 
1327 1327
 		if(inHeader) {
1328 1328
 			cli_dbgmsg("parseEmailHeaders: check '%s'\n", buffer ? buffer : "");
1329
-			if((buffer == NULL) && !contMarker) {
1330
-				/*
1331
-				 * A blank line signifies the end of the header
1332
-				 * and the start of the text
1333
-				 */
1334
-				cli_dbgmsg("End of header information\n");
1335
-				inHeader = FALSE;
1329
+			if(buffer == NULL) {
1330
+				if(!contMarker) {
1331
+					/*
1332
+					 * A blank line signifies the end of
1333
+					 * the header and the start of the text
1334
+					 */
1335
+					cli_dbgmsg("End of header information\n");
1336
+					inHeader = FALSE;
1337
+				} else
1338
+					contMarker = FALSE;
1336 1339
 			} else {
1337 1340
 				char *ptr;
1338 1341
 				const char *qptr;
1339 1342
 				int quotes;
1340 1343
 
1341
-				if(buffer == NULL) {
1342
-					contMarker = FALSE;
1343
-					continue;
1344
-				}
1345
-
1346 1344
 				if(fullline == NULL) {
1347 1345
 					char cmd[LINE_LENGTH + 1];
1348 1346
 
... ...
@@ -1376,12 +1534,8 @@ parseEmailHeaders(const message *m, const table_t *rfc821)
1376 1376
 							anyHeadersFound = TRUE;
1377 1377
 							break;
1378 1378
 						default:
1379
-							if(strcasecmp(cmd, "From") == 0)
1380
-								anyHeadersFound = TRUE;
1381
-							else if(strcasecmp(cmd, "Received") == 0)
1382
-								anyHeadersFound = TRUE;
1383
-							else if(strcasecmp(cmd, "De") == 0)
1384
-								anyHeadersFound = TRUE;
1379
+							if(!anyHeadersFound)
1380
+								anyHeadersFound = usefulHeader(commandNumber, cmd);
1385 1381
 							continue;
1386 1382
 					}
1387 1383
 					fullline = strdup(buffer);
... ...
@@ -1397,9 +1551,9 @@ parseEmailHeaders(const message *m, const table_t *rfc821)
1397 1397
 				if(contMarker)
1398 1398
 					continue;
1399 1399
 
1400
-				if(t->t_next && (t->t_next->t_line != NULL)) {
1401
-					const char *next = lineGetData(t->t_next->t_line);
1400
+				assert(fullline != NULL);
1402 1401
 
1402
+				if(t->t_next && (t->t_next->t_line != NULL))
1403 1403
 					/*
1404 1404
 					 * Section B.2 of RFC822 says TAB or
1405 1405
 					 * SPACE means a continuation of the
... ...
@@ -1407,9 +1561,11 @@ parseEmailHeaders(const message *m, const table_t *rfc821)
1407 1407
 					 *
1408 1408
 					 * Add all the arguments on the line
1409 1409
 					 */
1410
-					if((next[0] == '\t') || (next[0] == ' '))
1411
-						continue;
1412
-				}
1410
+					switch(lineGetData(t->t_next->t_line)[0]) {
1411
+						case ' ':
1412
+						case '\t':
1413
+							continue;
1414
+					}
1413 1415
 
1414 1416
 				quotes = 0;
1415 1417
 				for(qptr = buffer; *qptr; qptr++)
... ...
@@ -1425,13 +1581,11 @@ parseEmailHeaders(const message *m, const table_t *rfc821)
1425 1425
 					fullline = ptr;
1426 1426
 				}
1427 1427
 
1428
-				if(fullline) {
1429
-					if(parseEmailHeader(ret, fullline, rfc821) < 0)
1430
-						continue;
1428
+				if(parseEmailHeader(ret, fullline, rfc821) < 0)
1429
+					continue;
1431 1430
 
1432
-					free(fullline);
1433
-					fullline = NULL;
1434
-				}
1431
+				free(fullline);
1432
+				fullline = NULL;
1435 1433
 			}
1436 1434
 		} else
1437 1435
 			/*cli_dbgmsg("Add line to body '%s'\n", buffer);*/
... ...
@@ -3725,6 +3879,26 @@ print_trace(int use_syslog)
3725 3725
 }
3726 3726
 #endif
3727 3727
 
3728
+static bool
3729
+usefulHeader(int commandNumber, const char *cmd)
3730
+{
3731
+	switch(commandNumber) {
3732
+		case CONTENT_TRANSFER_ENCODING:
3733
+		case CONTENT_DISPOSITION:
3734
+		case CONTENT_TYPE:
3735
+			return TRUE;
3736
+		default:
3737
+			if(strcasecmp(cmd, "From") == 0)
3738
+				return TRUE;
3739
+			else if(strcasecmp(cmd, "Received") == 0)
3740
+				return TRUE;
3741
+			else if(strcasecmp(cmd, "De") == 0)
3742
+				return TRUE;
3743
+	}
3744
+
3745
+	return FALSE;
3746
+}
3747
+
3728 3748
 #ifdef	notdef
3729 3749
 /*
3730 3750
  * like cli_memstr - but returns the location of the match