Browse code

Handle multiple encoding types

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@871 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/09/16 03:11:55
Showing 4 changed files
... ...
@@ -1,3 +1,8 @@
1
+Wed Sep 15 19:09:56 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav:	Handle e-mails where the attachment misreports the type of
4
+			encoding used
5
+
1 6
 Wed Sep 15 18:46:44 CEST 2004 (tk)
2 7
 ----------------------------------
3 8
   * libclamav/matcher.c: fix problem with uninitialised voffset structure
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.121  2004/09/15 18:08:23  nigelhorne
21
+ * Handle multiple encoding types
22
+ *
20 23
  * Revision 1.120  2004/09/15 08:47:07  nigelhorne
21 24
  * Cleaner way to initialise hrefs
22 25
  *
... ...
@@ -348,7 +351,7 @@
348 348
  * Compilable under SCO; removed duplicate code with message.c
349 349
  *
350 350
  */
351
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.120 2004/09/15 08:47:07 nigelhorne Exp $";
351
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.121 2004/09/15 18:08:23 nigelhorne Exp $";
352 352
 
353 353
 #if HAVE_CONFIG_H
354 354
 #include "clamav-config.h"
... ...
@@ -581,7 +584,7 @@ cli_mbox(const char *dir, int desc, unsigned int options)
581 581
 	message *m, *body;
582 582
 	FILE *fd;
583 583
 	char buffer[LINE_LENGTH];
584
-#ifdef	CL_DEBUG
584
+#ifdef HAVE_BACKTRACE
585 585
 	void (*segv)(int);
586 586
 #endif
587 587
 	static table_t *rfc821, *subtype;
... ...
@@ -1659,8 +1662,7 @@ parseEmailBody(message *messageIn, text *textIn, const char *dir, const table_t
1659 1659
 		fb = messageToFileblob(messages[i], dir);
1660 1660
 
1661 1661
 		if(fb) {
1662
-			cli_dbgmsg("Saving multipart %d, encoded with scheme %d\n",
1663
-				i, messageGetEncoding(messages[i]));
1662
+			cli_dbgmsg("Saving multipart %d\n", i);
1664 1663
 
1665 1664
 			fileblobDestroy(fb);
1666 1665
 		}
... ...
@@ -2110,8 +2112,7 @@ saveTextPart(message *m, const char *dir)
2110 2110
 		/*
2111 2111
 		 * Save main part to scan that
2112 2112
 		 */
2113
-		cli_dbgmsg("Saving main message, encoded with scheme %d\n",
2114
-				messageGetEncoding(m));
2113
+		cli_dbgmsg("Saving main message, encoded with scheme\n");
2115 2114
 
2116 2115
 		fileblobDestroy(fb);
2117 2116
 	}
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: message.c,v $
20
+ * Revision 1.78  2004/09/15 18:08:23  nigelhorne
21
+ * Handle multiple encoding types
22
+ *
20 23
  * Revision 1.77  2004/09/13 16:44:01  kojm
21 24
  * minor cleanup
22 25
  *
... ...
@@ -228,7 +231,7 @@
228 228
  * uuencodebegin() no longer static
229 229
  *
230 230
  */
231
-static	char	const	rcsid[] = "$Id: message.c,v 1.77 2004/09/13 16:44:01 kojm Exp $";
231
+static	char	const	rcsid[] = "$Id: message.c,v 1.78 2004/09/15 18:08:23 nigelhorne Exp $";
232 232
 
233 233
 #if HAVE_CONFIG_H
234 234
 #include "clamav-config.h"
... ...
@@ -280,7 +283,7 @@ typedef enum { FALSE = 0, TRUE = 1 } bool;
280 280
 
281 281
 static	void	messageIsEncoding(message *m);
282 282
 static	const	text	*binhexBegin(const message *m);
283
-static	unsigned char	*decodeLine(message *m, const char *line, unsigned char *buf, size_t buflen);
283
+static	unsigned char	*decodeLine(message *m, encoding_type enctype, const char *line, unsigned char *buf, size_t buflen);
284 284
 static unsigned char *decode(message *m, const char *in, unsigned char *out, unsigned char (*decoder)(char), bool isFast);
285 285
 static	void	squeeze(char *s);
286 286
 static	unsigned	char	hex(char c);
... ...
@@ -292,7 +295,7 @@ static	int	usefulArg(const char *arg);
292 292
 
293 293
 /*
294 294
  * These maps are ordered in decreasing likelihood of their appearance
295
- * in an e-mail
295
+ * in an e-mail. Probably these should be in a table...
296 296
  */
297 297
 static	const	struct	encoding_map {
298 298
 	const	char	*string;
... ...
@@ -328,10 +331,8 @@ messageCreate(void)
328 328
 {
329 329
 	message *m = (message *)cli_calloc(1, sizeof(message));
330 330
 
331
-	if(m) {
331
+	if(m)
332 332
 		m->mimeType = NOMIME;
333
-		m->encodingType = NOENCODING;
334
-	}
335 333
 
336 334
 	return m;
337 335
 }
... ...
@@ -370,7 +371,12 @@ messageReset(message *m)
370 370
 
371 371
 	memset(m, '\0', sizeof(message));
372 372
 	m->mimeType = NOMIME;
373
-	m->encodingType = NOENCODING;
373
+
374
+	if(m->encodingTypes) {
375
+		assert(m->numberOfEncTypes > 0);
376
+		free(m->encodingTypes);
377
+		m->numberOfEncTypes = 0;
378
+	}
374 379
 }
375 380
 
376 381
 void
... ...
@@ -804,29 +810,72 @@ void
804 804
 messageSetEncoding(message *m, const char *enctype)
805 805
 {
806 806
 	const struct encoding_map *e;
807
+	int i = 0;
808
+	char *type;
807 809
 	assert(m != NULL);
808 810
 	assert(enctype != NULL);
809 811
 
810
-	m->encodingType = EEXTENSION;
812
+	/*m->encodingType = EEXTENSION;*/
811 813
 
812 814
 	while((*enctype == '\t') || (*enctype == ' '))
813 815
 		enctype++;
814 816
 
815
-	for(e = encoding_map; e->string; e++)
816
-		if(strcasecmp(enctype, e->string) == 0) {
817
-			m->encodingType = e->type;
818
-			cli_dbgmsg("Encoding type is \"%s\"\n", enctype);
819
-			return;
817
+	/*
818
+	 * Iterate through
819
+	 *	Content-Transfer-Encoding: base64 binary
820
+	 * cli_strtok's fieldno counts from 0
821
+	 */
822
+	i = 0;
823
+	while((type = cli_strtok(enctype, i++, " \t")) != NULL) {
824
+		for(e = encoding_map; e->string; e++)
825
+			if(strcasecmp(type, e->string) == 0) {
826
+				int j;
827
+				encoding_type *et;
828
+
829
+				for(j = 0; j < m->numberOfEncTypes; j++) {
830
+					if(m->encodingTypes[j] == e->type) {
831
+						cli_dbgmsg("Ignoring duplicate encoding mechanism\n");
832
+						break;
833
+					}
834
+				}
835
+				if(j < m->numberOfEncTypes)
836
+					break;
837
+				et = (encoding_type *)cli_realloc(m->encodingTypes, (m->numberOfEncTypes + 1) * sizeof(encoding_type));
838
+				if(et == NULL) {
839
+					free(type);
840
+					return;
841
+				}
842
+
843
+				m->encodingTypes = et;
844
+				m->encodingTypes[m->numberOfEncTypes++] = e->type;
845
+
846
+				cli_dbgmsg("Encoding type %d is \"%s\"\n", m->numberOfEncTypes, type);
847
+				break;
848
+			}
849
+
850
+		if(e->string == NULL) {
851
+			cli_warnmsg("Unknown encoding type \"%s\"\n", type);
852
+			/*
853
+			 * Err on the side of safety, enable all decoding
854
+			 * modules
855
+			 */
856
+			/*messageSetEncoding(m, "base64");
857
+			messageSetEncoding(m, "quoted-printable");*/
858
+			break;
820 859
 		}
821 860
 
822
-	cli_warnmsg("Unknown encoding type \"%s\"\n", enctype);
861
+		free(type);
862
+	}
823 863
 }
824 864
 
825 865
 encoding_type
826 866
 messageGetEncoding(const message *m)
827 867
 {
828 868
 	assert(m != NULL);
829
-	return(m->encodingType);
869
+
870
+	if(m->numberOfEncTypes == 0)
871
+		return NOENCODING;
872
+	return m->encodingTypes[0];
830 873
 }
831 874
 
832 875
 int
... ...
@@ -988,40 +1037,19 @@ messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy
988 988
 	void *ret;
989 989
 	const text *t_line;
990 990
 	char *filename;
991
+	int i;
991 992
 
992 993
 	assert(m != NULL);
993 994
 
995
+	if(messageGetBody(m) == NULL)
996
+		return NULL;
997
+
994 998
 	ret = (*create)();
995 999
 
996 1000
 	if(ret == NULL)
997 1001
 		return NULL;
998 1002
 
999
-	/*
1000
-	 * Find the filename to decode
1001
-	 */
1002
-	if(messageGetEncoding(m) == UUENCODE) {
1003
-		t_line = uuencodeBegin(m);
1004
-
1005
-		if(t_line == NULL) {
1006
-			/*cli_warnmsg("UUENCODED attachment is missing begin statement\n");*/
1007
-			(*destroy)(ret);
1008
-			return NULL;
1009
-		}
1010
-
1011
-		filename = cli_strtok(lineGetData(t_line->t_line), 2, " ");
1012
-
1013
-		if(filename == NULL) {
1014
-			cli_dbgmsg("UUencoded attachment sent with no filename\n");
1015
-			(*destroy)(ret);
1016
-			return NULL;
1017
-		}
1018
-		cli_chomp(filename);
1019
-
1020
-		cli_dbgmsg("Set uuencode filename to \"%s\"\n", filename);
1021
-
1022
-		(*setFilename)(ret, dir, filename);
1023
-		t_line = t_line->t_next;
1024
-	} else if((t_line = binhexBegin(m)) != NULL) {
1003
+	if((t_line = binhexBegin(m)) != NULL) {
1025 1004
 		unsigned char byte;
1026 1005
 		unsigned long len, l, newlen = 0L;
1027 1006
 		unsigned char *uptr, *data;
... ...
@@ -1286,8 +1314,13 @@ messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy
1286 1286
 
1287 1287
 		blobDestroy(tmp);
1288 1288
 
1289
-		return ret;
1290
-	} else {
1289
+		m->binhex = NULL;
1290
+	}
1291
+
1292
+	if(m->numberOfEncTypes == 0) {
1293
+		/*
1294
+		 * Fast copy
1295
+		 */
1291 1296
 		filename = (char *)messageFindArgument(m, "filename");
1292 1297
 		if(filename == NULL) {
1293 1298
 			filename = (char *)messageFindArgument(m, "name");
... ...
@@ -1296,7 +1329,7 @@ messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy
1296 1296
 				cli_dbgmsg("Attachment sent with no filename\n");
1297 1297
 				messageAddArgument(m, "name=attachment");
1298 1298
 				filename = strdup("attachment");
1299
-			} else if(messageGetEncoding(m) == NOENCODING)
1299
+			} else
1300 1300
 				/*
1301 1301
 				 * Some virus attachments don't say how they've
1302 1302
 				 * been encoded. We assume base64
... ...
@@ -1306,62 +1339,123 @@ messageExport(message *m, const char *dir, void *(*create)(void), void (*destroy
1306 1306
 
1307 1307
 		(*setFilename)(ret, dir, filename);
1308 1308
 
1309
-		t_line = messageGetBody(m);
1310
-	}
1311
-	free((char *)filename);
1309
+		free((char *)filename);
1312 1310
 
1313
-	/*
1314
-	 * t_line should now point to the first (encoded) line of the message
1315
-	 */
1316
-	if(t_line == NULL) {
1317
-		cli_warnmsg("Empty attachment not saved\n");
1318
-		(*destroy)(ret);
1319
-		return NULL;
1311
+		if(m->numberOfEncTypes == 0) {
1312
+			if(uuencodeBegin(m))
1313
+				messageSetEncoding(m, "x-uuencode");
1314
+			else
1315
+				return exportText(messageGetBody(m), ret);
1316
+		}
1320 1317
 	}
1321 1318
 
1322
-	if(messageGetEncoding(m) == NOENCODING)
1319
+	for(i = 0; i < m->numberOfEncTypes; i++) {
1320
+		encoding_type enctype = m->encodingTypes[i];
1321
+
1323 1322
 		/*
1324
-		 * Fast copy
1323
+		 * Find the filename to decode
1325 1324
 		 */
1326
-		return exportText(t_line, ret);
1325
+		if((enctype == UUENCODE) || ((i == 0) && uuencodeBegin(m))) {
1326
+			t_line = uuencodeBegin(m);
1327
+
1328
+			if(t_line == NULL) {
1329
+				/*cli_warnmsg("UUENCODED attachment is missing begin statement\n");*/
1330
+				(*destroy)(ret);
1331
+				return NULL;
1332
+			}
1333
+
1334
+			filename = cli_strtok(lineGetData(t_line->t_line), 2, " ");
1335
+
1336
+			if(filename == NULL) {
1337
+				cli_dbgmsg("UUencoded attachment sent with no filename\n");
1338
+				(*destroy)(ret);
1339
+				return NULL;
1340
+			}
1341
+			cli_chomp(filename);
1342
+
1343
+			cli_dbgmsg("Set uuencode filename to \"%s\"\n", filename);
1327 1344
 
1328
-	do {
1329
-		unsigned char data[1024];
1330
-		unsigned char *uptr;
1331
-		const char *line = lineGetData(t_line->t_line);
1345
+			(*setFilename)(ret, dir, filename);
1346
+			t_line = t_line->t_next;
1347
+			enctype = UUENCODE;
1348
+		} else {
1349
+			filename = (char *)messageFindArgument(m, "filename");
1350
+			if(filename == NULL) {
1351
+				filename = (char *)messageFindArgument(m, "name");
1352
+
1353
+				if(filename == NULL) {
1354
+					cli_dbgmsg("Attachment sent with no filename\n");
1355
+					messageAddArgument(m, "name=attachment");
1356
+					filename = strdup("attachment");
1357
+				} else if(enctype == NOENCODING)
1358
+					/*
1359
+					 * Some virus attachments don't say how they've
1360
+					 * been encoded. We assume base64
1361
+					 */
1362
+					messageSetEncoding(m, "base64");
1363
+			}
1364
+
1365
+			(*setFilename)(ret, dir, filename);
1366
+
1367
+			t_line = messageGetBody(m);
1368
+		}
1369
+		free((char *)filename);
1370
+
1371
+		/*
1372
+		 * t_line should now point to the first (encoded) line of the message
1373
+		 */
1374
+		if(t_line == NULL) {
1375
+			cli_warnmsg("Empty attachment not saved\n");
1376
+			(*destroy)(ret);
1377
+			return NULL;
1378
+		}
1332 1379
 
1333
-		if(messageGetEncoding(m) == UUENCODE) {
1380
+		if(enctype == NOENCODING) {
1334 1381
 			/*
1335
-			 * There should be no blank lines in uuencoded files...
1382
+			 * Fast copy
1336 1383
 			 */
1337
-			if(line == NULL)
1338
-				continue;
1339
-			if(strcasecmp(line, "end") == 0)
1340
-				break;
1384
+			(void)exportText(t_line, ret);
1385
+			continue;
1341 1386
 		}
1342 1387
 
1343
-		uptr = decodeLine(m, line, data, sizeof(data));
1388
+		do {
1389
+			unsigned char data[1024];
1390
+			unsigned char *uptr;
1391
+			const char *line = lineGetData(t_line->t_line);
1344 1392
 
1345
-		if(uptr == NULL)
1346
-			break;
1393
+			if(enctype == UUENCODE) {
1394
+				/*
1395
+				 * There should be no blank lines in uuencoded files...
1396
+				 */
1397
+				if(line == NULL)
1398
+					continue;
1399
+				if(strcasecmp(line, "end") == 0)
1400
+					break;
1401
+			}
1347 1402
 
1348
-		assert(uptr <= &data[sizeof(data)]);
1403
+			uptr = decodeLine(m, enctype, line, data, sizeof(data));
1349 1404
 
1350
-		if(uptr != data)
1351
-			(*addData)(ret, data, (size_t)(uptr - data));
1405
+			if(uptr == NULL)
1406
+				break;
1352 1407
 
1353
-		/*
1354
-		 * According to RFC1521, '=' is used to pad out
1355
-		 * the last byte and should be used as evidence
1356
-		 * of the end of the data. Some mail clients
1357
-		 * annoyingly then put plain text after the '='
1358
-		 * byte and viruses exploit this bug. Sigh
1359
-		 */
1360
-		/*if(messageGetEncoding(m) == BASE64)
1361
-			if(strchr(line, '='))
1362
-				break;*/
1408
+			assert(uptr <= &data[sizeof(data)]);
1363 1409
 
1364
-	} while((t_line = t_line->t_next) != NULL);
1410
+			if(uptr != data)
1411
+				(*addData)(ret, data, (size_t)(uptr - data));
1412
+
1413
+			/*
1414
+			 * According to RFC1521, '=' is used to pad out
1415
+			 * the last byte and should be used as evidence
1416
+			 * of the end of the data. Some mail clients
1417
+			 * annoyingly then put plain text after the '='
1418
+			 * byte and viruses exploit this bug. Sigh
1419
+			 */
1420
+			/*if(enctype == BASE64)
1421
+				if(strchr(line, '='))
1422
+					break;*/
1423
+
1424
+		} while((t_line = t_line->t_next) != NULL);
1425
+	}
1365 1426
 
1366 1427
 	/* Verify we have nothing left to flush out */
1367 1428
 	if(m->base64chars) {
... ...
@@ -1405,12 +1499,13 @@ messageToBlob(message *m)
1405 1405
 text *
1406 1406
 messageToText(message *m)
1407 1407
 {
1408
+	int i;
1408 1409
 	text *first = NULL, *last = NULL;
1409 1410
 	const text *t_line;
1410 1411
 
1411 1412
 	assert(m != NULL);
1412 1413
 
1413
-	if(messageGetEncoding(m) == NOENCODING)
1414
+	if(m->numberOfEncTypes == 0) {
1414 1415
 		/*
1415 1416
 		 * Fast copy
1416 1417
 		 */
... ...
@@ -1429,17 +1524,53 @@ messageToText(message *m)
1429 1429
 			}
1430 1430
 			last->t_line = lineLink(t_line->t_line);
1431 1431
 		}
1432
-	else {
1433
-		if(messageGetEncoding(m) == UUENCODE) {
1432
+		if(last)
1433
+			last->t_next = NULL;
1434
+
1435
+		return first;
1436
+	}
1437
+	/*
1438
+	 * Scan over the data repeatedly, once for each claimed encoding
1439
+	 * type
1440
+	 */
1441
+	for(i = 0; i < m->numberOfEncTypes; i++) {
1442
+		const encoding_type enctype = m->encodingTypes[i];
1443
+
1444
+		cli_dbgmsg("messageToText: export transfer method %d = %d\n",
1445
+			i, enctype);
1446
+		if(enctype == NOENCODING) {
1447
+			/*
1448
+			 * Fast copy
1449
+			 */
1450
+			for(t_line = messageGetBody(m); t_line; t_line = t_line->t_next) {
1451
+				if(first == NULL)
1452
+					first = last = cli_malloc(sizeof(text));
1453
+				else {
1454
+					last->t_next = cli_malloc(sizeof(text));
1455
+					last = last->t_next;
1456
+				}
1457
+
1458
+				if(last == NULL) {
1459
+					if(first)
1460
+						textDestroy(first);
1461
+					return NULL;
1462
+				}
1463
+				last->t_line = lineLink(t_line->t_line);
1464
+			}
1465
+			continue;
1466
+		}
1467
+		if(enctype == UUENCODE) {
1434 1468
 			t_line = uuencodeBegin(m);
1435 1469
 
1436 1470
 			if(t_line == NULL) {
1437 1471
 				/*cli_warnmsg("UUENCODED attachment is missing begin statement\n");*/
1472
+				if(first)
1473
+					textDestroy(first);
1438 1474
 				return NULL;
1439 1475
 			}
1440 1476
 			t_line = t_line->t_next;
1441 1477
 		} else {
1442
-			if(binhexBegin(m))
1478
+			if((i == 0) && binhexBegin(m))
1443 1479
 				cli_warnmsg("Binhex messages not supported yet.\n");
1444 1480
 			t_line = messageGetBody(m);
1445 1481
 		}
... ...
@@ -1449,18 +1580,18 @@ messageToText(message *m)
1449 1449
 			unsigned char *uptr;
1450 1450
 			const char *line = lineGetData(t_line->t_line);
1451 1451
 
1452
-			if(messageGetEncoding(m) == BASE64) {
1452
+			if(enctype == BASE64) {
1453 1453
 				/*
1454 1454
 				 * ignore blanks - breaks RFC which is
1455 1455
 				 * probably the point!
1456 1456
 				 */
1457 1457
 				if(line == NULL)
1458 1458
 					continue;
1459
-			} else if(messageGetEncoding(m) == UUENCODE)
1459
+			} else if(enctype == UUENCODE)
1460 1460
 				if(strcasecmp(line, "end") == 0)
1461 1461
 					break;
1462 1462
 
1463
-			uptr = decodeLine(m, line, data, sizeof(data));
1463
+			uptr = decodeLine(m, enctype, line, data, sizeof(data));
1464 1464
 
1465 1465
 			if(uptr == NULL)
1466 1466
 				break;
... ...
@@ -1479,7 +1610,7 @@ messageToText(message *m)
1479 1479
 
1480 1480
 			last->t_line = ((data[0] != '\n') && data[0]) ? lineCreate((char *)data) : NULL;
1481 1481
 
1482
-			if(line && messageGetEncoding(m) == BASE64)
1482
+			if(line && enctype == BASE64)
1483 1483
 				if(strchr(line, '='))
1484 1484
 					break;
1485 1485
 		}
... ...
@@ -1635,7 +1766,7 @@ messageClearMarkers(message *m)
1635 1635
  * len is sizeof(ptr)
1636 1636
  */
1637 1637
 static unsigned char *
1638
-decodeLine(message *m, const char *line, unsigned char *buf, size_t buflen)
1638
+decodeLine(message *m, encoding_type et, const char *line, unsigned char *buf, size_t buflen)
1639 1639
 {
1640 1640
 	size_t len;
1641 1641
 	bool softbreak;
... ...
@@ -1645,7 +1776,7 @@ decodeLine(message *m, const char *line, unsigned char *buf, size_t buflen)
1645 1645
 	assert(m != NULL);
1646 1646
 	assert(buf != NULL);
1647 1647
 
1648
-	switch(messageGetEncoding(m)) {
1648
+	switch(et) {
1649 1649
 		case BINARY:
1650 1650
 			/*
1651 1651
 			 * TODO: find out what this is, encoded as binary??
... ...
@@ -16,6 +16,9 @@
16 16
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 17
  *
18 18
  * $Log: message.h,v $
19
+ * Revision 1.17  2004/09/15 18:08:23  nigelhorne
20
+ * Handle multiple encoding types
21
+ *
19 22
  * Revision 1.16  2004/08/23 13:15:16  nigelhorne
20 23
  * messageClearMarkers
21 24
  *
... ...
@@ -62,7 +65,8 @@
62 62
 
63 63
 typedef struct message {
64 64
 	mime_type	mimeType;
65
-	encoding_type	encodingType;
65
+	encoding_type	*encodingTypes;
66
+	int	numberOfEncTypes;	/* size of encodingTypes */
66 67
 	char	*mimeSubtype;
67 68
 	int	numberOfArguments;	/* count of mimeArguments */
68 69
 	char	**mimeArguments;