Browse code

support for generic text normalizer (CL_TYPE_SCRIPT)

git-svn: trunk@3584

Török Edvin authored on 2008/02/05 06:38:34
Showing 6 changed files
... ...
@@ -1,3 +1,8 @@
1
+Mon Feb  4 23:20:12 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/scanners, filetypes, dconf:
4
+	support for generic text normalizer (CL_TYPE_SCRIPT)
5
+
1 6
 Mon Feb  4 23:06:34 EET 2008 (edwin)
2 7
 ---------------------------------
3 8
   * libclamav/textnorm.[ch]: generic text normalizer (bb #241)
... ...
@@ -86,6 +86,7 @@ static struct dconf_module modules[] = {
86 86
     { "DOCUMENT",   "HTML",	    DOC_CONF_HTML,	    1 },
87 87
     { "DOCUMENT",   "RTF",	    DOC_CONF_RTF,	    1 },
88 88
     { "DOCUMENT",   "PDF",	    DOC_CONF_PDF,	    1 },
89
+    { "DOCUMENT",   "SCRIPT",	    DOC_CONF_SCRIPT,	    1 },
89 90
 
90 91
     { "MAIL",	    "MBOX",	    MAIL_CONF_MBOX,	    1 },
91 92
     { "MAIL",	    "TNEF",	    MAIL_CONF_TNEF,	    1 },
... ...
@@ -74,6 +74,7 @@ struct cli_dconf {
74 74
 #define DOC_CONF_HTML	    0x1
75 75
 #define DOC_CONF_RTF	    0x2
76 76
 #define DOC_CONF_PDF	    0x4
77
+#define DOC_CONF_SCRIPT	    0x8
77 78
 
78 79
 /* Mail flags */
79 80
 #define MAIL_CONF_MBOX	    0x1
... ...
@@ -76,6 +76,7 @@ static const struct ftmap_s {
76 76
     { "CL_TYPE_PDF",		CL_TYPE_PDF		},
77 77
     { "CL_TYPE_UUENCODED",	CL_TYPE_UUENCODED	},
78 78
     { "CL_TYPE_HTML_UTF16",	CL_TYPE_HTML_UTF16	},
79
+    { "CL_TYPE_SCRIPT",         CL_TYPE_SCRIPT          },
79 80
     { "CL_TYPE_RTF",		CL_TYPE_RTF		},
80 81
     { "CL_TYPE_HTML",		CL_TYPE_HTML		},
81 82
     { "CL_TYPE_MAIL",		CL_TYPE_MAIL		},
... ...
@@ -63,6 +63,7 @@ typedef enum {
63 63
     CL_TYPE_CRYPTFF,
64 64
     CL_TYPE_PDF,
65 65
     CL_TYPE_UUENCODED,
66
+    CL_TYPE_SCRIPT,
66 67
     CL_TYPE_HTML_UTF16,
67 68
     CL_TYPE_RTF,
68 69
 
... ...
@@ -85,6 +85,7 @@
85 85
 #include "unarj.h"
86 86
 #include "nulsft.h"
87 87
 #include "autoit.h"
88
+#include "textnorm.h"
88 89
 #include <zlib.h>
89 90
 #include "unzip.h"
90 91
 
... ...
@@ -1064,6 +1065,73 @@ static int cli_scanhtml(int desc, cli_ctx *ctx)
1064 1064
     return ret;
1065 1065
 }
1066 1066
 
1067
+static int cli_scanscript(int desc, cli_ctx *ctx)
1068
+{
1069
+	unsigned char buff[FILEBUFF];
1070
+	unsigned char normalized[SCANBUFF];
1071
+	struct text_norm_state state;
1072
+	struct stat sb;
1073
+	char *tmpname = NULL;
1074
+	int ofd = -1, ret;
1075
+	ssize_t nread;
1076
+
1077
+	cli_dbgmsg("in cli_scantext()\n");
1078
+
1079
+	if(fstat(desc, &sb) == -1) {
1080
+		cli_errmsg("cli_scanscript: fstat() failed for descriptor %d\n", desc);
1081
+		return CL_EIO;
1082
+	}
1083
+
1084
+	/* don't normalize files that are too large */
1085
+	if(sb.st_size > 10485760) {
1086
+		cli_dbgmsg("cli_scanscript: exiting (file larger than 10 MB)\n");
1087
+		return CL_CLEAN;
1088
+	}
1089
+
1090
+	/* dump to disk only if explicitly asked to,
1091
+	 * otherwise we can process just in-memory */
1092
+	if(cli_leavetemps_flag) {
1093
+		if((ret = cli_gentempfd(NULL, &tmpname, &ofd))) {
1094
+			cli_dbgmsg("cli_scanscript: Can't generate temporary file/descriptor\n");
1095
+			return ret;
1096
+		}
1097
+	}
1098
+
1099
+	text_normalize_init(&state, normalized, sizeof(normalized));
1100
+	ret = CL_CLEAN;
1101
+
1102
+	do {
1103
+		nread = cli_readn(desc, buff, sizeof(buff));
1104
+		if(nread <= 0 || state.out_pos + nread > state.out_len) {
1105
+			/* flush if error/EOF, or too little buffer space left */
1106
+			if((ofd != -1) && (write(ofd, state.out, state.out_pos) == -1)) {
1107
+				cli_errmsg("cli_scanscript: can't write to file %s\n",tmpname);
1108
+				close(ofd);
1109
+				ofd = -1;
1110
+				/* we can continue to scan in memory */
1111
+			}
1112
+			/* when we flush the buffer also scan */
1113
+			if(cli_scanbuff(state.out, state.out_pos, ctx->virname, ctx->engine, CL_TYPE_TEXT_ASCII) == CL_VIRUS) {
1114
+				ret = CL_VIRUS;
1115
+				break;
1116
+			}
1117
+			text_normalize_reset(&state);
1118
+		}
1119
+		if(nread > 0 && (text_normalize_buffer(&state, buff, nread)) != nread) {
1120
+			cli_dbgmsg("cli_scanscript: short read during normalizing\n");
1121
+		}
1122
+		/* used a do {}while() here, since we need to flush our buffers at the end,
1123
+		 * and using while(){} loop would mean code duplication */
1124
+	} while (nread > 0);
1125
+
1126
+	if(cli_leavetemps_flag) {
1127
+		free(tmpname);
1128
+		close(ofd);
1129
+	}
1130
+
1131
+	return ret;
1132
+}
1133
+
1067 1134
 static int cli_scanhtml_utf16(int desc, cli_ctx *ctx)
1068 1135
 {
1069 1136
 	char *tempname, buff[512], *decoded;
... ...
@@ -1838,6 +1906,11 @@ int cli_magic_scandesc(int desc, cli_ctx *ctx)
1838 1838
 		ret = cli_scanhtml_utf16(desc, ctx);
1839 1839
 	    break;
1840 1840
 
1841
+	case CL_TYPE_SCRIPT:
1842
+	    if(DCONF_DOC & DOC_CONF_SCRIPT)
1843
+	        ret = cli_scanscript(desc, ctx);
1844
+	    break;
1845
+
1841 1846
 	case CL_TYPE_RTF:
1842 1847
 	    if(DCONF_DOC & DOC_CONF_RTF)
1843 1848
 		ret = cli_scanrtf(desc, ctx);