Browse code

Include old normalise code

git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@863 77e5149b-7576-45b1-b177-96237e5ba77b

Nigel Horne authored on 2004/09/14 21:14:29
Showing 2 changed files
... ...
@@ -1,3 +1,12 @@
1
+Tue Sep 14 13:10:38 BST 2004 (njh)
2
+----------------------------------
3
+  * libclamav/mbox.c:	FOLLOWURLS: include the text of the old HTML
4
+				normalisation code that works in RAM until the
5
+				code for the new HTML API that uses temporary
6
+				files is added to mbox.c. This allows clamAV to
7
+				link and work until the new code is called
8
+				from mbox.c.
9
+
1 10
 Tue Sep 14 11:30:43 BST 2004 (njh)
2 11
 ----------------------------------
3 12
   * libclamav/untar.c:	Fix compilation error on AIX and OSF
... ...
@@ -17,6 +17,9 @@
17 17
  *
18 18
  * Change History:
19 19
  * $Log: mbox.c,v $
20
+ * Revision 1.118  2004/09/14 12:09:37  nigelhorne
21
+ * Include old normalise code
22
+ *
20 23
  * Revision 1.117  2004/09/13 16:44:01  kojm
21 24
  * minor cleanup
22 25
  *
... ...
@@ -339,7 +342,7 @@
339 339
  * Compilable under SCO; removed duplicate code with message.c
340 340
  *
341 341
  */
342
-static	char	const	rcsid[] = "$Id: mbox.c,v 1.117 2004/09/13 16:44:01 kojm Exp $";
342
+static	char	const	rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $";
343 343
 
344 344
 #if HAVE_CONFIG_H
345 345
 #include "clamav-config.h"
... ...
@@ -424,8 +427,6 @@ typedef enum	{ FALSE = 0, TRUE = 1 } bool;
424 424
 
425 425
 #ifdef	FOLLOWURLS
426 426
 
427
-#include "htmlnorm.h"
428
-
429 427
 #define	MAX_URLS	5	/*
430 428
 				 * Maximum number of URLs scanned in a message
431 429
 				 * part
... ...
@@ -2109,6 +2110,80 @@ saveTextPart(message *m, const char *dir)
2109 2109
 }
2110 2110
 
2111 2111
 #ifdef	FOLLOWURLS
2112
+
2113
+/*
2114
+ * TODO: Use the newer normalise code
2115
+ * This is the old normalise code which normalises in memory. The new
2116
+ * code uses temporary files and has a different API.
2117
+ * 
2118
+* Normalize an HTML buffer using the following rules:
2119
+	o Remove multiple contiguous spaces
2120
+	o Remove spaces around '<' and '>' in tags
2121
+	o Remove spaces around '=' in tags
2122
+	o Replace single quote with double quote in tags
2123
+	o Convert to lowercase
2124
+	o Convert all white space to a space character
2125
+*/
2126
+static unsigned char *
2127
+mbox_html_normalize(unsigned char *in_buff, off_t in_size)
2128
+{
2129
+	unsigned char *out_buff;
2130
+	off_t out_size=0, i;
2131
+	int had_space=FALSE, tag_depth=0, in_quote=FALSE;
2132
+	
2133
+	out_buff = (unsigned char *)cli_malloc(in_size+1);
2134
+	if (!out_buff) {
2135
+		cli_errmsg("malloc failed");
2136
+		return NULL;
2137
+	}
2138
+	
2139
+	for (i=0 ; i < in_size ; i++) {
2140
+		if (in_buff[i] == '<') {
2141
+			out_buff[out_size++] = '<';
2142
+			tag_depth++;
2143
+			if (tag_depth == 1) {
2144
+				had_space=TRUE; /* consume spaces */
2145
+			}
2146
+		} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
2147
+			/* Remove preceeding spaces */
2148
+			while ((out_size > 0) &&
2149
+				(out_buff[out_size-1] == ' ')) {
2150
+				out_size--;
2151
+			}
2152
+			out_buff[out_size++] = '=';
2153
+			had_space=TRUE;
2154
+		} else if (isspace(in_buff[i])) {
2155
+			if (!had_space) {
2156
+				out_buff[out_size++] = ' ';
2157
+				had_space=TRUE;
2158
+			}
2159
+		} else if (in_buff[i] == '>') {
2160
+			/* Remove preceeding spaces */
2161
+			if (tag_depth == 1) {
2162
+				while ((out_size > 0) &&
2163
+					(out_buff[out_size-1] == ' ')) {
2164
+					out_size--;
2165
+				}
2166
+			}
2167
+			out_buff[out_size++] = '>';
2168
+			tag_depth--;	
2169
+		} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
2170
+			/* Convert single quotes to double quotes */
2171
+			if (in_quote || out_buff[out_size-1] == '=') {
2172
+				out_buff[out_size++] = '\"';
2173
+				in_quote = !in_quote;
2174
+			} else {
2175
+				out_buff[out_size++] = '\'';
2176
+			}
2177
+		} else {
2178
+			out_buff[out_size++] = tolower(in_buff[i]);
2179
+			had_space=FALSE;
2180
+		}
2181
+	}
2182
+	out_buff[out_size] = '\0';
2183
+	return out_buff;
2184
+}
2185
+
2112 2186
 static void
2113 2187
 checkURLs(message *m, const char *dir)
2114 2188
 {
... ...
@@ -2139,7 +2214,7 @@ checkURLs(message *m, const char *dir)
2139 2139
 	t = tableCreate();
2140 2140
 
2141 2141
 	n = 0;
2142
-	normalised = ptr = html_normalize(blobGetData(b), len);
2142
+	normalised = ptr = mbox_html_normalize(blobGetData(b), len);
2143 2143
 
2144 2144
 	if(normalised == NULL) {
2145 2145
 		blobDestroy(b);