git-svn-id: file:///var/lib/svn/clamav-devel/trunk/clamav-devel@863 77e5149b-7576-45b1-b177-96237e5ba77b
Nigel Horne authored on 2004/09/14 21:14:29... | ... |
@@ -1,3 +1,12 @@ |
1 |
+Tue Sep 14 13:10:38 BST 2004 (njh) |
|
2 |
+---------------------------------- |
|
3 |
+ * libclamav/mbox.c: FOLLOWURLS: include the text of the old HTML |
|
4 |
+ normalisation code that works in RAM until the |
|
5 |
+ code for the new HTML API that uses temporary |
|
6 |
+ files is added to mbox.c. This allows ClamAV to |
|
7 |
+ link and work until the new code is called |
|
8 |
+ from mbox.c. |
|
9 |
+ |
|
1 | 10 |
Tue Sep 14 11:30:43 BST 2004 (njh) |
2 | 11 |
---------------------------------- |
3 | 12 |
* libclamav/untar.c: Fix compilation error on AIX and OSF |
... | ... |
@@ -17,6 +17,9 @@ |
17 | 17 |
* |
18 | 18 |
* Change History: |
19 | 19 |
* $Log: mbox.c,v $ |
20 |
+ * Revision 1.118 2004/09/14 12:09:37 nigelhorne |
|
21 |
+ * Include old normalise code |
|
22 |
+ * |
|
20 | 23 |
* Revision 1.117 2004/09/13 16:44:01 kojm |
21 | 24 |
* minor cleanup |
22 | 25 |
* |
... | ... |
@@ -339,7 +342,7 @@ |
339 | 339 |
* Compilable under SCO; removed duplicate code with message.c |
340 | 340 |
* |
341 | 341 |
*/ |
342 |
-static char const rcsid[] = "$Id: mbox.c,v 1.117 2004/09/13 16:44:01 kojm Exp $"; |
|
342 |
+static char const rcsid[] = "$Id: mbox.c,v 1.118 2004/09/14 12:09:37 nigelhorne Exp $"; |
|
343 | 343 |
|
344 | 344 |
#if HAVE_CONFIG_H |
345 | 345 |
#include "clamav-config.h" |
... | ... |
@@ -424,8 +427,6 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; |
424 | 424 |
|
425 | 425 |
#ifdef FOLLOWURLS |
426 | 426 |
|
427 |
-#include "htmlnorm.h" |
|
428 |
- |
|
429 | 427 |
#define MAX_URLS 5 /* |
430 | 428 |
* Maximum number of URLs scanned in a message |
431 | 429 |
* part |
... | ... |
@@ -2109,6 +2110,80 @@ saveTextPart(message *m, const char *dir) |
2109 | 2109 |
} |
2110 | 2110 |
|
2111 | 2111 |
#ifdef FOLLOWURLS |
2112 |
+ |
|
2113 |
+/* |
|
2114 |
+ * TODO: Use the newer normalise code |
|
2115 |
+ * This is the old normalise code which normalises in memory. The new |
|
2116 |
+ * code uses temporary files and has a different API. |
|
2117 |
+ * |
|
2118 |
+* Normalize an HTML buffer using the following rules: |
|
2119 |
+ o Remove multiple contiguous spaces |
|
2120 |
+ o Remove spaces around '<' and '>' in tags |
|
2121 |
+ o Remove spaces around '=' in tags |
|
2122 |
+ o Replace single quote with double quote in tags |
|
2123 |
+ o Convert to lowercase |
|
2124 |
+ o Convert all white space to a space character |
|
2125 |
+*/ |
|
2126 |
+static unsigned char * /* result comes from cli_malloc(); presumably the caller releases it - TODO confirm */ |
|
2127 |
+mbox_html_normalize(unsigned char *in_buff, off_t in_size) /* reads in_size bytes of in_buff; in_buff is only read, never written */ |
|
2128 |
+{ |
|
2129 |
+ unsigned char *out_buff; /* normalised copy, built one byte at a time */ |
|
2130 |
+ off_t out_size=0, i; /* out_size: bytes written so far; i: read index into in_buff */ |
|
2131 |
+ int had_space=FALSE, tag_depth=0, in_quote=FALSE; /* had_space: when TRUE the next whitespace is dropped; tag_depth: count of '<' seen minus '>' seen; in_quote: inside a converted quote pair */ |
|
2132 |
+ |
|
2133 |
+ out_buff = (unsigned char *)cli_malloc(in_size+1); /* every input byte emits at most one output byte, so in_size+1 leaves room for the NUL */ |
|
2134 |
+ if (!out_buff) { |
|
2135 |
+ cli_errmsg("malloc failed"); |
|
2136 |
+ return NULL; /* allocation failure is reported to the caller as NULL */ |
|
2137 |
+ } |
|
2138 |
+ |
|
2139 |
+ for (i=0 ; i < in_size ; i++) { |
|
2140 |
+ if (in_buff[i] == '<') { |
|
2141 |
+ out_buff[out_size++] = '<'; |
|
2142 |
+ tag_depth++; |
|
2143 |
+ if (tag_depth == 1) { |
|
2144 |
+ had_space=TRUE; /* consume spaces that follow the '<' */ |
|
2145 |
+ } |
|
2146 |
+ } else if ((in_buff[i] == '=') && (tag_depth == 1)) { /* '=' seen while exactly one tag is open */ |
|
2147 |
+ /* Remove preceding spaces */ |
|
2148 |
+ while ((out_size > 0) && |
|
2149 |
+ (out_buff[out_size-1] == ' ')) { |
|
2150 |
+ out_size--; |
|
2151 |
+ } |
|
2152 |
+ out_buff[out_size++] = '='; |
|
2153 |
+ had_space=TRUE; /* also drop whitespace that follows the '=' */ |
|
2154 |
+ } else if (isspace(in_buff[i])) { /* any whitespace character becomes at most one ' ' */ |
|
2155 |
+ if (!had_space) { |
|
2156 |
+ out_buff[out_size++] = ' '; |
|
2157 |
+ had_space=TRUE; |
|
2158 |
+ } |
|
2159 |
+ } else if (in_buff[i] == '>') { |
|
2160 |
+ /* Remove preceding spaces */ |
|
2161 |
+ if (tag_depth == 1) { |
|
2162 |
+ while ((out_size > 0) && |
|
2163 |
+ (out_buff[out_size-1] == ' ')) { |
|
2164 |
+ out_size--; |
|
2165 |
+ } |
|
2166 |
+ } |
|
2167 |
+ out_buff[out_size++] = '>'; |
|
2168 |
+ tag_depth--; /* NOTE(review): not clamped, so an unmatched '>' makes tag_depth negative */ |
|
2169 |
+ } else if ((in_buff[i] == '\'') && (tag_depth==1)) { |
|
2170 |
+ /* Convert single quotes to double quotes */ |
|
2171 |
+ if (in_quote || out_buff[out_size-1] == '=') { /* out_size > 0 here: tag_depth == 1 means a '<' was written and it is never stripped */ |
|
2172 |
+ out_buff[out_size++] = '\"'; |
|
2173 |
+ in_quote = !in_quote; /* opening quote right after '=' sets it, the next quote clears it */ |
|
2174 |
+ } else { |
|
2175 |
+ out_buff[out_size++] = '\''; /* any other apostrophe inside the tag is kept unchanged */ |
|
2176 |
+ } |
|
2177 |
+ } else { |
|
2178 |
+ out_buff[out_size++] = tolower(in_buff[i]); /* everything else is copied in lowercase */ |
|
2179 |
+ had_space=FALSE; /* a non-space byte was written, so the next whitespace is kept */ |
|
2180 |
+ } |
|
2181 |
+ } |
|
2182 |
+ out_buff[out_size] = '\0'; /* NUL-terminate; the terminator is not counted in out_size */ |
|
2183 |
+ return out_buff; |
|
2184 |
+} |
|
2185 |
+ |
|
2112 | 2186 |
static void |
2113 | 2187 |
checkURLs(message *m, const char *dir) |
2114 | 2188 |
{ |
... | ... |
@@ -2139,7 +2214,7 @@ checkURLs(message *m, const char *dir) |
2139 | 2139 |
t = tableCreate(); |
2140 | 2140 |
|
2141 | 2141 |
n = 0; |
2142 |
- normalised = ptr = html_normalize(blobGetData(b), len); |
|
2142 |
+ normalised = ptr = mbox_html_normalize(blobGetData(b), len); |
|
2143 | 2143 |
|
2144 | 2144 |
if(normalised == NULL) { |
2145 | 2145 |
blobDestroy(b); |