Browse code

Handle NUL (\0) characters in HTML files (bb #539).

git-svn: trunk@3543

Török Edvin authored on 2008/01/26 01:39:40
Showing 3 changed files
... ...
@@ -1,3 +1,7 @@
1
+Fri Jan 25 18:15:21 EET 2008 (edwin)
2
+------------------------------------
3
+  * libclamav/htmlnorm.[ch]: handle NULL characters in HTML files. (bb #539).
4
+
1 5
 Fri Jan 25 16:35:34 CET 2008 (tk)
2 6
 ---------------------------------
3 7
   * libclamav/cab.[ch]: rewrite file/folder handling code as a complete
... ...
@@ -156,13 +156,27 @@ int decrypt_tables[3][128] = {
156 156
        0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}
157 157
 };
158 158
 
159
-unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len)
159
+static inline unsigned int rewind_tospace(const unsigned char* chunk, unsigned int len)
160 160
 {
161
-	unsigned char *line, *ptr, *start, *end;
162
-	unsigned int line_len, count;
161
+	unsigned int count = len;
162
+	while (!isspace(chunk[len - 1]) && (len > 1)) {
163
+		len--;
164
+	}
165
+	if (len == 1) {
166
+		return count;
167
+	}
168
+	return len;
169
+}
163 170
 
164
-	line = (unsigned char *) cli_malloc(max_len);
165
-	if (!line) {
171
+/* read at most @max_len of data from @m_area or @stream, skipping NULL chars.
172
+ * This used to be called cli_readline, but we don't stop at end-of-line anymore */
173
+static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
174
+{
175
+	unsigned char *chunk, *start, *ptr, *end;
176
+	unsigned int chunk_len, count;
177
+
178
+	chunk = (unsigned char *) cli_malloc(max_len);
179
+	if (!chunk) {
166 180
 		return NULL;
167 181
 	}
168 182
 
... ...
@@ -171,66 +185,103 @@ unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len
171 171
 		start = ptr = m_area->buffer + m_area->offset;
172 172
 		end = m_area->buffer + m_area->length;
173 173
 		if (start >= end) {
174
-			free(line);
174
+			free(chunk);
175 175
 			return NULL;
176 176
 		}
177
-		line_len = 1;
178
-		while ((ptr < end) && (*ptr != '\n') && (line_len < (max_len-1))) {
179
-			ptr++;
180
-			line_len++;
181
-		}
182
-		if (ptr == end) {
183
-			line_len--;
184
-			memcpy(line, start, line_len);
185
-			line[line_len] = '\0';
186
-		} else if (*ptr == '\n') {
187
-			memcpy(line, start, line_len);
188
-			line[line_len] = '\0';
177
+		/* maximum we can copy into the buffer,
178
+		 * we could have less than max_len bytes available */
179
+		chunk_len = MIN(end-start, max_len-1);
180
+
181
+		/* look for NULL chars */
182
+		ptr = memchr(start, 0, chunk_len);
183
+	        if(!ptr) {
184
+			/* no NULL chars found, copy all */
185
+			memcpy(chunk, start, chunk_len);
186
+			chunk[chunk_len] = '\0';
187
+			m_area->offset += chunk_len;
188
+			/* point ptr to end of chunk,
189
+			 * so we can check and rewind to a space below */
190
+			ptr = start + chunk_len;
189 191
 		} else {
190
-			/* Hit max_len */
191
-			/* Store the current line end and length*/
192
-			count = line_len;
193
-			while (!isspace(*ptr) && (line_len > 1)) {
194
-				ptr--;
195
-				line_len--;
192
+			/* copy portion that doesn't contain NULL chars */
193
+			chunk_len = ptr - start;
194
+			if(chunk_len < max_len) {
195
+				memcpy(chunk, start, chunk_len);
196
+			} else {
197
+				chunk_len = 0;
198
+				ptr = start;
199
+			}
200
+			/* we have unknown number of NULL chars,
201
+			 * copy char-by-char and skip them */
202
+			while((ptr < end) && (chunk_len < max_len-1)) {
203
+				const unsigned char c = *ptr++;
204
+				if(c) {
205
+					chunk[chunk_len++] = c;
206
+				}
196 207
 			}
197
-			if (line_len == 1) {
198
-				line_len=count;
208
+			chunk[chunk_len] = '\0';
209
+			/* we can't use chunk_len to determine how many bytes we read, since
210
+			 * we skipped chars */
211
+			m_area->offset = ptr - m_area->buffer;
212
+		}
213
+		if(ptr && ptr < end && !isspace(*ptr)) {
214
+			/* we hit max_len, rewind to a space */
215
+			count = rewind_tospace(chunk, chunk_len);
216
+			if(count < chunk_len) {
217
+				chunk[count] = '\0';
218
+				m_area->offset -= chunk_len - count;
199 219
 			}
200
-			memcpy(line, start, line_len);
201
-			line[line_len] = '\0';
202 220
 		}
203
-		m_area->offset += line_len;
204 221
 	} else {
205 222
 		if (!stream) {
206 223
 			cli_dbgmsg("No HTML stream\n");
207
-			free(line);
224
+			free(chunk);
208 225
 			return NULL;
209 226
 		}
210
-		if (fgets(line, max_len, stream) == NULL) {
211
-			free(line);
227
+		chunk_len = fread(chunk, 1, max_len-1, stream);
228
+		if(!chunk_len || chunk_len > max_len-1) {
229
+			/* EOF, or prevent overflow */
230
+			free(chunk);
212 231
 			return NULL;
213 232
 		}
214 233
 
215
-		line_len=strlen(line);
216
-		if (line_len == 0) {
217
-			free(line);
218
-			return NULL;
219
-		}
220
-		if (line_len == max_len-1) {
221
-			/* didn't find a whole line - rewind to a space*/
222
-			count = 0;
223
-			while (!isspace(line[--line_len])) {
224
-				count--;
225
-				if (line_len == 0) {
226
-					return line;
234
+		/* Look for NULL chars */
235
+		ptr = memchr(chunk, 0, chunk_len);
236
+		if(ptr) {
237
+			/* NULL char found */
238
+			/* save buffer limits */
239
+		        start = ptr;
240
+			end = chunk + chunk_len;
241
+
242
+			/* start of NULL chars, we will copy non-NULL characters
243
+			 * to this position */
244
+			chunk_len = ptr - chunk;
245
+
246
+			/* find first non-NULL char */
247
+			while((ptr < end) && !(*ptr)) {
248
+				ptr++;
249
+			}
250
+			/* skip over NULL chars, and move back the rest */
251
+		        while((ptr < end) && (chunk_len < max_len-1)) {
252
+				const unsigned char c = *ptr++;
253
+				if(c) {
254
+					chunk[chunk_len++] = c;
227 255
 				}
228 256
 			}
229
-			fseek(stream, count, SEEK_CUR);
230
-			line[line_len+1] = '\0';
257
+			chunk[chunk_len] = '\0';
258
+		}
259
+		if(chunk_len == max_len - 1) {
260
+			/* rewind to a space (which includes newline) */
261
+			count = rewind_tospace(chunk, chunk_len);
262
+			if(count < chunk_len) {
263
+				chunk[count] = '\0';
264
+				/* seek-back to space */
265
+				fseek(stream, (long)(count - chunk_len), SEEK_CUR);
266
+			}
231 267
 		}
232 268
 	}
233
-	return line;
269
+
270
+	return chunk;
234 271
 }
235 272
 
236 273
 static void html_output_flush(file_buff_t *fbuff)
... ...
@@ -580,7 +631,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
580 580
 	if(dconf_entconv)
581 581
 		ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
582 582
 	else
583
-		ptr = line = cli_readline(stream_in, m_area, 8192);
583
+		ptr = line = cli_readchunk(stream_in, m_area, 8192);
584 584
 
585 585
 	while (line) {
586 586
 		if(href_contents_begin)
... ...
@@ -1486,7 +1537,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
1486 1486
 			ptr = line = encoding_norm_readline(&conv, stream_in, m_area);
1487 1487
 		else {
1488 1488
 			free(line);
1489
-			ptr = line = cli_readline(stream_in, m_area, 8192);
1489
+			ptr = line = cli_readchunk(stream_in, m_area, 8192);
1490 1490
 		}
1491 1491
 	}
1492 1492
 
... ...
@@ -1609,7 +1660,7 @@ int html_screnc_decode(int fd, const char *dirname)
1609 1609
 		return FALSE;
1610 1610
 	}
1611 1611
 	
1612
-	while ((line = cli_readline(stream_in, NULL, 8192)) != NULL) {
1612
+	while ((line = cli_readchunk(stream_in, NULL, 8192)) != NULL) {
1613 1613
 		ptr = strstr(line, "#@~^");
1614 1614
 		if (ptr) {
1615 1615
 			break;
... ...
@@ -1626,7 +1677,7 @@ int html_screnc_decode(int fd, const char *dirname)
1626 1626
 	do {
1627 1627
 		if (! *ptr) {
1628 1628
 			free(line);
1629
-			ptr = line = cli_readline(stream_in, NULL, 8192);
1629
+			ptr = line = cli_readchunk(stream_in, NULL, 8192);
1630 1630
 			if (!line) {
1631 1631
 				goto abort;
1632 1632
 			}
... ...
@@ -1701,7 +1752,7 @@ int html_screnc_decode(int fd, const char *dirname)
1701 1701
 		}
1702 1702
 		free(line);
1703 1703
 		if (length) {
1704
-			ptr = line = cli_readline(stream_in, NULL, 8192);
1704
+			ptr = line = cli_readchunk(stream_in, NULL, 8192);
1705 1705
 		}
1706 1706
 	}
1707 1707
 	retval = TRUE;
... ...
@@ -35,8 +35,6 @@ typedef struct m_area_tag {
35 35
 	off_t offset;
36 36
 } m_area_t;
37 37
 
38
-
39
-unsigned char *cli_readline(FILE *stream, m_area_t *m_area, unsigned int max_len);
40 38
 int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
41 39
 int html_normalise_fd(int fd, const char *dirname, tag_arguments_t *hrefs,const struct cli_dconf* dconf);
42 40
 void html_tag_arg_free(tag_arguments_t *tags);