Browse code

Comment out the UTF-8 BOM detection code for the time being, until cli_scanhtml can be validated to handle multibyte UTF-8.

Micah Snyder authored on 2017/11/28 06:52:21
Showing 1 changed file
... ...
@@ -80,14 +80,19 @@ static int td_isascii(const unsigned char *buf, unsigned int len)
80 80
 {
81 81
 	unsigned int i;
82 82
 
83
-	/* Check for the Byte-Order-Mark for UTF-8 */
84
-	if ((len >= 3) &&
85
-		(buf[0] == 0xEF) &&
86
-		(buf[1] == 0xBB) &&
87
-		(buf[2] == 0xBF))
88
-	{
89
-		return 0;
90
-	}
83
+	// @TODO:  UTF8 BOM Detection. 
84
+	//    The following BOM detection results in False Negatives in regression testing
85
+	//    which can be eliminated by adding a condition to call cli_scanhtml for CL_TYPE_TEXT_UTF8
86
+	//    in scanners.c:cli_scanraw().  However, cli_scanhtml was written for ASCII and has 
87
+	//    not been validated to correctly handle multibyte UTF8. 
88
+	// /* Check for the Byte-Order-Mark for UTF-8 */
89
+	// if ((len >= 3) &&
90
+	// 	(buf[0] == 0xEF) &&
91
+	// 	(buf[1] == 0xBB) &&
92
+	// 	(buf[2] == 0xBF))
93
+	// {
94
+	// 	return 0;
95
+	// }
91 96
 
92 97
 	/* Validate that the data all falls within the bounds of 
93 98
 	 * plain ASCII, ISO-8859 text, and non-ISO extended ASCII (Mac, IBM PC)