GitList

@@ -53,6 +53,7 @@
                      typedef enum {
                          HTML_BAD_STATE,
                          HTML_NORM,
                     +    HTML_8BIT,
                          HTML_COMMENT,
                          HTML_CHAR_REF,
                          HTML_ENTITY_REF_DECODE,
@@ -470,10 +471,36 @@ void html_tag_arg_free(tag_arguments_t *tags)
                      static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end)
+                     {
                      	size_t i;
                     +        uint32_t mbchar = 0;
                      	if(!begin || !end)
                      		return;
                      	for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) {
                     -		cont->contents[i] = *begin++;
                     +            uint8_t c = *begin++;
                     +            if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
                     +                if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
                     +                    || mbchar == 0xEFB992 ||
                     +                    mbchar == 0xA143 || mbchar == 0xA144 ||
                     +                    mbchar == 0xA14F) {
                     +                    cont->contents[i++] = '.';
                     +                } else {
                     +                    uint8_t c0 = mbchar >> 16;
                     +                    uint8_t c1 = (mbchar >> 8)&0xff;
                     +                    uint8_t c2 = (mbchar & 0xff);
                     +                    if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH)
                     +                        cont->contents[i++] = c0;
                     +                    if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH)
                     +                        cont->contents[i++] = c1;
                     +                    if (i+1 < MAX_TAG_CONTENTS_LENGTH)
                     +                        cont->contents[i++] = c2;
                     +                }
                     +                mbchar = 0;
                     +            }
                     +            if (c >= 0x80) {
                     +                mbchar = (mbchar << 8) | c;
                     +                --i;
                     +            }
                     +            else
                     +		cont->contents[i] = c;
+                     	}
                      	cont->pos = i;
+                     }
@@ -631,6 +658,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      	struct parser_state *js_state = NULL;
                      	const unsigned char *js_begin = NULL, *js_end = NULL;
                      	struct tag_contents contents;
                     +        uint32_t mbchar = 0;
                      	tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
                      	contents.pos = 0;
@@ -749,6 +777,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					next_state = HTML_BAD_STATE;
+                     				}
                      				break;
                     +                        case HTML_8BIT:
                     +                                if (*ptr < 0x80 || mbchar >= 0x10000) {
                     +                                    if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
                     +                                        || mbchar == 0xEFB992 ||
                     +                                        mbchar == 0xA143 || mbchar == 0xA144 ||
                     +                                        mbchar == 0xA14F) {
                     +                                        /* bb #4097 */
                     +                                        html_output_c(file_buff_o2, '.');
                     +                                        html_output_c(file_buff_text, '.');
                     +                                    } else {
                     +                                        uint8_t c0 = mbchar >> 16;
                     +                                        uint8_t c1 = (mbchar >> 8)&0xff;
                     +                                        uint8_t c2 = (mbchar & 0xff);
                     +                                        if (c0) {
                     +                                            html_output_c(file_buff_o2, c0);
                     +                                            html_output_c(file_buff_text, c0);
                     +                                        }
                     +                                        if (c0 || c1) {
                     +                                            html_output_c(file_buff_o2, c1);
                     +                                            html_output_c(file_buff_text, c1);
                     +                                        }
                     +                                        html_output_c(file_buff_o2, c2);
                     +                                        html_output_c(file_buff_text, c1);
                     +                                    }
                     +                                    mbchar = 0;
                     +                                    state = next_state;
                     +                                    next_state = HTML_NORM;
                     +                                } else {
                     +                                    mbchar = (mbchar << 8) | *ptr;
                     +                                    ptr++;
                     +                                }
                     +                                break;
                      			case HTML_NORM:
                      				if (*ptr == '<') {
                      					ptrend=ptr; /* for use by scanContents */
@@ -781,6 +841,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
                      					state = HTML_CHAR_REF;
                      					next_state = HTML_NORM;
                      					ptr++;
                     +                                } else if (*ptr >= 0x80) {
                     +                                        state = HTML_8BIT;
                     +                                        next_state = HTML_NORM;
                     +                                        mbchar = *ptr;
                     +                                        ptr++;
                      				} else {
                      					unsigned char c = tolower(*ptr);
                      					/* normalize ' to " for scripts */

@@ -78,8 +78,11 @@ static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, con
                      	if(u16 < 0xff) {
                      		assert((uint8_t)u16 != 0);
                      		*out++ = (uint8_t)u16;
                     -	} else {
                     -		size_t i;
                     +	} else if (u16 == 0x3002 || u16 == 0xFF0E || u16 == 0xFE52) {
                     +            /* bb #4097 */
                     +                *out++ = '.';
                     +        } else {
                     +                size_t i;
                      		/* normalize only >255 to speed up */
                      		if(limit <=  8) {
                      			/* not enough space available */