Browse code

bb #4097

Normalize Big5 dot in html.

Török Edvin authored on 2012/01/17 01:45:33
Showing 2 changed files
... ...
@@ -78,8 +78,11 @@ static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, con
78 78
 	if(u16 < 0xff) {
79 79
 		assert((uint8_t)u16 != 0);
80 80
 		*out++ = (uint8_t)u16;
81
-	} else {
82
-		size_t i;
81
+	} else if (u16 == 0x3002 || u16 == 0xFF0E || u16 == 0xFE52) {
82
+            /* bb #4097 */
83
+                *out++ = '.';
84
+        } else {
85
+                size_t i;
83 86
 		/* normalize only >255 to speed up */
84 87
 		if(limit <=  8) {
85 88
 			/* not enough space available */
... ...
@@ -53,6 +53,7 @@
53 53
 typedef enum {
54 54
     HTML_BAD_STATE,
55 55
     HTML_NORM,
56
+    HTML_8BIT,
56 57
     HTML_COMMENT,
57 58
     HTML_CHAR_REF,
58 59
     HTML_ENTITY_REF_DECODE,
... ...
@@ -470,10 +471,36 @@ void html_tag_arg_free(tag_arguments_t *tags)
470 470
 /*
  * Append the bytes in [begin, end) to cont->contents, starting at index
  * cont->pos and stopping when the index reaches MAX_TAG_CONTENTS_LENGTH or
  * the input is exhausted.  cont->pos is set to the final index.  Returns
  * without touching cont when either pointer is NULL.
  *
  * With the '+' lines of this patch, bytes >= 0x80 are no longer stored
  * directly: they are shifted into mbchar and written out later, either as a
  * single '.' (when the accumulated value equals one of the listed constants)
  * or as the original bytes.
  */
 static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end)
471 471
 {
472 472
 	size_t i;
473
+        uint32_t mbchar = 0;
473 474
 	if(!begin || !end)
474 475
 		return;
475 476
 	for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) {
476
-		cont->contents[i] = *begin++;
477
+            uint8_t c = *begin++;
             /*
              * Flush the pending high-byte sequence when the current byte is
              * below 0x80 or when three bytes have already been collected
              * (mbchar >= 0x10000).
              *
              * 0xE38082, 0xEFBC8E and 0xEFB992 are the UTF-8 byte sequences of
              * U+3002, U+FF0E and U+FE52.  0xA143, 0xA144 and 0xA14F are
              * presumably the Big5 forms named in the commit message - TODO
              * confirm.
              *
              * NOTE(review): mbchar is only ever built from bytes >= 0x80 (see
              * the accumulation below), while the low byte of 0xA143, 0xA144
              * and 0xA14F is < 0x80.  Such a trail byte triggers this flush
              * with mbchar == 0xA1 instead of being merged, so those three
              * comparisons look unreachable - confirm.
              */
478
+            if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
479
+                if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
480
+                    || mbchar == 0xEFB992 ||
481
+                    mbchar == 0xA143 || mbchar == 0xA144 ||
482
+                    mbchar == 0xA14F) {
                     /*
                      * NOTE(review): unlike the byte write-back in the else
                      * branch, this store advances i without an
                      * i+1 < MAX_TAG_CONTENTS_LENGTH guard, and the store of c
                      * further down is not re-checked either - confirm the
                      * size of cont->contents leaves room for this.
                      */
483
+                    cont->contents[i++] = '.';
484
+                } else {
                     /*
                      * Not a recognised dot: write the collected bytes back,
                      * most significant first.  Leading zero bytes are skipped
                      * and a byte is silently dropped when the guard fails.
                      */
485
+                    uint8_t c0 = mbchar >> 16;
486
+                    uint8_t c1 = (mbchar >> 8)&0xff;
487
+                    uint8_t c2 = (mbchar & 0xff);
488
+                    if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH)
489
+                        cont->contents[i++] = c0;
490
+                    if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH)
491
+                        cont->contents[i++] = c1;
492
+                    if (i+1 < MAX_TAG_CONTENTS_LENGTH)
493
+                        cont->contents[i++] = c2;
494
+                }
495
+                mbchar = 0;
496
+            }
             /*
              * A byte >= 0x80 is only accumulated, not stored, so --i cancels
              * the for-loop's i++ for this iteration.  Any other byte is
              * stored at the current index.
              */
497
+            if (c >= 0x80) {
498
+                mbchar = (mbchar << 8) | c;
499
+                --i;
500
+            }
501
+            else
502
+		cont->contents[i] = c;
477 503
 	}
         /*
          * NOTE(review): bytes still held in mbchar when the loop ends are not
          * written out; only the index is saved below.
          */
478 504
 	cont->pos = i;
479 505
 }
... ...
@@ -631,6 +658,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
631 631
 	struct parser_state *js_state = NULL;
632 632
 	const unsigned char *js_begin = NULL, *js_end = NULL;
633 633
 	struct tag_contents contents;
634
+        uint32_t mbchar = 0;
634 635
 
635 636
 	tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/
636 637
 	contents.pos = 0;
... ...
@@ -749,6 +777,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
749 749
 					next_state = HTML_BAD_STATE;
750 750
 				}
751 751
 				break;
752
+                        case HTML_8BIT:
753
+                                if (*ptr < 0x80 || mbchar >= 0x10000) {
754
+                                    if (mbchar == 0xE38082 || mbchar == 0xEFBC8E
755
+                                        || mbchar == 0xEFB992 ||
756
+                                        mbchar == 0xA143 || mbchar == 0xA144 ||
757
+                                        mbchar == 0xA14F) {
758
+                                        /* bb #4097 */
759
+                                        html_output_c(file_buff_o2, '.');
760
+                                        html_output_c(file_buff_text, '.');
761
+                                    } else {
762
+                                        uint8_t c0 = mbchar >> 16;
763
+                                        uint8_t c1 = (mbchar >> 8)&0xff;
764
+                                        uint8_t c2 = (mbchar & 0xff);
765
+                                        if (c0) {
766
+                                            html_output_c(file_buff_o2, c0);
767
+                                            html_output_c(file_buff_text, c0);
768
+                                        }
769
+                                        if (c0 || c1) {
770
+                                            html_output_c(file_buff_o2, c1);
771
+                                            html_output_c(file_buff_text, c1);
772
+                                        }
773
+                                        html_output_c(file_buff_o2, c2);
774
+                                        html_output_c(file_buff_text, c1);
775
+                                    }
776
+                                    mbchar = 0;
777
+                                    state = next_state;
778
+                                    next_state = HTML_NORM;
779
+                                } else {
780
+                                    mbchar = (mbchar << 8) | *ptr;
781
+                                    ptr++;
782
+                                }
783
+                                break;
752 784
 			case HTML_NORM:
753 785
 				if (*ptr == '<') {
754 786
 					ptrend=ptr; /* for use by scanContents */
... ...
@@ -781,6 +841,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag
781 781
 					state = HTML_CHAR_REF;
782 782
 					next_state = HTML_NORM;
783 783
 					ptr++;
784
+                                } else if (*ptr >= 0x80) {
785
+                                        state = HTML_8BIT;
786
+                                        next_state = HTML_NORM;
787
+                                        mbchar = *ptr;
788
+                                        ptr++;
784 789
 				} else {
785 790
 					unsigned char c = tolower(*ptr);
786 791
 					/* normalize ' to " for scripts */