Normalize Big5 dot in html.
Török Edvin authored on 2012/01/17 01:45:33... | ... |
@@ -78,8 +78,11 @@ static inline unsigned char* u16_normalize(uint16_t u16, unsigned char* out, con |
78 | 78 |
if(u16 < 0xff) { |
79 | 79 |
assert((uint8_t)u16 != 0); |
80 | 80 |
*out++ = (uint8_t)u16; |
81 |
- } else { |
|
82 |
- size_t i; |
|
81 |
+ } else if (u16 == 0x3002 || u16 == 0xFF0E || u16 == 0xFE52) { |
|
82 |
+ /* bb #4097 */ |
|
83 |
+ *out++ = '.'; |
|
84 |
+ } else { |
|
85 |
+ size_t i; |
|
83 | 86 |
/* normalize only >255 to speed up */ |
84 | 87 |
if(limit <= 8) { |
85 | 88 |
/* not enough space available */ |
... | ... |
@@ -53,6 +53,7 @@ |
53 | 53 |
typedef enum { |
54 | 54 |
HTML_BAD_STATE, |
55 | 55 |
HTML_NORM, |
56 |
+ HTML_8BIT, |
|
56 | 57 |
HTML_COMMENT, |
57 | 58 |
HTML_CHAR_REF, |
58 | 59 |
HTML_ENTITY_REF_DECODE, |
... | ... |
@@ -470,10 +471,36 @@ void html_tag_arg_free(tag_arguments_t *tags) |
470 | 470 |
static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char* begin,const unsigned char *end) |
471 | 471 |
{ |
472 | 472 |
size_t i; |
473 |
+ uint32_t mbchar = 0; |
|
473 | 474 |
if(!begin || !end) |
474 | 475 |
return; |
475 | 476 |
for(i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end);i++) { |
476 |
- cont->contents[i] = *begin++; |
|
477 |
+ uint8_t c = *begin++; |
|
478 |
+ if (mbchar && (c < 0x80 || mbchar >= 0x10000)) { |
|
479 |
+ if (mbchar == 0xE38082 || mbchar == 0xEFBC8E |
|
480 |
+ || mbchar == 0xEFB992 || |
|
481 |
+ mbchar == 0xA143 || mbchar == 0xA144 || |
|
482 |
+ mbchar == 0xA14F) { |
|
483 |
+ cont->contents[i++] = '.'; |
|
484 |
+ } else { |
|
485 |
+ uint8_t c0 = mbchar >> 16; |
|
486 |
+ uint8_t c1 = (mbchar >> 8)&0xff; |
|
487 |
+ uint8_t c2 = (mbchar & 0xff); |
|
488 |
+ if (c0 && i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
489 |
+ cont->contents[i++] = c0; |
|
490 |
+ if ((c0 || c1) && i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
491 |
+ cont->contents[i++] = c1; |
|
492 |
+ if (i+1 < MAX_TAG_CONTENTS_LENGTH) |
|
493 |
+ cont->contents[i++] = c2; |
|
494 |
+ } |
|
495 |
+ mbchar = 0; |
|
496 |
+ } |
|
497 |
+ if (c >= 0x80) { |
|
498 |
+ mbchar = (mbchar << 8) | c; |
|
499 |
+ --i; |
|
500 |
+ } |
|
501 |
+ else |
|
502 |
+ cont->contents[i] = c; |
|
477 | 503 |
} |
478 | 504 |
cont->pos = i; |
479 | 505 |
} |
... | ... |
@@ -631,6 +658,7 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
631 | 631 |
struct parser_state *js_state = NULL; |
632 | 632 |
const unsigned char *js_begin = NULL, *js_end = NULL; |
633 | 633 |
struct tag_contents contents; |
634 |
+ uint32_t mbchar = 0; |
|
634 | 635 |
|
635 | 636 |
tag_args.scanContents=0;/* do we need to store the contents of <a></a>?*/ |
636 | 637 |
contents.pos = 0; |
... | ... |
@@ -749,6 +777,38 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
749 | 749 |
next_state = HTML_BAD_STATE; |
750 | 750 |
} |
751 | 751 |
break; |
752 |
+ case HTML_8BIT: |
|
753 |
+ if (*ptr < 0x80 || mbchar >= 0x10000) { |
|
754 |
+ if (mbchar == 0xE38082 || mbchar == 0xEFBC8E |
|
755 |
+ || mbchar == 0xEFB992 || |
|
756 |
+ mbchar == 0xA143 || mbchar == 0xA144 || |
|
757 |
+ mbchar == 0xA14F) { |
|
758 |
+ /* bb #4097 */ |
|
759 |
+ html_output_c(file_buff_o2, '.'); |
|
760 |
+ html_output_c(file_buff_text, '.'); |
|
761 |
+ } else { |
|
762 |
+ uint8_t c0 = mbchar >> 16; |
|
763 |
+ uint8_t c1 = (mbchar >> 8)&0xff; |
|
764 |
+ uint8_t c2 = (mbchar & 0xff); |
|
765 |
+ if (c0) { |
|
766 |
+ html_output_c(file_buff_o2, c0); |
|
767 |
+ html_output_c(file_buff_text, c0); |
|
768 |
+ } |
|
769 |
+ if (c0 || c1) { |
|
770 |
+ html_output_c(file_buff_o2, c1); |
|
771 |
+ html_output_c(file_buff_text, c1); |
|
772 |
+ } |
|
773 |
+ html_output_c(file_buff_o2, c2); |
|
774 |
+ html_output_c(file_buff_text, c1); |
|
775 |
+ } |
|
776 |
+ mbchar = 0; |
|
777 |
+ state = next_state; |
|
778 |
+ next_state = HTML_NORM; |
|
779 |
+ } else { |
|
780 |
+ mbchar = (mbchar << 8) | *ptr; |
|
781 |
+ ptr++; |
|
782 |
+ } |
|
783 |
+ break; |
|
752 | 784 |
case HTML_NORM: |
753 | 785 |
if (*ptr == '<') { |
754 | 786 |
ptrend=ptr; /* for use by scanContents */ |
... | ... |
@@ -781,6 +841,11 @@ static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag |
781 | 781 |
state = HTML_CHAR_REF; |
782 | 782 |
next_state = HTML_NORM; |
783 | 783 |
ptr++; |
784 |
+ } else if (*ptr >= 0x80) { |
|
785 |
+ state = HTML_8BIT; |
|
786 |
+ next_state = HTML_NORM; |
|
787 |
+ mbchar = *ptr; |
|
788 |
+ ptr++; |
|
784 | 789 |
} else { |
785 | 790 |
unsigned char c = tolower(*ptr); |
786 | 791 |
/* normalize ' to " for scripts */ |