Also relocates the codepage table from msdoc.h to entconv.h.
Also adds new macros for codepages to reduce the use of magic numbers when
referencing codepages elsewhere in libclamav.
... | ... |
@@ -72,7 +72,6 @@ |
72 | 72 |
#include "lzma_iface.h" |
73 | 73 |
|
74 | 74 |
#include "egg.h" |
75 |
-#include "msdoc.h" |
|
76 | 75 |
#include "entconv.h" |
77 | 76 |
|
78 | 77 |
#ifndef WCHAR |
... | ... |
@@ -619,9 +618,9 @@ static cl_error_t egg_parse_comment_header(const uint8_t* index, size_t size, ex |
619 | 619 |
if (extraField->bit_flag & COMMENT_HEADER_FLAGS_MULTIBYTE_CODEPAGE_INSTEAD_OF_UTF8) { |
620 | 620 |
/* |
621 | 621 |
* Unlike with filenames, the multibyte string codepage (or "locale") is not present in comment headers. |
622 |
- * Try conversion with codepage 65001. |
|
622 |
+ * Try conversion with CODEPAGE_UTF8. |
|
623 | 623 |
*/ |
624 |
- if (CL_SUCCESS != cli_codepage_to_utf8((char*)index, size, 65001, &comment_utf8, &comment_utf8_size)) { |
|
624 |
+ if (CL_SUCCESS != cli_codepage_to_utf8((char*)index, size, CODEPAGE_UTF8, &comment_utf8, &comment_utf8_size)) { |
|
625 | 625 |
cli_dbgmsg("egg_parse_comment_header: failed to convert codepage \"0\" to UTF-8\n"); |
626 | 626 |
comment_utf8 = cli_genfname(NULL); |
627 | 627 |
} |
... | ... |
@@ -1163,7 +1162,7 @@ static cl_error_t egg_parse_file_extra_field(egg_handle* handle, egg_file* eggFi |
1163 | 1163 |
* - 949 (Korean Unified Code) |
1164 | 1164 |
* - 932 (Japanese Shift-JIS) */ |
1165 | 1165 |
if (0 == codepage) { |
1166 |
- if (CL_SUCCESS != cli_codepage_to_utf8((char*)index, name_size, 65001, &name_utf8, &name_utf8_size)) { |
|
1166 |
+ if (CL_SUCCESS != cli_codepage_to_utf8((char*)index, name_size, CODEPAGE_UTF8, &name_utf8, &name_utf8_size)) { |
|
1167 | 1167 |
cli_dbgmsg("egg_parse_file_extra_field: failed to convert codepage \"0\" to UTF-8\n"); |
1168 | 1168 |
name_utf8 = cli_genfname(NULL); |
1169 | 1169 |
} |
... | ... |
@@ -780,7 +780,7 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
780 | 780 |
size_t out_utf8_size = 0; |
781 | 781 |
|
782 | 782 |
#if defined(HAVE_ICONV) |
783 |
- iconv_t conv = (iconv_t) -1; |
|
783 |
+ iconv_t conv = (iconv_t)-1; |
|
784 | 784 |
#elif defined(WIN32) |
785 | 785 |
LPWSTR lpWideCharStr = NULL; |
786 | 786 |
int cchWideChar = 0; |
... | ... |
@@ -796,8 +796,8 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
796 | 796 |
*out_size = 0; |
797 | 797 |
|
798 | 798 |
switch (codepage) { |
799 |
- case 20127: /* US-ASCII (7-bit) */ |
|
800 |
- case 65001: { /* Unicode (UTF-8) */ |
|
799 |
+ case CODEPAGE_US_7BIT_ASCII: /* US-ASCII (7-bit) */ |
|
800 |
+ case CODEPAGE_UTF8: { /* Unicode (UTF-8) */ |
|
801 | 801 |
char* track; |
802 | 802 |
int byte_count, sigbit_count; |
803 | 803 |
|
... | ... |
@@ -811,7 +811,7 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
811 | 811 |
memcpy(out_utf8, in, in_size); |
812 | 812 |
|
813 | 813 |
track = out_utf8 + in_size - 1; |
814 |
- if ((codepage == 65001) && (*track & 0x80)) { |
|
814 |
+ if ((codepage == CODEPAGE_UTF8) && (*track & 0x80)) { |
|
815 | 815 |
/* |
816 | 816 |
* UTF-8 with a most significant bit. |
817 | 817 |
*/ |
... | ... |
@@ -847,7 +847,7 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
847 | 847 |
* Do conversion using native Win32 APIs. |
848 | 848 |
*/ |
849 | 849 |
|
850 |
- if (1200 != codepage) { /* not already UTF16-LE (Windows Unicode) */ |
|
850 |
+ if (CODEPAGE_UTF16_LE != codepage) { /* not already UTF16-LE (Windows Unicode) */ |
|
851 | 851 |
/* |
852 | 852 |
* First, Convert from codepage -> UCS-2 LE with MultiByteToWideChar(codepage) |
853 | 853 |
*/ |
... | ... |
@@ -942,7 +942,7 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
942 | 942 |
} |
943 | 943 |
} |
944 | 944 |
|
945 |
- if (NULL == encoding){ |
|
945 |
+ if (NULL == encoding) { |
|
946 | 946 |
cli_dbgmsg("cli_codepage_to_utf8: Invalid codepage parameter passed in.\n"); |
947 | 947 |
goto done; |
948 | 948 |
} |
... | ... |
@@ -953,7 +953,7 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
953 | 953 |
size_t iconvRet = -1; |
954 | 954 |
size_t outbytesleft = 0; |
955 | 955 |
|
956 |
- char* out_utf8_tmp = NULL; |
|
956 |
+ char* out_utf8_tmp = NULL; |
|
957 | 957 |
char* out_utf8_index = NULL; |
958 | 958 |
|
959 | 959 |
/* Charset to UTF-8 should never exceed in_size * 6; |
... | ... |
@@ -978,8 +978,8 @@ cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, cha |
978 | 978 |
|
979 | 979 |
iconvRet = iconv(conv, &inbuf, &inbufsize, &out_utf8_index, &outbytesleft); |
980 | 980 |
iconv_close(conv); |
981 |
- conv = (iconv_t) -1; |
|
982 |
- if ((size_t)-1 == iconvRet){ |
|
981 |
+ conv = (iconv_t)-1; |
|
982 |
+ if ((size_t)-1 == iconvRet) { |
|
983 | 983 |
switch (errno) { |
984 | 984 |
case E2BIG: |
985 | 985 |
cli_warnmsg("cli_codepage_to_utf8: iconv error: There is not sufficient room at *outbuf.\n"); |
... | ... |
@@ -1036,7 +1036,7 @@ done: |
1036 | 1036 |
#endif |
1037 | 1037 |
|
1038 | 1038 |
#if defined(HAVE_ICONV) |
1039 |
- if (conv != (iconv_t) -1) { |
|
1039 |
+ if (conv != (iconv_t)-1) { |
|
1040 | 1040 |
iconv_close(conv); |
1041 | 1041 |
} |
1042 | 1042 |
#endif |
... | ... |
@@ -1050,9 +1050,9 @@ done: |
1050 | 1050 |
return status; |
1051 | 1051 |
} |
1052 | 1052 |
|
1053 |
-char *cli_utf16toascii(const char *str, unsigned int length) |
|
1053 |
+char* cli_utf16toascii(const char* str, unsigned int length) |
|
1054 | 1054 |
{ |
1055 |
- char *decoded; |
|
1055 |
+ char* decoded; |
|
1056 | 1056 |
unsigned int i, j; |
1057 | 1057 |
|
1058 | 1058 |
if (length < 2) { |
... | ... |
@@ -1074,14 +1074,14 @@ char *cli_utf16toascii(const char *str, unsigned int length) |
1074 | 1074 |
return decoded; |
1075 | 1075 |
} |
1076 | 1076 |
|
1077 |
-char *cli_utf16_to_utf8(const char *utf16, size_t length, encoding_t type) |
|
1077 |
+char* cli_utf16_to_utf8(const char* utf16, size_t length, encoding_t type) |
|
1078 | 1078 |
{ |
1079 | 1079 |
/* utf8 - |
1080 | 1080 |
* 4 bytes for utf16 high+low surrogate (4 bytes input) |
1081 | 1081 |
* 3 bytes for utf16 otherwise (2 bytes input) */ |
1082 | 1082 |
size_t i, j; |
1083 | 1083 |
size_t needed = length * 3 / 2 + 2; |
1084 |
- char *s2; |
|
1084 |
+ char* s2; |
|
1085 | 1085 |
|
1086 | 1086 |
if (length < 2) |
1087 | 1087 |
return cli_strdup(""); |
... | ... |
@@ -1146,7 +1146,7 @@ char *cli_utf16_to_utf8(const char *utf16, size_t length, encoding_t type) |
1146 | 1146 |
return s2; |
1147 | 1147 |
} |
1148 | 1148 |
|
1149 |
-int cli_isutf8(const char *buf, unsigned int len) |
|
1149 |
+int cli_isutf8(const char* buf, unsigned int len) |
|
1150 | 1150 |
{ |
1151 | 1151 |
unsigned int i, j; |
1152 | 1152 |
|
... | ... |
@@ -34,24 +34,193 @@ |
34 | 34 |
#include "hashtab.h" |
35 | 35 |
#include "htmlnorm.h" |
36 | 36 |
|
37 |
-#define UCS4_1234 "UCS-4BE" |
|
38 |
-#define UCS4_4321 "UCS-4LE" |
|
39 |
-#define UCS4_2143 "UCS4" |
|
40 |
-#define UCS4_3412 "UCS-4" |
|
41 |
-#define UTF16_BE "UTF-16BE" |
|
42 |
-#define UTF16_LE "UTF-16LE" |
|
43 |
-#define UTF8 "UTF-8" |
|
44 |
-#define UNDECIDED_32_1234 UCS4_1234 |
|
45 |
-#define UNDECIDED_32_4321 UCS4_4321 |
|
46 |
-#define UNDECIDED_32_2143 UCS4_2143 |
|
47 |
-#define UNDECIDED_32_3412 UCS4_3412 |
|
48 |
-#define UNDECIDED_16_BE UTF16_BE |
|
49 |
-#define UNDECIDED_16_LE UTF16_LE |
|
50 |
-#define UNDECIDED_8 "ISO-8859-1" |
|
51 |
- |
|
52 |
-#define CODEPAGE_ISO8859_1 28591 |
|
53 |
-#define CODEPAGE_UTF16_LE 1200 |
|
54 |
-#define CODEPAGE_UTF16_BE 1201 |
|
37 |
+// clang-format off |
|
38 |
+#define UCS4_1234 "UCS-4BE" |
|
39 |
+#define UCS4_4321 "UCS-4LE" |
|
40 |
+#define UCS4_2143 "UCS4" |
|
41 |
+#define UCS4_3412 "UCS-4" |
|
42 |
+#define UTF16_BE "UTF-16BE" |
|
43 |
+#define UTF16_LE "UTF-16LE" |
|
44 |
+#define UTF8 "UTF-8" |
|
45 |
+#define UNDECIDED_32_1234 UCS4_1234 |
|
46 |
+#define UNDECIDED_32_4321 UCS4_4321 |
|
47 |
+#define UNDECIDED_32_2143 UCS4_2143 |
|
48 |
+#define UNDECIDED_32_3412 UCS4_3412 |
|
49 |
+#define UNDECIDED_16_BE UTF16_BE |
|
50 |
+#define UNDECIDED_16_LE UTF16_LE |
|
51 |
+#define UNDECIDED_8 "ISO-8859-1" |
|
52 |
+ |
|
53 |
+#define CODEPAGE_JAPANESE_SHIFT_JIS 932 /* Japanese Shift-JIS */ |
|
54 |
+#define CODEPAGE_KOREAN_UNIFIED 949 /* Korean Unified Code */ |
|
55 |
+#define CODEPAGE_UTF16_LE 1200 /* UTF16 Little Endian */ |
|
56 |
+#define CODEPAGE_UTF16_BE 1201 /* UTF16 Big Endian */ |
|
57 |
+#define CODEPAGE_US_7BIT_ASCII 20127 /* US-ASCII (7-bit) */ |
|
58 |
+#define CODEPAGE_ISO8859_1 28591 /* ISO 8859-1 Latin 1; Western European (ISO) */ |
|
59 |
+#define CODEPAGE_UTF8 65001 /* UTF-8 */ |
|
60 |
+// clang-format on |
|
61 |
+ |
|
62 |
+/* string conversion */ |
|
63 |
+struct codepage_entry { |
|
64 |
+ uint16_t codepage; |
|
65 |
+ const char *encoding; |
|
66 |
+}; |
|
67 |
+ |
|
68 |
+#define NUMCODEPAGES (sizeof(codepage_entries) / sizeof(struct codepage_entry)) |
|
69 |
+/* MAINTAIN - the array in codepage value sorted order */ |
|
70 |
+static const struct codepage_entry codepage_entries[] = { |
|
71 |
+ {37, "IBM037"}, /* IBM EBCDIC US-Canada */ |
|
72 |
+ {437, "IBM437"}, /* OEM United States */ |
|
73 |
+ {500, "IBM500"}, /* IBM EBCDIC International */ |
|
74 |
+ {708, "ASMO-708"}, /* Arabic (ASMO 708) */ |
|
75 |
+ {709, NULL}, /* Arabic (ASMO-449+, BCON V4) */ |
|
76 |
+ {710, NULL}, /* Arabic - Transparent Arabic */ |
|
77 |
+ {720, NULL}, /* Arabic (Transparent ASMO); Arabic (DOS) */ |
|
78 |
+ {737, NULL}, /* OEM Greek (formerly 437G); Greek (DOS) */ |
|
79 |
+ {775, "IBM775"}, /* OEM Baltic; Baltic (DOS) */ |
|
80 |
+ {850, "IBM850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ |
|
81 |
+ {852, "IBM852"}, /* OEM Latin 2; Central European (DOS) */ |
|
82 |
+ {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ |
|
83 |
+ {857, "IBM857"}, /* OEM Turkish; Turkish (DOS) */ |
|
84 |
+ {858, NULL}, /* OEM Multilingual Latin 1 + Euro symbol */ |
|
85 |
+ {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ |
|
86 |
+ {861, "IBM861"}, /* OEM Icelandic; Icelandic (DOS) */ |
|
87 |
+ {862, NULL}, /* OEM Hebrew; Hebrew (DOS) */ |
|
88 |
+ {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ |
|
89 |
+ {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ |
|
90 |
+ {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ |
|
91 |
+ {866, "CP866"}, /* OEM Russian; Cyrillic (DOS) */ |
|
92 |
+ {869, "IBM869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ |
|
93 |
+ {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ |
|
94 |
+ {874, "WINDOWS-874"}, /* ANSI/OEM Thai (ISO 8859-11); Thai (Windows) */ |
|
95 |
+ {875, "CP875"}, /* IBM EBCDIC Greek Modern */ |
|
96 |
+ {932, "SHIFT_JIS"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ |
|
97 |
+ {936, "GB2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ |
|
98 |
+ {949, "CP949"}, /* ANSI/OEM Korean (Unified Hangul Code) */ |
|
99 |
+ {950, "BIG5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ |
|
100 |
+ {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ |
|
101 |
+ {1047, NULL}, /* IBM EBCDIC Latin 1/Open System */ |
|
102 |
+ {1140, NULL}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ |
|
103 |
+ {1141, NULL}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ |
|
104 |
+ {1142, NULL}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ |
|
105 |
+ {1143, NULL}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ |
|
106 |
+ {1144, NULL}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ |
|
107 |
+ {1145, NULL}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ |
|
108 |
+ {1146, NULL}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ |
|
109 |
+ {1147, NULL}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ |
|
110 |
+ {1148, NULL}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ |
|
111 |
+ {1149, NULL}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ |
|
112 |
+ {1200, "UTF-16LE"}, /* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */ |
|
113 |
+ {1201, "UTF-16BE"}, /* Unicode UTF-16, big endian byte order; available only to managed applications */ |
|
114 |
+ {1250, "WINDOWS-1250"}, /* ANSI Central European; Central European (Windows) */ |
|
115 |
+ {1251, "WINDOWS-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ |
|
116 |
+ {1252, "WINDOWS-1252"}, /* ANSI Latin 1; Western European (Windows) */ |
|
117 |
+ {1253, "WINDOWS-1253"}, /* ANSI Greek; Greek (Windows) */ |
|
118 |
+ {1254, "WINDOWS-1254"}, /* ANSI Turkish; Turkish (Windows) */ |
|
119 |
+ {1255, "WINDOWS-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ |
|
120 |
+ {1256, "WINDOWS-1256"}, /* ANSI Arabic; Arabic (Windows) */ |
|
121 |
+ {1257, "WINDOWS-1257"}, /* ANSI Baltic; Baltic (Windows) */ |
|
122 |
+ {1258, "WINDOWS-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ |
|
123 |
+ {1361, "JOHAB"}, /* Korean (Johab) */ |
|
124 |
+ {10000, "MACINTOSH"}, /* MAC Roman; Western European (Mac) */ |
|
125 |
+ {10001, NULL}, /* Japanese (Mac) */ |
|
126 |
+ {10002, NULL}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ |
|
127 |
+ {10003, NULL}, /* Korean (Mac) */ |
|
128 |
+ {10004, NULL}, /* Arabic (Mac) */ |
|
129 |
+ {10005, NULL}, /* Hebrew (Mac) */ |
|
130 |
+ {10006, NULL}, /* Greek (Mac) */ |
|
131 |
+ {10007, NULL}, /* Cyrillic (Mac) */ |
|
132 |
+ {10008, NULL}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ |
|
133 |
+ {10010, NULL}, /* Romanian (Mac) */ |
|
134 |
+ {10017, NULL}, /* Ukrainian (Mac) */ |
|
135 |
+ {10021, NULL}, /* Thai (Mac) */ |
|
136 |
+ {10029, NULL}, /* MAC Latin 2; Central European (Mac) */ |
|
137 |
+ {10079, NULL}, /* Icelandic (Mac) */ |
|
138 |
+ {10081, NULL}, /* Turkish (Mac) */ |
|
139 |
+ {10082, NULL}, /* Croatian (Mac) */ |
|
140 |
+ {12000, "UTF-32LE"}, /* Unicode UTF-32, little endian byte order; available only to managed applications */ |
|
141 |
+ {12001, "UTF-32BE"}, /* Unicode UTF-32, big endian byte order; available only to managed applications */ |
|
142 |
+ {20000, NULL}, /* CNS Taiwan; Chinese Traditional (CNS) */ |
|
143 |
+ {20001, NULL}, /* TCA Taiwan */ |
|
144 |
+ {20002, NULL}, /* Eten Taiwan; Chinese Traditional (Eten) */ |
|
145 |
+ {20003, NULL}, /* IBM5550 Taiwan */ |
|
146 |
+ {20004, NULL}, /* TeleText Taiwan */ |
|
147 |
+ {20005, NULL}, /* Wang Taiwan */ |
|
148 |
+ {20105, NULL}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ |
|
149 |
+ {20106, NULL}, /* IA5 German (7-bit) */ |
|
150 |
+ {20107, NULL}, /* IA5 Swedish (7-bit) */ |
|
151 |
+ {20108, NULL}, /* IA5 Norwegian (7-bit) */ |
|
152 |
+ {20127, "US-ASCII"}, /* US-ASCII (7-bit) */ |
|
153 |
+ {20261, NULL}, /* T.61 */ |
|
154 |
+ {20269, NULL}, /* ISO 6937 Non-Spacing Accent */ |
|
155 |
+ {20273, "IBM273"}, /* IBM EBCDIC Germany */ |
|
156 |
+ {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ |
|
157 |
+ {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ |
|
158 |
+ {20280, "IBM280"}, /* IBM EBCDIC Italy */ |
|
159 |
+ {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ |
|
160 |
+ {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ |
|
161 |
+ {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ |
|
162 |
+ {20297, "IBM297"}, /* IBM EBCDIC France */ |
|
163 |
+ {20420, "IBM420"}, /* IBM EBCDIC Arabic */ |
|
164 |
+ {20423, "IBM423"}, /* IBM EBCDIC Greek */ |
|
165 |
+ {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ |
|
166 |
+ {20833, NULL}, /* IBM EBCDIC Korean Extended */ |
|
167 |
+ {20838, NULL}, /* IBM EBCDIC Thai */ |
|
168 |
+ {20866, "KOI8-R"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ |
|
169 |
+ {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ |
|
170 |
+ {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ |
|
171 |
+ {20905, "IBM905"}, /* IBM EBCDIC Turkish */ |
|
172 |
+ {20924, NULL}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ |
|
173 |
+ {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0212-1990) */ |
|
174 |
+ {20936, NULL}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ |
|
175 |
+ {20949, NULL}, /* Korean Wansung */ |
|
176 |
+ {21025, "CP1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ |
|
177 |
+ {21027, NULL}, /* (deprecated) */ |
|
178 |
+ {21866, "KOI8-U"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ |
|
179 |
+ {28591, "ISO-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ |
|
180 |
+ {28592, "ISO-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ |
|
181 |
+ {28593, "ISO-8859-3"}, /* ISO 8859-3 Latin 3 */ |
|
182 |
+ {28594, "ISO-8859-4"}, /* ISO 8859-4 Baltic */ |
|
183 |
+ {28595, "ISO-8859-5"}, /* ISO 8859-5 Cyrillic */ |
|
184 |
+ {28596, "ISO-8859-6"}, /* ISO 8859-6 Arabic */ |
|
185 |
+ {28597, "ISO-8859-7"}, /* ISO 8859-7 Greek */ |
|
186 |
+ {28598, "ISO-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ |
|
187 |
+ {28599, "ISO-8859-9"}, /* ISO 8859-9 Turkish */ |
|
188 |
+ {28603, "ISO-8859-13"}, /* ISO 8859-13 Estonian */ |
|
189 |
+ {28605, "ISO-8859-15"}, /* ISO 8859-15 Latin 9 */ |
|
190 |
+ {29001, NULL}, /* Europa 3 */ |
|
191 |
+ {38598, NULL}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ |
|
192 |
+ {50220, "ISO-2022-JP"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) (guess) */ |
|
193 |
+ {50221, "ISO-2022-JP-2"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) (guess) */ |
|
194 |
+ {50222, "ISO-2022-JP-3"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) (guess) */ |
|
195 |
+ {50225, "ISO-2022-KR"}, /* ISO 2022 Korean */ |
|
196 |
+ {50227, NULL}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ |
|
197 |
+ {50229, NULL}, /* ISO 2022 Traditional Chinese */ |
|
198 |
+ {50930, NULL}, /* EBCDIC Japanese (Katakana) Extended */ |
|
199 |
+ {50931, NULL}, /* EBCDIC US-Canada and Japanese */ |
|
200 |
+ {50933, NULL}, /* EBCDIC Korean Extended and Korean */ |
|
201 |
+ {50935, NULL}, /* EBCDIC Simplified Chinese Extended and Simplified Chinese */ |
|
202 |
+ {50936, NULL}, /* EBCDIC Simplified Chinese */ |
|
203 |
+ {50937, NULL}, /* EBCDIC US-Canada and Traditional Chinese */ |
|
204 |
+ {50939, NULL}, /* EBCDIC Japanese (Latin) Extended and Japanese */ |
|
205 |
+ {51932, "EUC-JP"}, /* EUC Japanese */ |
|
206 |
+ {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ |
|
207 |
+ {51949, "EUC-KR"}, /* EUC Korean */ |
|
208 |
+ {51950, NULL}, /* EUC Traditional Chinese */ |
|
209 |
+ {52936, NULL}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ |
|
210 |
+ {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ |
|
211 |
+ {57002, NULL}, /* ISCII Devanagari */ |
|
212 |
+ {57003, NULL}, /* ISCII Bengali */ |
|
213 |
+ {57004, NULL}, /* ISCII Tamil */ |
|
214 |
+ {57005, NULL}, /* ISCII Telugu */ |
|
215 |
+ {57006, NULL}, /* ISCII Assamese */ |
|
216 |
+ {57007, NULL}, /* ISCII Oriya */ |
|
217 |
+ {57008, NULL}, /* ISCII Kannada */ |
|
218 |
+ {57009, NULL}, /* ISCII Malayalam */ |
|
219 |
+ {57010, NULL}, /* ISCII Gujarati */ |
|
220 |
+ {57011, NULL}, /* ISCII Punjabi */ |
|
221 |
+ {65000, "UTF-7"}, /* Unicode (UTF-7) */ |
|
222 |
+ {65001, "UTF-8"} /* Unicode (UTF-8) */ |
|
223 |
+}; |
|
55 | 224 |
|
56 | 225 |
#define MAX_ENTITY_SIZE 22 |
57 | 226 |
|
... | ... |
@@ -90,10 +259,10 @@ int encoding_normalize_toascii(const m_area_t* in_m_area, const char* initial_en |
90 | 90 |
*/ |
91 | 91 |
cl_error_t cli_codepage_to_utf8(char* in, size_t in_size, uint16_t codepage, char** out, size_t* out_size); |
92 | 92 |
|
93 |
-char *cli_utf16toascii(const char *str, unsigned int length); |
|
93 |
+char* cli_utf16toascii(const char* str, unsigned int length); |
|
94 | 94 |
|
95 |
-char *cli_utf16_to_utf8(const char *utf16, size_t length, encoding_t type); |
|
95 |
+char* cli_utf16_to_utf8(const char* utf16, size_t length, encoding_t type); |
|
96 | 96 |
|
97 |
-int cli_isutf8(const char *buf, unsigned int len); |
|
97 |
+int cli_isutf8(const char* buf, unsigned int len); |
|
98 | 98 |
|
99 | 99 |
#endif |
... | ... |
@@ -47,6 +47,7 @@ |
47 | 47 |
#include "scanners.h" |
48 | 48 |
#include "fmap.h" |
49 | 49 |
#include "json_api.h" |
50 |
+#include "entconv.h" |
|
50 | 51 |
|
51 | 52 |
#if HAVE_JSON |
52 | 53 |
static char * |
... | ... |
@@ -64,7 +65,7 @@ ole2_convert_utf(summary_ctx_t *sctx, char *begin, size_t sz, const char *encodi |
64 | 64 |
UNUSEDPARAM(encoding); |
65 | 65 |
#endif |
66 | 66 |
/* applies in the both case */ |
67 |
- if (sctx->codepage == 20127 || sctx->codepage == 65001) { |
|
67 |
+ if (sctx->codepage == 20127 || sctx->codepage == CODEPAGE_UTF8) { |
|
68 | 68 |
char *track; |
69 | 69 |
size_t bcnt, scnt; |
70 | 70 |
|
... | ... |
@@ -74,7 +75,7 @@ ole2_convert_utf(summary_ctx_t *sctx, char *begin, size_t sz, const char *encodi |
74 | 74 |
memcpy(outbuf, begin, sz); |
75 | 75 |
|
76 | 76 |
track = outbuf + sz - 1; |
77 |
- if ((sctx->codepage == 65001) && (*track & 0x80)) { /* UTF-8 with a most significant bit */ |
|
77 |
+ if ((sctx->codepage == CODEPAGE_UTF8) && (*track & 0x80)) { /* UTF-8 with a most significant bit */ |
|
78 | 78 |
/* locate the start of the last character */ |
79 | 79 |
for (bcnt = 1; (track != outbuf); track--, bcnt++) { |
80 | 80 |
if (((uint8_t)*track & 0xC0) != 0x80) |
... | ... |
@@ -27,169 +27,6 @@ |
27 | 27 |
#include "others.h" |
28 | 28 |
#include "uniq.h" |
29 | 29 |
|
30 |
-/* string conversion */ |
|
31 |
-struct codepage_entry { |
|
32 |
- uint16_t codepage; |
|
33 |
- const char *encoding; |
|
34 |
-}; |
|
35 |
- |
|
36 |
-#define NUMCODEPAGES (sizeof(codepage_entries) / sizeof(struct codepage_entry)) |
|
37 |
-/* MAINTAIN - the array in codepage value sorted order */ |
|
38 |
-static const struct codepage_entry codepage_entries[] = { |
|
39 |
- {37, "IBM037"}, /* IBM EBCDIC US-Canada */ |
|
40 |
- {437, "IBM437"}, /* OEM United States */ |
|
41 |
- {500, "IBM500"}, /* IBM EBCDIC International */ |
|
42 |
- {708, "ASMO-708"}, /* Arabic (ASMO 708) */ |
|
43 |
- {709, NULL}, /* Arabic (ASMO-449+, BCON V4) */ |
|
44 |
- {710, NULL}, /* Arabic - Transparent Arabic */ |
|
45 |
- {720, NULL}, /* Arabic (Transparent ASMO); Arabic (DOS) */ |
|
46 |
- {737, NULL}, /* OEM Greek (formerly 437G); Greek (DOS) */ |
|
47 |
- {775, "IBM775"}, /* OEM Baltic; Baltic (DOS) */ |
|
48 |
- {850, "IBM850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ |
|
49 |
- {852, "IBM852"}, /* OEM Latin 2; Central European (DOS) */ |
|
50 |
- {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ |
|
51 |
- {857, "IBM857"}, /* OEM Turkish; Turkish (DOS) */ |
|
52 |
- {858, NULL}, /* OEM Multilingual Latin 1 + Euro symbol */ |
|
53 |
- {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ |
|
54 |
- {861, "IBM861"}, /* OEM Icelandic; Icelandic (DOS) */ |
|
55 |
- {862, NULL}, /* OEM Hebrew; Hebrew (DOS) */ |
|
56 |
- {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ |
|
57 |
- {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ |
|
58 |
- {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ |
|
59 |
- {866, "CP866"}, /* OEM Russian; Cyrillic (DOS) */ |
|
60 |
- {869, "IBM869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ |
|
61 |
- {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ |
|
62 |
- {874, "WINDOWS-874"}, /* ANSI/OEM Thai (ISO 8859-11); Thai (Windows) */ |
|
63 |
- {875, "CP875"}, /* IBM EBCDIC Greek Modern */ |
|
64 |
- {932, "SHIFT_JIS"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ |
|
65 |
- {936, "GB2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ |
|
66 |
- {949, "CP949"}, /* ANSI/OEM Korean (Unified Hangul Code) */ |
|
67 |
- {950, "BIG5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ |
|
68 |
- {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ |
|
69 |
- {1047, NULL}, /* IBM EBCDIC Latin 1/Open System */ |
|
70 |
- {1140, NULL}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ |
|
71 |
- {1141, NULL}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ |
|
72 |
- {1142, NULL}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ |
|
73 |
- {1143, NULL}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ |
|
74 |
- {1144, NULL}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ |
|
75 |
- {1145, NULL}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ |
|
76 |
- {1146, NULL}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ |
|
77 |
- {1147, NULL}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ |
|
78 |
- {1148, NULL}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ |
|
79 |
- {1149, NULL}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ |
|
80 |
- {1200, "UTF-16LE"}, /* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */ |
|
81 |
- {1201, "UTF-16BE"}, /* Unicode UTF-16, big endian byte order; available only to managed applications */ |
|
82 |
- {1250, "WINDOWS-1250"}, /* ANSI Central European; Central European (Windows) */ |
|
83 |
- {1251, "WINDOWS-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ |
|
84 |
- {1252, "WINDOWS-1252"}, /* ANSI Latin 1; Western European (Windows) */ |
|
85 |
- {1253, "WINDOWS-1253"}, /* ANSI Greek; Greek (Windows) */ |
|
86 |
- {1254, "WINDOWS-1254"}, /* ANSI Turkish; Turkish (Windows) */ |
|
87 |
- {1255, "WINDOWS-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ |
|
88 |
- {1256, "WINDOWS-1256"}, /* ANSI Arabic; Arabic (Windows) */ |
|
89 |
- {1257, "WINDOWS-1257"}, /* ANSI Baltic; Baltic (Windows) */ |
|
90 |
- {1258, "WINDOWS-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ |
|
91 |
- {1361, "JOHAB"}, /* Korean (Johab) */ |
|
92 |
- {10000, "MACINTOSH"}, /* MAC Roman; Western European (Mac) */ |
|
93 |
- {10001, NULL}, /* Japanese (Mac) */ |
|
94 |
- {10002, NULL}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ |
|
95 |
- {10003, NULL}, /* Korean (Mac) */ |
|
96 |
- {10004, NULL}, /* Arabic (Mac) */ |
|
97 |
- {10005, NULL}, /* Hebrew (Mac) */ |
|
98 |
- {10006, NULL}, /* Greek (Mac) */ |
|
99 |
- {10007, NULL}, /* Cyrillic (Mac) */ |
|
100 |
- {10008, NULL}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ |
|
101 |
- {10010, NULL}, /* Romanian (Mac) */ |
|
102 |
- {10017, NULL}, /* Ukrainian (Mac) */ |
|
103 |
- {10021, NULL}, /* Thai (Mac) */ |
|
104 |
- {10029, NULL}, /* MAC Latin 2; Central European (Mac) */ |
|
105 |
- {10079, NULL}, /* Icelandic (Mac) */ |
|
106 |
- {10081, NULL}, /* Turkish (Mac) */ |
|
107 |
- {10082, NULL}, /* Croatian (Mac) */ |
|
108 |
- {12000, "UTF-32LE"}, /* Unicode UTF-32, little endian byte order; available only to managed applications */ |
|
109 |
- {12001, "UTF-32BE"}, /* Unicode UTF-32, big endian byte order; available only to managed applications */ |
|
110 |
- {20000, NULL}, /* CNS Taiwan; Chinese Traditional (CNS) */ |
|
111 |
- {20001, NULL}, /* TCA Taiwan */ |
|
112 |
- {20002, NULL}, /* Eten Taiwan; Chinese Traditional (Eten) */ |
|
113 |
- {20003, NULL}, /* IBM5550 Taiwan */ |
|
114 |
- {20004, NULL}, /* TeleText Taiwan */ |
|
115 |
- {20005, NULL}, /* Wang Taiwan */ |
|
116 |
- {20105, NULL}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ |
|
117 |
- {20106, NULL}, /* IA5 German (7-bit) */ |
|
118 |
- {20107, NULL}, /* IA5 Swedish (7-bit) */ |
|
119 |
- {20108, NULL}, /* IA5 Norwegian (7-bit) */ |
|
120 |
- {20127, "US-ASCII"}, /* US-ASCII (7-bit) */ |
|
121 |
- {20261, NULL}, /* T.61 */ |
|
122 |
- {20269, NULL}, /* ISO 6937 Non-Spacing Accent */ |
|
123 |
- {20273, "IBM273"}, /* IBM EBCDIC Germany */ |
|
124 |
- {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ |
|
125 |
- {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ |
|
126 |
- {20280, "IBM280"}, /* IBM EBCDIC Italy */ |
|
127 |
- {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ |
|
128 |
- {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ |
|
129 |
- {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ |
|
130 |
- {20297, "IBM297"}, /* IBM EBCDIC France */ |
|
131 |
- {20420, "IBM420"}, /* IBM EBCDIC Arabic */ |
|
132 |
- {20423, "IBM423"}, /* IBM EBCDIC Greek */ |
|
133 |
- {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ |
|
134 |
- {20833, NULL}, /* IBM EBCDIC Korean Extended */ |
|
135 |
- {20838, NULL}, /* IBM EBCDIC Thai */ |
|
136 |
- {20866, "KOI8-R"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ |
|
137 |
- {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ |
|
138 |
- {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ |
|
139 |
- {20905, "IBM905"}, /* IBM EBCDIC Turkish */ |
|
140 |
- {20924, NULL}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ |
|
141 |
- {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0212-1990) */ |
|
142 |
- {20936, NULL}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ |
|
143 |
- {20949, NULL}, /* Korean Wansung */ |
|
144 |
- {21025, "CP1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ |
|
145 |
- {21027, NULL}, /* (deprecated) */ |
|
146 |
- {21866, "KOI8-U"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ |
|
147 |
- {28591, "ISO-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ |
|
148 |
- {28592, "ISO-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ |
|
149 |
- {28593, "ISO-8859-3"}, /* ISO 8859-3 Latin 3 */ |
|
150 |
- {28594, "ISO-8859-4"}, /* ISO 8859-4 Baltic */ |
|
151 |
- {28595, "ISO-8859-5"}, /* ISO 8859-5 Cyrillic */ |
|
152 |
- {28596, "ISO-8859-6"}, /* ISO 8859-6 Arabic */ |
|
153 |
- {28597, "ISO-8859-7"}, /* ISO 8859-7 Greek */ |
|
154 |
- {28598, "ISO-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ |
|
155 |
- {28599, "ISO-8859-9"}, /* ISO 8859-9 Turkish */ |
|
156 |
- {28603, "ISO-8859-13"}, /* ISO 8859-13 Estonian */ |
|
157 |
- {28605, "ISO-8859-15"}, /* ISO 8859-15 Latin 9 */ |
|
158 |
- {29001, NULL}, /* Europa 3 */ |
|
159 |
- {38598, NULL}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ |
|
160 |
- {50220, "ISO-2022-JP"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) (guess) */ |
|
161 |
- {50221, "ISO-2022-JP-2"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) (guess) */ |
|
162 |
- {50222, "ISO-2022-JP-3"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) (guess) */ |
|
163 |
- {50225, "ISO-2022-KR"}, /* ISO 2022 Korean */ |
|
164 |
- {50227, NULL}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ |
|
165 |
- {50229, NULL}, /* ISO 2022 Traditional Chinese */ |
|
166 |
- {50930, NULL}, /* EBCDIC Japanese (Katakana) Extended */ |
|
167 |
- {50931, NULL}, /* EBCDIC US-Canada and Japanese */ |
|
168 |
- {50933, NULL}, /* EBCDIC Korean Extended and Korean */ |
|
169 |
- {50935, NULL}, /* EBCDIC Simplified Chinese Extended and Simplified Chinese */ |
|
170 |
- {50936, NULL}, /* EBCDIC Simplified Chinese */ |
|
171 |
- {50937, NULL}, /* EBCDIC US-Canada and Traditional Chinese */ |
|
172 |
- {50939, NULL}, /* EBCDIC Japanese (Latin) Extended and Japanese */ |
|
173 |
- {51932, "EUC-JP"}, /* EUC Japanese */ |
|
174 |
- {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ |
|
175 |
- {51949, "EUC-KR"}, /* EUC Korean */ |
|
176 |
- {51950, NULL}, /* EUC Traditional Chinese */ |
|
177 |
- {52936, NULL}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ |
|
178 |
- {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ |
|
179 |
- {57002, NULL}, /* ISCII Devanagari */ |
|
180 |
- {57003, NULL}, /* ISCII Bengali */ |
|
181 |
- {57004, NULL}, /* ISCII Tamil */ |
|
182 |
- {57005, NULL}, /* ISCII Telugu */ |
|
183 |
- {57006, NULL}, /* ISCII Assamese */ |
|
184 |
- {57007, NULL}, /* ISCII Oriya */ |
|
185 |
- {57008, NULL}, /* ISCII Kannada */ |
|
186 |
- {57009, NULL}, /* ISCII Malayalam */ |
|
187 |
- {57010, NULL}, /* ISCII Gujarati */ |
|
188 |
- {57011, NULL}, /* ISCII Punjabi */ |
|
189 |
- {65000, "UTF-7"}, /* Unicode (UTF-7) */ |
|
190 |
- {65001, "UTF-8"} /* Unicode (UTF-8) */ |
|
191 |
-}; |
|
192 |
- |
|
193 | 30 |
#if HAVE_JSON |
194 | 31 |
|
195 | 32 |
#define PROPCNTLIMIT 25 |
... | ... |
@@ -24,6 +24,7 @@ |
24 | 24 |
#include "../libclamav/dsig.h" |
25 | 25 |
#include "../libclamav/fpu.h" |
26 | 26 |
#include "../platform.h" |
27 |
+#include "../libclamav/entconv.h" |
|
27 | 28 |
#include "checks.h" |
28 | 29 |
|
29 | 30 |
static int fpu_words = FPU_ENDIAN_INITME; |
... | ... |
@@ -983,6 +984,54 @@ START_TEST(test_sanitize_path) |
983 | 983 |
} |
984 | 984 |
END_TEST |
985 | 985 |
|
986 |
+START_TEST(test_cli_codepage_to_utf8) |
|
987 |
+{ |
|
988 |
+ cl_error_t ret; |
|
989 |
+ char *utf8 = NULL; |
|
990 |
+ size_t utf8_size = 0; |
|
991 |
+ |
|
992 |
+ ret = cli_codepage_to_utf8("\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD", 10, CODEPAGE_JAPANESE_SHIFT_JIS, &utf8, &utf8_size); |
|
993 |
+ ck_assert_msg(CL_SUCCESS == ret, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_JAPANESE_SHIFT_JIS to UTF8: ret != SUCCESS!"); |
|
994 |
+ ck_assert_msg(NULL != utf8, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_JAPANESE_SHIFT_JIS to UTF8: utf8 pointer is NULL!"); |
|
995 |
+ ck_assert_msg(0 == strcmp(utf8, "こんにちは"), "test_cli_codepage_to_utf8: '%s' doesn't match '%s'", utf8, "こんにちは"); |
|
996 |
+ |
|
997 |
+ if (NULL != utf8) { |
|
998 |
+ free(utf8); |
|
999 |
+ utf8 = NULL; |
|
1000 |
+ } |
|
1001 |
+ |
|
1002 |
+ ret = cli_codepage_to_utf8("\x00\x48\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00\x77\x00\x6f\x00\x72\x00\x6c\x00\x64\x00\x21\x00\x00", 26, CODEPAGE_UTF16_BE, &utf8, &utf8_size); |
|
1003 |
+ ck_assert_msg(CL_SUCCESS == ret, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_BE to UTF8: ret != SUCCESS!"); |
|
1004 |
+ ck_assert_msg(NULL != utf8, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_BE to UTF8: utf8 pointer is NULL!"); |
|
1005 |
+ ck_assert_msg(0 == strcmp(utf8, "Hello world!"), "test_cli_codepage_to_utf8: '%s' doesn't match '%s'", utf8, "Hello world!"); |
|
1006 |
+ |
|
1007 |
+ if (NULL != utf8) { |
|
1008 |
+ free(utf8); |
|
1009 |
+ utf8 = NULL; |
|
1010 |
+ } |
|
1011 |
+ |
|
1012 |
+ ret = cli_codepage_to_utf8("\x00\x48\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00\x77\x00\x6f\x00\x72\x00\x6c\x00\x64\x00\x21", 24, CODEPAGE_UTF16_BE, &utf8, &utf8_size); |
|
1013 |
+ ck_assert_msg(CL_SUCCESS == ret, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_BE to UTF8: ret != SUCCESS!"); |
|
1014 |
+ ck_assert_msg(NULL != utf8, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_BE to UTF8: utf8 pointer is NULL!"); |
|
1015 |
+ ck_assert_msg(0 == strcmp(utf8, "Hello world!"), "test_cli_codepage_to_utf8: '%s' doesn't match '%s'", utf8, "Hello world!"); |
|
1016 |
+ |
|
1017 |
+ if (NULL != utf8) { |
|
1018 |
+ free(utf8); |
|
1019 |
+ utf8 = NULL; |
|
1020 |
+ } |
|
1021 |
+ |
|
1022 |
+ ret = cli_codepage_to_utf8("\x48\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00\x77\x00\x6f\x00\x72\x00\x6c\x00\x64\x00\x21\x00\x00\x00", 26, CODEPAGE_UTF16_LE, &utf8, &utf8_size); |
|
1023 |
+ ck_assert_msg(CL_SUCCESS == ret, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_LE to UTF8: ret != SUCCESS!"); |
|
1024 |
+ ck_assert_msg(NULL != utf8, "test_cli_codepage_to_utf8: Failed to convert CODEPAGE_UTF16_LE to UTF8: utf8 pointer is NULL!"); |
|
1025 |
+ ck_assert_msg(0 == strcmp(utf8, "Hello world!"), "test_cli_codepage_to_utf8: '%s' doesn't match '%s'", utf8, "Hello world!"); |
|
1026 |
+ |
|
1027 |
+ if (NULL != utf8) { |
|
1028 |
+ free(utf8); |
|
1029 |
+ utf8 = NULL; |
|
1030 |
+ } |
|
1031 |
+} |
|
1032 |
+END_TEST |
|
1033 |
+ |
|
986 | 1034 |
static Suite *test_cli_suite(void) |
987 | 1035 |
{ |
988 | 1036 |
Suite *s = suite_create("cli"); |
... | ... |
@@ -1002,6 +1051,7 @@ static Suite *test_cli_suite(void) |
1002 | 1002 |
|
1003 | 1003 |
suite_add_tcase(s, tc_cli_assorted); |
1004 | 1004 |
tcase_add_test(tc_cli_assorted, test_sanitize_path); |
1005 |
+ tcase_add_test(tc_cli_assorted, test_cli_codepage_to_utf8); |
|
1005 | 1006 |
|
1006 | 1007 |
return s; |
1007 | 1008 |
} |