Commit 2390104d authored by Hye-Shik Chang

Add cheot-ga-keut composed make-up sequence support in EUC-KR codec.

parent c553f429
......@@ -376,13 +376,20 @@ teststring = {
"\xcc\xc7\xce\x2c\x20\xb1\xd7\xb8\xae\xb0\xed\x20\xc0\xce\xc5\xcd"
"\xc7\xc1\xb8\xae\xc6\xc3\x0a\xc8\xaf\xb0\xe6\xc0\xba\x20\xc6\xc4"
"\xc0\xcc\xbd\xe3\xc0\xbb\x20\xbd\xba\xc5\xa9\xb8\xb3\xc6\xc3\xb0"
"\xfa\x20\xbf\xa9\xb7\xc1\x20\xba\xd0\xbe\xdf\xbf\xa1\xbc\xad\xbf"
"\xfa\x20\xbf\xa9\xb7\xaf\x20\xba\xd0\xbe\xdf\xbf\xa1\xbc\xad\xbf"
"\xcd\x20\xb4\xeb\xba\xce\xba\xd0\xc0\xc7\x20\xc7\xc3\xb7\xa7\xc6"
"\xfb\xbf\xa1\xbc\xad\xc0\xc7\x20\xba\xfc\xb8\xa5\x0a\xbe\xd6\xc7"
"\xc3\xb8\xae\xc4\xc9\xc0\xcc\xbc\xc7\x20\xb0\xb3\xb9\xdf\xc0\xbb"
"\x20\xc7\xd2\x20\xbc\xf6\x20\xc0\xd6\xb4\xc2\x20\xc0\xcc\xbb\xf3"
"\xc0\xfb\xc0\xce\x20\xbe\xf0\xbe\xee\xb7\xce\x20\xb8\xb8\xb5\xe9"
"\xbe\xee\xc1\xdd\xb4\xcf\xb4\xd9\x2e\x0a\x0a",
"\xbe\xee\xc1\xdd\xb4\xcf\xb4\xd9\x2e\x0a\x0a\xa1\xd9\xc3\xb9\xb0"
"\xa1\xb3\xa1\x3a\x20\xb3\xaf\xbe\xc6\xb6\xf3\x20\xa4\xd4\xa4\xb6"
"\xa4\xd0\xa4\xd4\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4\xbe\xb1\x7e\x20"
"\xa4\xd4\xa4\xa4\xa4\xd2\xa4\xb7\xc5\xad\x21\x20\xa4\xd4\xa4\xa8"
"\xa4\xd1\xa4\xb7\xb1\xdd\xbe\xf8\xc0\xcc\x20\xc0\xfc\xa4\xd4\xa4"
"\xbe\xa4\xc8\xa4\xb2\xb4\xcf\xb4\xd9\x2e\x20\xa4\xd4\xa4\xb2\xa4"
"\xce\xa4\xaa\x2e\x20\xb1\xd7\xb7\xb1\xb0\xc5\x20\xa4\xd4\xa4\xb7"
"\xa4\xd1\xa4\xb4\xb4\xd9\x2e\x0a",
"\xe2\x97\x8e\x20\xed\x8c\x8c\xec\x9d\xb4\xec\x8d\xac\x28\x50\x79"
"\x74\x68\x6f\x6e\x29\xec\x9d\x80\x20\xeb\xb0\xb0\xec\x9a\xb0\xea"
"\xb8\xb0\x20\xec\x89\xbd\xea\xb3\xa0\x2c\x20\xea\xb0\x95\xeb\xa0"
......@@ -404,7 +411,7 @@ teststring = {
"\xec\x9d\xb8\xed\x84\xb0\xed\x94\x84\xeb\xa6\xac\xed\x8c\x85\x0a"
"\xed\x99\x98\xea\xb2\xbd\xec\x9d\x80\x20\xed\x8c\x8c\xec\x9d\xb4"
"\xec\x8d\xac\xec\x9d\x84\x20\xec\x8a\xa4\xed\x81\xac\xeb\xa6\xbd"
"\xed\x8c\x85\xea\xb3\xbc\x20\xec\x97\xac\xeb\xa0\xa4\x20\xeb\xb6"
"\xed\x8c\x85\xea\xb3\xbc\x20\xec\x97\xac\xeb\x9f\xac\x20\xeb\xb6"
"\x84\xec\x95\xbc\xec\x97\x90\xec\x84\x9c\xec\x99\x80\x20\xeb\x8c"
"\x80\xeb\xb6\x80\xeb\xb6\x84\xec\x9d\x98\x20\xed\x94\x8c\xeb\x9e"
"\xab\xed\x8f\xbc\xec\x97\x90\xec\x84\x9c\xec\x9d\x98\x20\xeb\xb9"
......@@ -413,7 +420,13 @@ teststring = {
"\x84\x20\xed\x95\xa0\x20\xec\x88\x98\x20\xec\x9e\x88\xeb\x8a\x94"
"\x20\xec\x9d\xb4\xec\x83\x81\xec\xa0\x81\xec\x9d\xb8\x20\xec\x96"
"\xb8\xec\x96\xb4\xeb\xa1\x9c\x20\xeb\xa7\x8c\xeb\x93\xa4\xec\x96"
"\xb4\xec\xa4\x8d\xeb\x8b\x88\xeb\x8b\xa4\x2e\x0a\x0a"),
"\xb4\xec\xa4\x8d\xeb\x8b\x88\xeb\x8b\xa4\x2e\x0a\x0a\xe2\x98\x86"
"\xec\xb2\xab\xea\xb0\x80\xeb\x81\x9d\x3a\x20\xeb\x82\xa0\xec\x95"
"\x84\xeb\x9d\xbc\x20\xec\x93\x94\xec\x93\x94\xec\x93\xa9\x7e\x20"
"\xeb\x8b\x81\xed\x81\xbc\x21\x20\xeb\x9c\xbd\xea\xb8\x88\xec\x97"
"\x86\xec\x9d\xb4\x20\xec\xa0\x84\xed\x99\xa5\xeb\x8b\x88\xeb\x8b"
"\xa4\x2e\x20\xeb\xb7\x81\x2e\x20\xea\xb7\xb8\xeb\x9f\xb0\xea\xb1"
"\xb0\x20\xec\x9d\x8e\xeb\x8b\xa4\x2e\x0a"),
'gb18030': (
"\x50\x79\x74\x68\x6f\x6e\xa3\xa8\xc5\xc9\xc9\xad\xa3\xa9\xd3\xef"
"\xd1\xd4\xca\xc7\xd2\xbb\xd6\xd6\xb9\xa6\xc4\xdc\xc7\xbf\xb4\xf3"
......
......@@ -30,6 +30,24 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"),
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"),
("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"),
# composed make-up sequence errors
("\xa4\xd4", "strict", None),
("\xa4\xd4\xa4", "strict", None),
("\xa4\xd4\xa4\xb6", "strict", None),
("\xa4\xd4\xa4\xb6\xa4", "strict", None),
("\xa4\xd4\xa4\xb6\xa4\xd0", "strict", None),
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", u"\uc4d4"),
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", u"\uc4d4x"),
("a\xa4\xd4\xa4\xb6\xa4", "replace", u"a\ufffd"),
("\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
("\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
("\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
("\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", u"\ufffd"),
("\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", u"\ufffd"),
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", u"\ufffd"),
("\xc1\xc4", "strict", u"\uc894"),
)
class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
......
......@@ -20,6 +20,10 @@ class TestEUCKRMap(test_multibytecodec_support.TestBase_Mapping,
encoding = 'euc_kr'
mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-KR.TXT'
# A4D4 HANGUL FILLER indicates the beginning of an 8-byte make-up sequence.
pass_enctest = [('\xa4\xd4', u'\u3164')]
pass_dectest = [('\xa4\xd4', u'\u3164')]
class TestJOHABMap(test_multibytecodec_support.TestBase_Mapping,
unittest.TestCase):
......
......@@ -240,6 +240,9 @@ Core and builtins
Library
-------
- EUC-KR codec now handles the cheot-ga-keut composed make-up hangul
syllables.
- GB18030 codec now can encode additional two-byte characters that
are missing in GBK.
......
......@@ -11,6 +11,26 @@
* EUC-KR codec
*/
#define EUCKR_JAMO_FIRSTBYTE 0xA4
#define EUCKR_JAMO_FILLER 0xD4
/*
 * Choseong (leading consonant) lookup for the encoder: entry i is the
 * byte emitted after EUCKR_JAMO_FIRSTBYTE for choseong index i, where
 * i = (syllable - 0xac00) / 588.  Rows are grouped by high nibble.
 */
static const unsigned char u2cgk_choseong[19] = {
    /* 0xAx */
    0xA1, 0xA2, 0xA4, 0xA7, 0xA8, 0xA9,
    /* 0xBx */
    0xB1, 0xB2, 0xB3, 0xB5, 0xB6, 0xB7, 0xB8,
    0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE
};
/*
 * Jungseong (vowel) lookup for the encoder: entry i is the byte emitted
 * after EUCKR_JAMO_FIRSTBYTE for vowel index i, where
 * i = ((syllable - 0xac00) / 28) % 21.  The values form the contiguous
 * run 0xBF..0xD3.
 */
static const unsigned char u2cgk_jungseong[21] = {
    0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3
};
/*
 * Jongseong (trailing consonant) lookup for the encoder: entry i is the
 * byte emitted after EUCKR_JAMO_FIRSTBYTE for trailing index i, where
 * i = (syllable - 0xac00) % 28.  Index 0 holds 0xD4, the same value as
 * EUCKR_JAMO_FILLER.
 */
static const unsigned char u2cgk_jongseong[28] = {
    /* index 0 */
    0xD4,
    /* 0xAx */
    0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    /* 0xBx */
    0xB0, 0xB1, 0xB2, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE
};
ENCODER(euc_kr)
{
while (inleft > 0) {
......@@ -28,17 +48,57 @@ ENCODER(euc_kr)
TRYMAP_ENC(cp949, code, c);
else return 1;
if (code & 0x8000) /* MSB set: CP949 */
return 1;
if ((code & 0x8000) == 0) {
/* KS X 1001 coded character */
OUT1((code >> 8) | 0x80)
OUT2((code & 0xFF) | 0x80)
NEXT(1, 2)
}
else { /* Mapping is found in CP949 extension,
* but we encode it in KS X 1001:1998 Annex 3,
* make-up sequence for EUC-KR. */
OUT1((code >> 8) | 0x80)
OUT2((code & 0xFF) | 0x80)
NEXT(1, 2)
REQUIRE_OUTBUF(8)
/* syllable composition precedence */
OUT1(EUCKR_JAMO_FIRSTBYTE)
OUT2(EUCKR_JAMO_FILLER)
/* All codepoints in CP949 extension are in unicode
* Hangul Syllable area. */
assert(0xac00 <= c && c <= 0xd7a3);
c -= 0xac00;
OUT3(EUCKR_JAMO_FIRSTBYTE)
OUT4(u2cgk_choseong[c / 588])
NEXT_OUT(4)
OUT1(EUCKR_JAMO_FIRSTBYTE)
OUT2(u2cgk_jungseong[(c / 28) % 21])
OUT3(EUCKR_JAMO_FIRSTBYTE)
OUT4(u2cgk_jongseong[c % 28])
NEXT(1, 4)
}
}
return 0;
}
/* Sentinel stored in the reverse tables for bytes with no mapping. */
#define NONE 127
/*
 * Reverse lookup used by the decoder: jamo second byte -> choseong index.
 * Indexed by (byte - 0xA1) for bytes in [A1, BE]; each row covers ten
 * consecutive byte values.
 */
static const unsigned char cgk2u_choseong[] = {
    /* A1..AA */    0,    1, NONE,    2, NONE, NONE,    3,    4,    5, NONE,
    /* AB..B4 */ NONE, NONE, NONE, NONE, NONE, NONE,    6,    7,    8, NONE,
    /* B5..BE */    9,   10,   11,   12,   13,   14,   15,   16,   17,   18
};
/*
 * Reverse lookup used by the decoder: jamo second byte -> jongseong index.
 * Indexed by (byte - 0xA1) for bytes in [A1, BE]; each row covers ten
 * consecutive byte values.
 */
static const unsigned char cgk2u_jongseong[] = {
    /* A1..AA */    1,    2,    3,    4,    5,    6,    7, NONE,    8,    9,
    /* AB..B4 */   10,   11,   12,   13,   14,   15,   16,   17, NONE,   18,
    /* B5..BE */   19,   20,   21,   22, NONE,   23,   24,   25,   26,   27
};
DECODER(euc_kr)
{
while (inleft > 0) {
......@@ -54,13 +114,50 @@ DECODER(euc_kr)
REQUIRE_INBUF(2)
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
if (c == EUCKR_JAMO_FIRSTBYTE &&
IN2 == EUCKR_JAMO_FILLER) {
/* KS X 1001:1998 Annex 3 make-up sequence */
DBCHAR cho, jung, jong;
REQUIRE_INBUF(8)
if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
return 8;
c = (*inbuf)[3];
if (0xa1 <= c && c <= 0xbe)
cho = cgk2u_choseong[c - 0xa1];
else
cho = NONE;
c = (*inbuf)[5];
jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
c = (*inbuf)[7];
if (c == EUCKR_JAMO_FILLER)
jong = 0;
else if (0xa1 <= c && c <= 0xbe)
jong = cgk2u_jongseong[c - 0xa1];
else
jong = NONE;
if (cho == NONE || jung == NONE || jong == NONE)
return 8;
OUT1(0xac00 + cho*588 + jung*28 + jong);
NEXT(8, 1)
}
else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1)
} else return 2;
}
else
return 2;
}
return 0;
}
#undef NONE
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment