Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
2390104d
Commit
2390104d
authored
Aug 20, 2007
by
Hye-Shik Chang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add cheot-ga-keut composed make-up sequence support in EUC-KR codec.
parent
c553f429
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
146 additions
and
11 deletions
+146
-11
Lib/test/cjkencodings_test.py
Lib/test/cjkencodings_test.py
+17
-4
Lib/test/test_codecencodings_kr.py
Lib/test/test_codecencodings_kr.py
+18
-0
Lib/test/test_codecmaps_kr.py
Lib/test/test_codecmaps_kr.py
+4
-0
Misc/NEWS
Misc/NEWS
+3
-0
Modules/cjkcodecs/_codecs_kr.c
Modules/cjkcodecs/_codecs_kr.c
+104
-7
No files found.
Lib/test/cjkencodings_test.py
View file @
2390104d
...
...
@@ -376,13 +376,20 @@ teststring = {
"
\
xcc
\
xc7
\
xce
\
x2c
\
x20
\
xb1
\
xd7
\
xb8
\
xae
\
xb0
\
xed
\
x20
\
xc0
\
xce
\
xc5
\
xcd
"
"
\
xc7
\
xc1
\
xb8
\
xae
\
xc6
\
xc3
\
x0a
\
xc8
\
xaf
\
xb0
\
xe6
\
xc0
\
xba
\
x20
\
xc6
\
xc4
"
"
\
xc0
\
xcc
\
xbd
\
xe3
\
xc0
\
xbb
\
x20
\
xbd
\
xba
\
xc5
\
xa9
\
xb8
\
xb3
\
xc6
\
xc3
\
xb0
"
"
\
xfa
\
x20
\
xbf
\
xa9
\
xb7
\
x
c1
\
x20
\
xba
\
xd0
\
xbe
\
xdf
\
xbf
\
xa1
\
xbc
\
xad
\
xbf
"
"
\
xfa
\
x20
\
xbf
\
xa9
\
xb7
\
x
af
\
x20
\
xba
\
xd0
\
xbe
\
xdf
\
xbf
\
xa1
\
xbc
\
xad
\
xbf
"
"
\
xcd
\
x20
\
xb4
\
xeb
\
xba
\
xce
\
xba
\
xd0
\
xc0
\
xc7
\
x20
\
xc7
\
xc3
\
xb7
\
xa7
\
xc6
"
"
\
xfb
\
xbf
\
xa1
\
xbc
\
xad
\
xc0
\
xc7
\
x20
\
xba
\
xfc
\
xb8
\
xa5
\
x0a
\
xbe
\
xd6
\
xc7
"
"
\
xc3
\
xb8
\
xae
\
xc4
\
xc9
\
xc0
\
xcc
\
xbc
\
xc7
\
x20
\
xb0
\
xb3
\
xb9
\
xdf
\
xc0
\
xbb
"
"
\
x20
\
xc7
\
xd2
\
x20
\
xbc
\
xf6
\
x20
\
xc0
\
xd6
\
xb4
\
xc2
\
x20
\
xc0
\
xcc
\
xbb
\
xf3
"
"
\
xc0
\
xfb
\
xc0
\
xce
\
x20
\
xbe
\
xf0
\
xbe
\
xee
\
xb7
\
xce
\
x20
\
xb8
\
xb8
\
xb5
\
xe9
"
"
\
xbe
\
xee
\
xc1
\
xdd
\
xb4
\
xcf
\
xb4
\
xd9
\
x2e
\
x0a
\
x0a
"
,
"
\
xbe
\
xee
\
xc1
\
xdd
\
xb4
\
xcf
\
xb4
\
xd9
\
x2e
\
x0a
\
x0a
\
xa1
\
xd9
\
xc3
\
xb9
\
xb0
"
"
\
xa1
\
xb3
\
xa1
\
x3a
\
x20
\
xb3
\
xaf
\
xbe
\
xc6
\
xb6
\
xf3
\
x20
\
xa4
\
xd4
\
xa4
\
xb6
"
"
\
xa4
\
xd0
\
xa4
\
xd4
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa4
\
xd4
\
xbe
\
xb1
\
x7e
\
x20
"
"
\
xa4
\
xd4
\
xa4
\
xa4
\
xa4
\
xd2
\
xa4
\
xb7
\
xc5
\
xad
\
x21
\
x20
\
xa4
\
xd4
\
xa4
\
xa8
"
"
\
xa4
\
xd1
\
xa4
\
xb7
\
xb1
\
xdd
\
xbe
\
xf8
\
xc0
\
xcc
\
x20
\
xc0
\
xfc
\
xa4
\
xd4
\
xa4
"
"
\
xbe
\
xa4
\
xc8
\
xa4
\
xb2
\
xb4
\
xcf
\
xb4
\
xd9
\
x2e
\
x20
\
xa4
\
xd4
\
xa4
\
xb2
\
xa4
"
"
\
xce
\
xa4
\
xaa
\
x2e
\
x20
\
xb1
\
xd7
\
xb7
\
xb1
\
xb0
\
xc5
\
x20
\
xa4
\
xd4
\
xa4
\
xb7
"
"
\
xa4
\
xd1
\
xa4
\
xb4
\
xb4
\
xd9
\
x2e
\
x0a
"
,
"
\
xe2
\
x97
\
x8e
\
x20
\
xed
\
x8c
\
x8c
\
xec
\
x9d
\
xb4
\
xec
\
x8d
\
xac
\
x28
\
x50
\
x79
"
"
\
x74
\
x68
\
x6f
\
x6e
\
x29
\
xec
\
x9d
\
x80
\
x20
\
xeb
\
xb0
\
xb0
\
xec
\
x9a
\
xb0
\
xea
"
"
\
xb8
\
xb0
\
x20
\
xec
\
x89
\
xbd
\
xea
\
xb3
\
xa0
\
x2c
\
x20
\
xea
\
xb0
\
x95
\
xeb
\
xa0
"
...
...
@@ -404,7 +411,7 @@ teststring = {
"
\
xec
\
x9d
\
xb8
\
xed
\
x84
\
xb0
\
xed
\
x94
\
x84
\
xeb
\
xa6
\
xac
\
xed
\
x8c
\
x85
\
x0a
"
"
\
xed
\
x99
\
x98
\
xea
\
xb2
\
xbd
\
xec
\
x9d
\
x80
\
x20
\
xed
\
x8c
\
x8c
\
xec
\
x9d
\
xb4
"
"
\
xec
\
x8d
\
xac
\
xec
\
x9d
\
x84
\
x20
\
xec
\
x8a
\
xa4
\
xed
\
x81
\
xac
\
xeb
\
xa6
\
xbd
"
"
\
xed
\
x8c
\
x85
\
xea
\
xb3
\
xbc
\
x20
\
xec
\
x97
\
xac
\
xeb
\
x
a0
\
xa4
\
x20
\
xeb
\
xb6
"
"
\
xed
\
x8c
\
x85
\
xea
\
xb3
\
xbc
\
x20
\
xec
\
x97
\
xac
\
xeb
\
x
9f
\
xac
\
x20
\
xeb
\
xb6
"
"
\
x84
\
xec
\
x95
\
xbc
\
xec
\
x97
\
x90
\
xec
\
x84
\
x9c
\
xec
\
x99
\
x80
\
x20
\
xeb
\
x8c
"
"
\
x80
\
xeb
\
xb6
\
x80
\
xeb
\
xb6
\
x84
\
xec
\
x9d
\
x98
\
x20
\
xed
\
x94
\
x8c
\
xeb
\
x9e
"
"
\
xab
\
xed
\
x8f
\
xbc
\
xec
\
x97
\
x90
\
xec
\
x84
\
x9c
\
xec
\
x9d
\
x98
\
x20
\
xeb
\
xb9
"
...
...
@@ -413,7 +420,13 @@ teststring = {
"
\
x84
\
x20
\
xed
\
x95
\
xa0
\
x20
\
xec
\
x88
\
x98
\
x20
\
xec
\
x9e
\
x88
\
xeb
\
x8a
\
x94
"
"
\
x20
\
xec
\
x9d
\
xb4
\
xec
\
x83
\
x81
\
xec
\
xa0
\
x81
\
xec
\
x9d
\
xb8
\
x20
\
xec
\
x96
"
"
\
xb8
\
xec
\
x96
\
xb4
\
xeb
\
xa1
\
x9c
\
x20
\
xeb
\
xa7
\
x8c
\
xeb
\
x93
\
xa4
\
xec
\
x96
"
"
\
xb4
\
xec
\
xa4
\
x8d
\
xeb
\
x8b
\
x88
\
xeb
\
x8b
\
xa4
\
x2e
\
x0a
\
x0a
"
),
"
\
xb4
\
xec
\
xa4
\
x8d
\
xeb
\
x8b
\
x88
\
xeb
\
x8b
\
xa4
\
x2e
\
x0a
\
x0a
\
xe2
\
x98
\
x86
"
"
\
xec
\
xb2
\
xab
\
xea
\
xb0
\
x80
\
xeb
\
x81
\
x9d
\
x3a
\
x20
\
xeb
\
x82
\
xa0
\
xec
\
x95
"
"
\
x84
\
xeb
\
x9d
\
xbc
\
x20
\
xec
\
x93
\
x94
\
xec
\
x93
\
x94
\
xec
\
x93
\
xa9
\
x7e
\
x20
"
"
\
xeb
\
x8b
\
x81
\
xed
\
x81
\
xbc
\
x21
\
x20
\
xeb
\
x9c
\
xbd
\
xea
\
xb8
\
x88
\
xec
\
x97
"
"
\
x86
\
xec
\
x9d
\
xb4
\
x20
\
xec
\
xa0
\
x84
\
xed
\
x99
\
xa5
\
xeb
\
x8b
\
x88
\
xeb
\
x8b
"
"
\
xa4
\
x2e
\
x20
\
xeb
\
xb7
\
x81
\
x2e
\
x20
\
xea
\
xb7
\
xb8
\
xeb
\
x9f
\
xb0
\
xea
\
xb1
"
"
\
xb0
\
x20
\
xec
\
x9d
\
x8e
\
xeb
\
x8b
\
xa4
\
x2e
\
x0a
"
),
'gb18030'
:
(
"
\
x50
\
x79
\
x74
\
x68
\
x6f
\
x6e
\
xa3
\
xa8
\
xc5
\
xc9
\
xc9
\
xad
\
xa3
\
xa9
\
xd3
\
xef
"
"
\
xd1
\
xd4
\
xca
\
xc7
\
xd2
\
xbb
\
xd6
\
xd6
\
xb9
\
xa6
\
xc4
\
xdc
\
xc7
\
xbf
\
xb4
\
xf3
"
...
...
Lib/test/test_codecencodings_kr.py
View file @
2390104d
...
...
@@ -30,6 +30,24 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
(
"abc
\
x80
\
x80
\
xc1
\
xc4
"
,
"replace"
,
u"abc
\
ufffd
\
uc894
"
),
(
"abc
\
x80
\
x80
\
xc1
\
xc4
\
xc8
"
,
"replace"
,
u"abc
\
ufffd
\
uc894
\
ufffd
"
),
(
"abc
\
x80
\
x80
\
xc1
\
xc4
"
,
"ignore"
,
u"abc
\
uc894
"
),
# composed make-up sequence errors
(
"
\
xa4
\
xd4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa4
\
xd4
"
,
"strict"
,
u"
\
uc4d4
"
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa4
\
xd4
x"
,
"strict"
,
u"
\
uc4d4
x"
),
(
"a
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
"
,
"replace"
,
u"a
\
ufffd
"
),
(
"
\
xa4
\
xd4
\
xa3
\
xb6
\
xa4
\
xd0
\
xa4
\
xd4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa3
\
xd0
\
xa4
\
xd4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa3
\
xd4
"
,
"strict"
,
None
),
(
"
\
xa4
\
xd4
\
xa4
\
xff
\
xa4
\
xd0
\
xa4
\
xd4
"
,
"replace"
,
u"
\
ufffd
"
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xff
\
xa4
\
xd4
"
,
"replace"
,
u"
\
ufffd
"
),
(
"
\
xa4
\
xd4
\
xa4
\
xb6
\
xa4
\
xd0
\
xa4
\
xff
"
,
"replace"
,
u"
\
ufffd
"
),
(
"
\
xc1
\
xc4
"
,
"strict"
,
u"
\
uc894
"
),
)
class
Test_JOHAB
(
test_multibytecodec_support
.
TestBase
,
unittest
.
TestCase
):
...
...
Lib/test/test_codecmaps_kr.py
View file @
2390104d
...
...
@@ -20,6 +20,10 @@ class TestEUCKRMap(test_multibytecodec_support.TestBase_Mapping,
encoding
=
'euc_kr'
mapfileurl
=
'http://people.freebsd.org/~perky/i18n/EUC-KR.TXT'
# A4D4 HANGUL FILLER indicates the beginning of an 8-byte make-up sequence.
pass_enctest
=
[(
'
\
xa4
\
xd4
'
,
u'
\
u3164
'
)]
pass_dectest
=
[(
'
\
xa4
\
xd4
'
,
u'
\
u3164
'
)]
class
TestJOHABMap
(
test_multibytecodec_support
.
TestBase_Mapping
,
unittest
.
TestCase
):
...
...
Misc/NEWS
View file @
2390104d
...
...
@@ -240,6 +240,9 @@ Core and builtins
Library
-------
- EUC-KR codec now handles the cheot-ga-keut composed make-up Hangul
syllables (the 8-byte sequences of KS X 1001:1998 Annex 3).
- GB18030 codec can now encode additional two-byte characters that
are missing in GBK.
...
...
Modules/cjkcodecs/_codecs_kr.c
View file @
2390104d
...
...
@@ -11,6 +11,26 @@
* EUC-KR codec
*/
#define EUCKR_JAMO_FIRSTBYTE 0xA4
#define EUCKR_JAMO_FILLER 0xD4
static
const
unsigned
char
u2cgk_choseong
[
19
]
=
{
0xa1
,
0xa2
,
0xa4
,
0xa7
,
0xa8
,
0xa9
,
0xb1
,
0xb2
,
0xb3
,
0xb5
,
0xb6
,
0xb7
,
0xb8
,
0xb9
,
0xba
,
0xbb
,
0xbc
,
0xbd
,
0xbe
};
static
const
unsigned
char
u2cgk_jungseong
[
21
]
=
{
0xbf
,
0xc0
,
0xc1
,
0xc2
,
0xc3
,
0xc4
,
0xc5
,
0xc6
,
0xc7
,
0xc8
,
0xc9
,
0xca
,
0xcb
,
0xcc
,
0xcd
,
0xce
,
0xcf
,
0xd0
,
0xd1
,
0xd2
,
0xd3
};
static
const
unsigned
char
u2cgk_jongseong
[
28
]
=
{
0xd4
,
0xa1
,
0xa2
,
0xa3
,
0xa4
,
0xa5
,
0xa6
,
0xa7
,
0xa9
,
0xaa
,
0xab
,
0xac
,
0xad
,
0xae
,
0xaf
,
0xb0
,
0xb1
,
0xb2
,
0xb4
,
0xb5
,
0xb6
,
0xb7
,
0xb8
,
0xba
,
0xbb
,
0xbc
,
0xbd
,
0xbe
};
ENCODER
(
euc_kr
)
{
while
(
inleft
>
0
)
{
...
...
@@ -28,17 +48,57 @@ ENCODER(euc_kr)
TRYMAP_ENC
(
cp949
,
code
,
c
);
else
return
1
;
if
(
code
&
0x8000
)
/* MSB set: CP949 */
return
1
;
if
((
code
&
0x8000
)
==
0
)
{
/* KS X 1001 coded character */
OUT1
((
code
>>
8
)
|
0x80
)
OUT2
((
code
&
0xFF
)
|
0x80
)
NEXT
(
1
,
2
)
}
else
{
/* The mapping is found in the CP949 extension,
* but we encode it as a KS X 1001:1998 Annex 3
* make-up sequence for EUC-KR. */
REQUIRE_OUTBUF
(
8
)
/* syllable composition precedence */
OUT1
(
EUCKR_JAMO_FIRSTBYTE
)
OUT2
(
EUCKR_JAMO_FILLER
)
/* All code points in the CP949 extension are in the Unicode
* Hangul Syllables area (U+AC00..U+D7A3). */
assert
(
0xac00
<=
c
&&
c
<=
0xd7a3
);
c
-=
0xac00
;
OUT3
(
EUCKR_JAMO_FIRSTBYTE
)
OUT4
(
u2cgk_choseong
[
c
/
588
])
NEXT_OUT
(
4
)
OUT1
(
EUCKR_JAMO_FIRSTBYTE
)
OUT2
(
u2cgk_jungseong
[(
c
/
28
)
%
21
])
OUT3
(
EUCKR_JAMO_FIRSTBYTE
)
OUT4
(
u2cgk_jongseong
[
c
%
28
])
NEXT
(
1
,
4
)
}
}
return
0
;
}
#define NONE 127
static
const
unsigned
char
cgk2u_choseong
[]
=
{
/* [A1, BE] */
0
,
1
,
NONE
,
2
,
NONE
,
NONE
,
3
,
4
,
5
,
NONE
,
NONE
,
NONE
,
NONE
,
NONE
,
NONE
,
NONE
,
6
,
7
,
8
,
NONE
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
};
static
const
unsigned
char
cgk2u_jongseong
[]
=
{
/* [A1, BE] */
1
,
2
,
3
,
4
,
5
,
6
,
7
,
NONE
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
NONE
,
18
,
19
,
20
,
21
,
22
,
NONE
,
23
,
24
,
25
,
26
,
27
};
DECODER
(
euc_kr
)
{
while
(
inleft
>
0
)
{
...
...
@@ -54,13 +114,50 @@ DECODER(euc_kr)
REQUIRE_INBUF
(
2
)
TRYMAP_DEC
(
ksx1001
,
**
outbuf
,
c
^
0x80
,
IN2
^
0x80
)
{
if
(
c
==
EUCKR_JAMO_FIRSTBYTE
&&
IN2
==
EUCKR_JAMO_FILLER
)
{
/* KS X 1001:1998 Annex 3 make-up sequence */
DBCHAR
cho
,
jung
,
jong
;
REQUIRE_INBUF
(
8
)
if
((
*
inbuf
)[
2
]
!=
EUCKR_JAMO_FIRSTBYTE
||
(
*
inbuf
)[
4
]
!=
EUCKR_JAMO_FIRSTBYTE
||
(
*
inbuf
)[
6
]
!=
EUCKR_JAMO_FIRSTBYTE
)
return
8
;
c
=
(
*
inbuf
)[
3
];
if
(
0xa1
<=
c
&&
c
<=
0xbe
)
cho
=
cgk2u_choseong
[
c
-
0xa1
];
else
cho
=
NONE
;
c
=
(
*
inbuf
)[
5
];
jung
=
(
0xbf
<=
c
&&
c
<=
0xd3
)
?
c
-
0xbf
:
NONE
;
c
=
(
*
inbuf
)[
7
];
if
(
c
==
EUCKR_JAMO_FILLER
)
jong
=
0
;
else
if
(
0xa1
<=
c
&&
c
<=
0xbe
)
jong
=
cgk2u_jongseong
[
c
-
0xa1
];
else
jong
=
NONE
;
if
(
cho
==
NONE
||
jung
==
NONE
||
jong
==
NONE
)
return
8
;
OUT1
(
0xac00
+
cho
*
588
+
jung
*
28
+
jong
);
NEXT
(
8
,
1
)
}
else
TRYMAP_DEC
(
ksx1001
,
**
outbuf
,
c
^
0x80
,
IN2
^
0x80
)
{
NEXT
(
2
,
1
)
}
else
return
2
;
}
else
return
2
;
}
return
0
;
}
#undef NONE
/*
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment