Commit 3e2a3069 authored by Hye-Shik Chang's avatar Hye-Shik Chang

Add CJK codecs support as discussed on python-dev. (SF #873597)

Several style fixes are suggested by Martin v. Loewis and
Marc-Andre Lemburg. Thanks!
parent cd1f7430
......@@ -212,15 +212,6 @@ The others represent the BOM in UTF-8 and UTF-32 encodings.
\end{datadesc}
\begin{seealso}
\seeurl{http://sourceforge.net/projects/python-codecs/}{A
SourceForge project working on additional support for Asian
codecs for use with Python. They are in the early stages of
development at the time of this writing --- look in their
FTP area for downloadable files.}
\end{seealso}
\subsection{Codec Base Classes}
The \module{codecs} defines a set of base classes which define the
......@@ -553,6 +544,10 @@ exist:
{646, us-ascii}
{English}
\lineiii{big5}
{big5_tw, csbig5}
{Traditional Chinese}
\lineiii{cp037}
{IBM037, IBM039}
{English}
......@@ -633,6 +628,18 @@ exist:
{}
{Greek}
\lineiii{cp932}
{932, ms932, mskanji, ms_kanji}
{Japanese}
\lineiii{cp949}
{949, ms949, uhc}
{Korean}
\lineiii{cp950}
{950, ms950}
{Traditional Chinese}
\lineiii{cp1006}
{}
{Urdu}
......@@ -681,6 +688,59 @@ exist:
{windows-1258}
{Vietnamese}
\lineiii{euc_jp}
{eucjp, ujis, u_jis}
{Japanese}
\lineiii{euc_jisx0213}
{jisx0213, eucjisx0213}
{Japanese}
\lineiii{euc_kr}
{euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001}
{Korean}
\lineiii{gb2312}
{chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980,
gb2312_80, iso_ir_58}
{Simplified Chinese}
\lineiii{gbk}
{936, cp936, ms936}
{Unified Chinese}
\lineiii{gb18030}
{gb18030_2000}
{Unified Chinese}
\lineiii{hz}
{hzgb, hz_gb, hz_gb_2312}
{Simplified Chinese}
\lineiii{iso2022_jp}
{csiso2022jp, iso2022jp, iso_2022_jp}
{Japanese}
\lineiii{iso2022_jp_1}
{iso2022jp_1, iso_2022_jp_1}
{Japanese}
\lineiii{iso2022_jp_2}
{iso2022jp_2, iso_2022_jp_2}
{Japanese, Korean, Simplified Chinese, Western Europe, Greek}
\lineiii{iso2022_jp_3}
{iso2022jp_3, iso_2022_jp_3}
{Japanese}
\lineiii{iso2022_jp_ext}
{iso2022jp_ext, iso_2022_jp_ext}
{Japanese}
\lineiii{iso2022_kr}
{csiso2022kr, iso2022kr, iso_2022_kr}
{Korean}
\lineiii{latin_1}
{iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1}
{West Europe}
......@@ -733,6 +793,10 @@ exist:
{iso-8859-15}
{Western Europe}
\lineiii{johab}
{cp1361, ms1361}
{Korean}
\lineiii{koi8_r}
{}
{Russian}
......@@ -765,6 +829,14 @@ exist:
{macturkish}
{Turkish}
\lineiii{shift_jis}
{csshiftjis, shiftjis, sjis, s_jis}
{Japanese}
\lineiii{shift_jisx0213}
{shiftjisx0213, sjisx0213, s_jisx0213}
{Japanese}
\lineiii{utf_16}
{U16, utf16}
{all languages}
......
......@@ -8,13 +8,6 @@ from email.test.test_email import TestEmailBase
from email.Charset import Charset
from email.Header import Header, decode_header
# See if we have the Japanese codecs package installed
try:
unicode('foo', 'japanese.iso-2022-jp')
except LookupError:
raise TestSkipped, 'Optional Japanese codecs not installed'
class TestEmailAsianCodecs(TestEmailBase):
def test_japanese_codecs(self):
......
......@@ -14,12 +14,6 @@
codecs. In addition to these, a few Python specific codec
aliases have also been added.
About the CJK codec aliases:
The codecs for these encodings are not distributed with the
Python core, but are included here for reference, since the
locale module relies on having these aliases available.
"""
aliases = {
......@@ -41,6 +35,10 @@ aliases = {
'base64' : 'base64_codec',
'base_64' : 'base64_codec',
# big5 codec
'big5_tw' : 'big5',
'csbig5' : 'big5',
# bz2_codec codec
'bz2' : 'bz2_codec',
......@@ -168,9 +166,91 @@ aliases = {
'csibm869' : 'cp869',
'ibm869' : 'cp869',
# cp932 codec
'932' : 'cp932',
'ms932' : 'cp932',
'mskanji' : 'cp932',
'ms_kanji' : 'cp932',
# cp949 codec
'949' : 'cp949',
'ms949' : 'cp949',
'uhc' : 'cp949',
# cp950 codec
'950' : 'cp950',
'ms950' : 'cp950',
# euc_jisx0213 codec
'jisx0213' : 'euc_jisx0213',
'eucjisx0213' : 'euc_jisx0213',
# euc_jp codec
'eucjp' : 'euc_jp',
'ujis' : 'euc_jp',
'u_jis' : 'euc_jp',
# euc_kr codec
'euckr' : 'euc_kr',
'korean' : 'euc_kr',
'ksc5601' : 'euc_kr',
'ks_c_5601' : 'euc_kr',
'ks_c_5601_1987' : 'euc_kr',
'ksx1001' : 'euc_kr',
'ks_x_1001' : 'euc_kr',
# gb18030 codec
'gb18030_2000' : 'gb18030',
# gb2312 codec
'chinese' : 'gb2312',
'csiso58gb231280' : 'gb2312',
'euc_cn' : 'gb2312',
'euccn' : 'gb2312',
'eucgb2312_cn' : 'gb2312',
'gb2312_1980' : 'gb2312',
'gb2312_80' : 'gb2312',
'iso_ir_58' : 'gb2312',
# gbk codec
'936' : 'gbk',
'cp936' : 'gbk',
'ms936' : 'gbk',
# hex_codec codec
'hex' : 'hex_codec',
# hz codec
'hzgb' : 'hz',
'hz_gb' : 'hz',
'hz_gb_2312' : 'hz',
# iso2022_jp codec
'csiso2022jp' : 'iso2022_jp',
'iso2022jp' : 'iso2022_jp',
'iso_2022_jp' : 'iso2022_jp',
# iso2022_jp_1 codec
'iso2022jp_1' : 'iso2022_jp_1',
'iso_2022_jp_1' : 'iso2022_jp_1',
# iso2022_jp_2 codec
'iso2022jp_2' : 'iso2022_jp_2',
'iso_2022_jp_2' : 'iso2022_jp_2',
# iso2022_jp_3 codec
'iso2022jp_3' : 'iso2022_jp_3',
'iso_2022_jp_3' : 'iso2022_jp_3',
# iso2022_jp_ext codec
'iso2022jp_ext' : 'iso2022_jp_ext',
'iso_2022_jp_ext' : 'iso2022_jp_ext',
# iso2022_kr codec
'csiso2022kr' : 'iso2022_kr',
'iso2022kr' : 'iso2022_kr',
'iso_2022_kr' : 'iso2022_kr',
# iso8859_10 codec
'csisolatin6' : 'iso8859_10',
'iso_8859_10' : 'iso8859_10',
......@@ -258,9 +338,9 @@ aliases = {
'l5' : 'iso8859_9',
'latin5' : 'iso8859_9',
# jis_7 codec
'csiso2022jp' : 'jis_7',
'iso_2022_jp' : 'jis_7',
# johab codec
'cp1361' : 'johab',
'ms1361' : 'johab',
# koi8_r codec
'cskoi8r' : 'koi8_r',
......@@ -308,6 +388,17 @@ aliases = {
# rot_13 codec
'rot13' : 'rot_13',
# shift_jis codec
'csshiftjis' : 'shift_jis',
'shiftjis' : 'shift_jis',
'sjis' : 'shift_jis',
's_jis' : 'shift_jis',
# shift_jisx0213 codec
'shiftjisx0213' : 'shift_jisx0213',
'sjisx0213' : 'shift_jisx0213',
's_jisx0213' : 'shift_jisx0213',
# tactis codec
'tis260' : 'tactis',
......
#
# big5.py: Python Unicode Codec for BIG5
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: big5.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_big5 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless BIG5 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """BIG5 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """BIG5 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# cp932.py: Python Unicode Codec for CP932
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: cp932.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_cp932 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless CP932 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """CP932 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """CP932 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# cp949.py: Python Unicode Codec for CP949
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: cp949.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_cp949 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless CP949 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """CP949 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """CP949 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# cp950.py: Python Unicode Codec for CP950
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: cp950.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_cp950 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless CP950 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """CP950 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """CP950 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# euc_jisx0213.py: Python Unicode Codec for EUC_JISX0213
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: euc_jisx0213.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_euc_jisx0213 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless EUC_JISX0213 codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """EUC_JISX0213 stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """EUC_JISX0213 stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# euc_jp.py: Python Unicode Codec for EUC_JP
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: euc_jp.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_euc_jp import codec
import codecs
class Codec(codecs.Codec):
    """Stateless EUC_JP codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """EUC_JP stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """EUC_JP stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# euc_kr.py: Python Unicode Codec for EUC_KR
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: euc_kr.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_euc_kr import codec
import codecs
class Codec(codecs.Codec):
    """Stateless EUC_KR codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """EUC_KR stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """EUC_KR stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# gb18030.py: Python Unicode Codec for GB18030
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: gb18030.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_gb18030 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless GB18030 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """GB18030 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """GB18030 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# gb2312.py: Python Unicode Codec for GB2312
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: gb2312.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_gb2312 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless GB2312 codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """GB2312 stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """GB2312 stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# gbk.py: Python Unicode Codec for GBK
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: gbk.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_gbk import codec
import codecs
class Codec(codecs.Codec):
    """Stateless GBK codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """GBK stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """GBK stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# hz.py: Python Unicode Codec for HZ
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: hz.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_hz import codec
import codecs
class Codec(codecs.Codec):
    """Stateless HZ codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """HZ stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """HZ stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_jp.py: Python Unicode Codec for ISO_2022_JP
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_jp.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_jp import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_JP codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_JP stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_JP stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_jp_1.py: Python Unicode Codec for ISO_2022_JP_1
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_jp_1.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_jp_1 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_JP_1 codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_JP_1 stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_JP_1 stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_jp_2.py: Python Unicode Codec for ISO_2022_JP_2
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_jp_2.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_jp_2 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_JP_2 codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_JP_2 stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_JP_2 stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_jp_3.py: Python Unicode Codec for ISO_2022_JP_3
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_jp_3.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_jp_3 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_JP_3 codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_JP_3 stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_JP_3 stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_jp_ext.py: Python Unicode Codec for ISO_2022_JP_EXT
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_jp_ext.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_jp_ext import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_JP_EXT codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_JP_EXT stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_JP_EXT stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# iso2022_kr.py: Python Unicode Codec for ISO_2022_KR
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: iso2022_kr.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_iso2022_kr import codec
import codecs
class Codec(codecs.Codec):
    """Stateless ISO_2022_KR codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """ISO_2022_KR stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """ISO_2022_KR stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# johab.py: Python Unicode Codec for JOHAB
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: johab.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_johab import codec
import codecs
class Codec(codecs.Codec):
    """Stateless JOHAB codec built from the module-level ``codec`` object."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """JOHAB stream reader that hands reading off to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """JOHAB stream writer that hands writing off to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# shift_jis.py: Python Unicode Codec for SHIFT_JIS
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: shift_jis.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_shift_jis import codec
import codecs
class Codec(codecs.Codec):
    """Stateless SHIFT_JIS codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """SHIFT_JIS stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """SHIFT_JIS stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
#
# shift_jisx0213.py: Python Unicode Codec for SHIFT_JISX0213
#
# Written by Hye-Shik Chang <perky@FreeBSD.org>
# $CJKCodecs: shift_jisx0213.py,v 1.3 2004/01/17 11:26:10 perky Exp $
#
from _codecs_shift_jisx0213 import codec
import codecs
class Codec(codecs.Codec):
    """Stateless SHIFT_JISX0213 codec built from the module-level ``codec``."""
    decode = codec.decode
    encode = codec.encode


class StreamReader(Codec, codecs.StreamReader):
    """SHIFT_JISX0213 stream reader delegating to ``codec.StreamReader``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamReader.__init__(self, stream, errors)
        backend = codec.StreamReader(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('read', 'readline', 'readlines', 'reset'):
            setattr(self, method, getattr(backend, method))


class StreamWriter(Codec, codecs.StreamWriter):
    """SHIFT_JISX0213 stream writer delegating to ``codec.StreamWriter``."""

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        backend = codec.StreamWriter(stream, errors)
        # Shadow the inherited methods on this instance with the backend's.
        for method in ('write', 'writelines', 'reset'):
            setattr(self, method, getattr(backend, method))


def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple."""
    stateless = Codec()
    return (stateless.encode, stateless.decode, StreamReader, StreamWriter)
This diff is collapsed.
......@@ -549,6 +549,10 @@ def printlist(x, width=70, indent=4):
# test_timeout
# Controlled by test_timeout.skip_expected. Requires the network
# resource and a socket module.
# test_codecmaps_*
# Whether a skip is expected here depends on whether a large test
# input file has been downloaded. test_codecmaps_*.skip_expected
# controls that.
_expectations = {
'win32':
......@@ -565,7 +569,6 @@ _expectations = {
test_curses
test_dbm
test_dl
test_email_codecs
test_fcntl
test_fork1
test_gdbm
......@@ -598,7 +601,6 @@ _expectations = {
test_cl
test_curses
test_dl
test_email_codecs
test_gl
test_imgfile
test_largefile
......@@ -623,7 +625,6 @@ _expectations = {
test_curses
test_dbm
test_dl
test_email_codecs
test_fcntl
test_fork1
test_gl
......@@ -778,7 +779,6 @@ _expectations = {
test_cl
test_curses
test_dl
test_email_codecs
test_gdbm
test_gl
test_imgfile
......@@ -803,7 +803,6 @@ _expectations = {
test_cl
test_curses
test_dbm
test_email_codecs
test_gdbm
test_gl
test_gzip
......@@ -850,7 +849,6 @@ _expectations = {
test_cl
test_curses
test_dl
test_email_codecs
test_gdbm
test_gl
test_imgfile
......@@ -876,7 +874,6 @@ _expectations = {
test_cl
test_curses
test_dbm
test_email_codecs
test_gl
test_imgfile
test_ioctl
......@@ -901,7 +898,6 @@ _expectations = {
test_commands
test_curses
test_dl
test_email_codecs
test_gl
test_imgfile
test_largefile
......@@ -925,7 +921,6 @@ _expectations = {
test_bsddb3
test_cd
test_cl
test_email_codecs
test_gl
test_imgfile
test_linuxaudiodev
......@@ -955,6 +950,8 @@ class _ExpectedSkips:
from test import test_normalization
from test import test_socket_ssl
from test import test_timeout
from test import test_codecmaps_cn, test_codecmaps_jp
from test import test_codecmaps_kr, test_codecmaps_tw
self.valid = False
if sys.platform in _expectations:
......@@ -973,6 +970,10 @@ class _ExpectedSkips:
if test_timeout.skip_expected:
self.expected.add('test_timeout')
for cc in ('cn', 'jp', 'kr', 'tw'):
if eval('test_codecmaps_' + cc).skip_expected:
self.expected.add('test_codecmaps_' + cc)
if not sys.platform in ("mac", "darwin"):
MAC_ONLY = ["test_macostools", "test_macfs", "test_aepack",
"test_plistlib", "test_scriptpackages"]
......
#!/usr/bin/env python
#
# test_codecencodings_cn.py
# Codec encoding tests for PRC encodings.
#
# $CJKCodecs: test_codecencodings_cn.py,v 1.1 2003/12/19 03:00:05 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the 'gb2312' codec.

    Defines no test methods of its own; it only sets the data attributes
    below on top of test_multibytecodec_support.TestBase.
    """
    encoding = 'gb2312'
    tstring = test_multibytecodec_support.load_teststring('gb2312')
    # NOTE(review): entries look like (input, error handler, expected
    # result), with None where the conversion is expected to fail --
    # confirm against TestBase.
    codectests = (
        # invalid bytes
        ("abc\x81\x81\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x81\x81\xc1\xc4", "replace", u"abc\ufffd\u804a"),
        ("abc\x81\x81\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
        ("abc\x81\x81\xc1\xc4", "ignore", u"abc\u804a"),
        ("\xc1\x64", "strict", None),
    )
class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the 'gbk' codec.

    Defines no test methods of its own; it only sets the data attributes
    below on top of test_multibytecodec_support.TestBase.
    """
    encoding = 'gbk'
    tstring = test_multibytecodec_support.load_teststring('gbk')
    # NOTE(review): entries look like (input, error handler, expected
    # result), with None where the conversion is expected to fail --
    # confirm against TestBase.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\u804a"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
        ("\x83\x34\x83\x31", "strict", None),
    )
class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the 'gb18030' codec.

    Defines no test methods of its own; it only sets the data attributes
    below on top of test_multibytecodec_support.TestBase.
    """
    encoding = 'gb18030'
    tstring = test_multibytecodec_support.load_teststring('gb18030')
    # NOTE(review): entries look like (input, error handler, expected
    # result), with None where the conversion is expected to fail --
    # confirm against TestBase.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\u804a"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
        ("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"),
    )
    # NOTE(review): indentation was lost in this copy; this flag is assumed
    # to be a class attribute read by TestBase -- confirm.
    has_iso10646 = True
def test_main():
    """Build one suite from the GB2312, GBK and GB18030 cases and run it."""
    suite = unittest.TestSuite()
    for case in (Test_GB2312, Test_GBK, Test_GB18030):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


# Allow running this test file directly as a script.
if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecencodings_jp.py
# Codec encoding tests for Japanese encodings.
#
# $CJKCodecs: test_codecencodings_jp.py,v 1.2 2004/01/06 09:25:37 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the 'cp932' codec.

    Defines no test methods of its own; it only sets the data attributes
    below on top of test_multibytecodec_support.TestBase. The sample text
    is loaded under the name 'shift_jis', not 'cp932'.
    """
    encoding = 'cp932'
    tstring = test_multibytecodec_support.load_teststring('shift_jis')
    # NOTE(review): entries look like (input, error handler, expected
    # result), with None where the conversion is expected to fail --
    # confirm against TestBase.
    codectests = (
        # invalid bytes
        ("abc\x81\x00\x81\x00\x82\x84", "strict", None),
        ("abc\xf8", "strict", None),
        ("abc\x81\x00\x82\x84", "replace", u"abc\ufffd\uff44"),
        ("abc\x81\x00\x82\x84\x88", "replace", u"abc\ufffd\uff44\ufffd"),
        ("abc\x81\x00\x82\x84", "ignore", u"abc\uff44"),
        # sjis vs cp932
        ("\\\x7e", "replace", u"\\\x7e"),
        ("\x81\x5f\x81\x61\x81\x7c", "replace", u"\uff3c\u2225\uff0d"),
    )
class Test_EUC_JISX0213(test_multibytecodec_support.TestBase,
                        unittest.TestCase):
    """Test data for the 'euc_jisx0213' codec.

    Defines no test methods of its own; it only sets the data attributes
    below on top of test_multibytecodec_support.TestBase.
    """
    encoding = 'euc_jisx0213'
    tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
    # NOTE(review): entries look like (input, error handler, expected
    # result), with None where the conversion is expected to fail --
    # confirm against TestBase.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\u7956"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u7956\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u7956"),
        ("abc\x8f\x83\x83", "replace", u"abc\ufffd"),
        ("\xc1\x64", "strict", None),
        ("\xa1\xc0", "strict", u"\uff3c"),
    )
    # NOTE(review): indentation was lost in this copy; this pair is assumed
    # to be a class attribute. It looks like (unicode text, expected encoded
    # form using XML character/entity references) -- confirm against TestBase.
    xmlcharnametest = (
        u"\xab\u211c\xbb = \u2329\u1234\u232a",
        "\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
    )
# Cases prepended to the codectests of both Test_EUC_JP_COMPAT and
# Test_EUC_JP_STRICT below.
# NOTE(review): entries look like (input, error handler, expected result),
# with None where the conversion is expected to fail -- confirm against
# TestBase.
eucjp_commontests = (
    ("abc\x80\x80\xc1\xc4", "strict", None),
    ("abc\xc8", "strict", None),
    ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\u7956"),
    ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u7956\ufffd"),
    ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u7956"),
    ("abc\x8f\x83\x83", "replace", u"abc\ufffd"),
    ("\xc1\x64", "strict", None),
)
class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
                         unittest.TestCase):
    """Test data for the 'euc_jp' codec.

    Defines no test methods of its own; it extends eucjp_commontests with
    the three cases listed here.
    """
    encoding = 'euc_jp'
    tstring = test_multibytecodec_support.load_teststring('euc_jp')
    # NOTE(review): the entries with a unicode first element presumably
    # exercise encoding rather than decoding -- confirm against TestBase.
    codectests = eucjp_commontests + (
        ("\xa1\xc0\\", "strict", u"\uff3c\\"),
        (u"\xa5", "strict", "\x5c"),
        (u"\u203e", "strict", "\x7e"),
    )
class Test_EUC_JP_STRICT(test_multibytecodec_support.TestBase,
                         unittest.TestCase):
    """Test data for the 'euc_jp_strict' codec.

    Defines no test methods of its own; it extends eucjp_commontests with
    the three cases listed here. Sample text is loaded under 'euc_jp'.
    """
    encoding = 'euc_jp_strict'
    tstring = test_multibytecodec_support.load_teststring('euc_jp')
    # NOTE(review): the entries with a unicode first element presumably
    # exercise encoding, with None meaning failure is expected -- confirm
    # against TestBase.
    codectests = eucjp_commontests + (
        ("\xa1\xc0\\", "strict", u"\\\\"),
        (u"\xa5", "strict", None),
        (u"\u203e", "strict", None),
    )
# Cases prepended to the codectests of both Test_SJIS_COMPAT and
# Test_SJIS_STRICT below.
# NOTE(review): entries look like (input, error handler, expected result),
# with None where the conversion is expected to fail -- confirm against
# TestBase.
shiftjis_commonenctests = (
    ("abc\x80\x80\x82\x84", "strict", None),
    ("abc\xf8", "strict", None),
    ("abc\x80\x80\x82\x84", "replace", u"abc\ufffd\uff44"),
    ("abc\x80\x80\x82\x84\x88", "replace", u"abc\ufffd\uff44\ufffd"),
    ("abc\x80\x80\x82\x84def", "ignore", u"abc\uff44def"),
)
class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase,
                       unittest.TestCase):
    # The default shift_jis codec: shared cases plus its ASCII-compatible
    # handling of 0x5c and 0x7e.
    encoding = 'shift_jis'
    tstring = test_multibytecodec_support.load_teststring('shift_jis')
    codectests = shiftjis_commonenctests + (
        # backslash and tilde bytes decode to themselves
        ("\\\x7e", "strict", u"\\\x7e"),
        # 0x815F decodes to U+FF3C
        ("\x81\x5f\x81\x61\x81\x7c", "strict", u"\uff3c\u2016\u2212"),
    )
class Test_SJIS_STRICT(test_multibytecodec_support.TestBase,
                       unittest.TestCase):
    # The shift_jis_strict codec: shared cases plus the points where it
    # differs from the compatibility codec.
    encoding = 'shift_jis_strict'
    tstring = test_multibytecodec_support.load_teststring('shift_jis')
    codectests = shiftjis_commonenctests + (
        # bytes 0x5c / 0x7e decode to YEN SIGN / OVERLINE
        ("\\\x7e", "replace", u"\xa5\u203e"),
        # 0x815F decodes to a plain backslash
        ("\x81\x5f\x81\x61\x81\x7c", "replace", u"\x5c\u2016\u2212"),
    )
class Test_SJISX0213(test_multibytecodec_support.TestBase,
                     unittest.TestCase):
    # Runs the shared TestBase checks against the shift_jisx0213 codec.
    encoding = 'shift_jisx0213'
    tstring = test_multibytecodec_support.load_teststring('shift_jisx0213')

    # Rows are (input, error scheme, expected result); an expected result
    # of None marks input that has to be rejected with UnicodeError.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\x82\x84", "strict", None),
        ("abc\xf8", "strict", None),
        ("abc\x80\x80\x82\x84", "replace", u"abc\ufffd\uff44"),
        ("abc\x80\x80\x82\x84\x88", "replace", u"abc\ufffd\uff44\ufffd"),
        ("abc\x80\x80\x82\x84def", "ignore", u"abc\uff44def"),
        # sjis vs cp932
        ("\\\x7e", "replace", u"\xa5\u203e"),
        ("\x81\x5f\x81\x61\x81\x7c", "replace", u"\x5c\u2016\u2212"),
    )

    # (text to encode, expected bytes) used by test_customreplace.
    xmlcharnametest = (
        u"\xab\u211c\xbb = \u2329\u1234\u232a",
        "\x85G&real;\x85Q = &lang;&#4660;&rang;"
    )
def test_main():
    # Gather the test cases in their original order; the *_STRICT codecs
    # are only tested when running against the CJKCodecs distribution.
    cases = [Test_CP932, Test_EUC_JISX0213, Test_EUC_JP_COMPAT,
             Test_SJIS_COMPAT]
    if test_multibytecodec_support.__cjkcodecs__:
        cases.extend([Test_EUC_JP_STRICT, Test_SJIS_STRICT])
    cases.append(Test_SJISX0213)

    suite = unittest.TestSuite()
    for case in cases:
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecencodings_kr.py
# Codec encoding tests for ROK encodings.
#
# $CJKCodecs: test_codecencodings_kr.py,v 1.1 2003/12/19 03:00:06 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class Test_CP949(test_multibytecodec_support.TestBase,
                 unittest.TestCase):
    # Runs the shared TestBase checks against the cp949 codec.
    encoding = 'cp949'
    tstring = test_multibytecodec_support.load_teststring('cp949')

    # Rows are (input, error scheme, expected result); an expected result
    # of None marks input that has to be rejected with UnicodeError.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"),
    )
class Test_EUCKR(test_multibytecodec_support.TestBase,
                 unittest.TestCase):
    # Runs the shared TestBase checks against the euc_kr codec.
    encoding = 'euc_kr'
    tstring = test_multibytecodec_support.load_teststring('euc_kr')

    # Rows are (input, error scheme, expected result); an expected result
    # of None marks input that has to be rejected with UnicodeError.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"),
    )
class Test_JOHAB(test_multibytecodec_support.TestBase,
                 unittest.TestCase):
    # Runs the shared TestBase checks against the johab codec.
    encoding = 'johab'
    tstring = test_multibytecodec_support.load_teststring('johab')

    # Rows are (input, error scheme, expected result); an expected result
    # of None marks input that has to be rejected with UnicodeError.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\ucd27"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\ucd27\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\ucd27"),
    )
def test_main():
    # Build one suite from every test case in this module and run it.
    suite = unittest.TestSuite()
    for case in (Test_CP949, Test_EUCKR, Test_JOHAB):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecencodings_tw.py
# Codec encoding tests for ROC encodings.
#
# $CJKCodecs: test_codecencodings_tw.py,v 1.1 2003/12/19 03:00:06 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class Test_Big5(test_multibytecodec_support.TestBase,
                unittest.TestCase):
    # Runs the shared TestBase checks against the big5 codec.
    encoding = 'big5'
    tstring = test_multibytecodec_support.load_teststring('big5')

    # Rows are (input, error scheme, expected result); an expected result
    # of None marks input that has to be rejected with UnicodeError.
    codectests = (
        # invalid bytes
        ("abc\x80\x80\xc1\xc4", "strict", None),
        ("abc\xc8", "strict", None),
        ("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\u8b10"),
        ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u8b10\ufffd"),
        ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u8b10"),
    )
def test_main():
    # Build a suite from this module's only test case and run it.
    suite = unittest.TestSuite()
    for case in (Test_Big5,):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecmaps_cn.py
# Codec mapping tests for PRC encodings
#
# $CJKCodecs: test_codecmaps_cn.py,v 1.2 2004/01/17 12:47:19 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class TestGB2312Map(test_multibytecodec_support.TestBase_Mapping,
                    unittest.TestCase):
    # Checks the gb2312 codec against the EUC-CN.TXT mapping table.
    encoding = 'gb2312'
    mapfilename = 'EUC-CN.TXT'
    mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-CN.TXT'
class TestGBKMap(test_multibytecodec_support.TestBase_Mapping,
                 unittest.TestCase):
    # Checks the gbk codec against the CP936.TXT mapping table.
    encoding = 'gbk'
    mapfilename = 'CP936.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/VENDORS/'
                  'MICSFT/WINDOWS/CP936.TXT')
def test_main():
    # Build one suite from every mapping test in this module and run it.
    suite = unittest.TestSuite()
    for case in (TestGB2312Map, TestGBKMap):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


# Executed on import: locates the mapping files and records on this module
# whether skipping the tests is expected.
test_multibytecodec_support.register_skip_expected(TestGB2312Map, TestGBKMap)

if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecmaps_jp.py
# Codec mapping tests for Japanese encodings
#
# $CJKCodecs: test_codecmaps_jp.py,v 1.2 2004/01/17 12:47:19 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class TestCP932Map(test_multibytecodec_support.TestBase_Mapping,
                   unittest.TestCase):
    # Checks the cp932 codec against CP932.TXT plus a few supplemental
    # single-byte mappings that are tested in addition to the file.
    encoding = 'cp932'
    mapfilename = 'CP932.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/'
                  'WINDOWS/CP932.TXT')

    # (encoded bytes, unicode) pairs fed to test_mapping_supplemental.
    supmaps = [
        ('\x80', u'\u0080'),
        ('\xa0', u'\uf8f0'),
        ('\xfd', u'\uf8f1'),
        ('\xfe', u'\uf8f2'),
        ('\xff', u'\uf8f3'),
    ]
    # bytes 0xA1..0xDF map to U+FF61..U+FF9F
    for i in range(0xa1, 0xe0):
        supmaps.append((chr(i), unichr(i+0xfec0)))
class TestEUCJPCOMPATMap(test_multibytecodec_support.TestBase_Mapping,
                         unittest.TestCase):
    # Checks the euc_jp codec against the EUC-JP.TXT mapping table.
    encoding = 'euc_jp'
    mapfilename = 'EUC-JP.TXT'
    mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-JP.TXT'
class TestSJISCOMPATMap(test_multibytecodec_support.TestBase_Mapping,
                        unittest.TestCase):
    # Checks the shift_jis codec against SHIFTJIS.TXT, except for the
    # entries listed below where the codec intentionally differs.
    encoding = 'shift_jis'
    mapfilename = 'SHIFTJIS.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/OBSOLETE'
                  '/EASTASIA/JIS/SHIFTJIS.TXT')

    # (encoded bytes, unicode) pairs excluded from the encode check.
    pass_enctest = [
        ('\x81_', u'\\'),
    ]
    # (encoded bytes, unicode) pairs excluded from the decode check.
    pass_dectest = [
        ('\\', u'\xa5'),
        ('~', u'\u203e'),
        ('\x81_', u'\\'),
    ]
class TestSJISSTRICTMap(test_multibytecodec_support.TestBase_Mapping,
                        unittest.TestCase):
    # Checks the shift_jis_strict codec against SHIFTJIS.TXT with no
    # exemptions.
    encoding = 'shift_jis_strict'
    mapfilename = 'SHIFTJIS.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/OBSOLETE'
                  '/EASTASIA/JIS/SHIFTJIS.TXT')
class TestEUCJISX0213Map(test_multibytecodec_support.TestBase_Mapping,
                         unittest.TestCase):
    # Checks the euc_jisx0213 codec against EUC-JISX0213.TXT.
    encoding = 'euc_jisx0213'
    mapfilename = 'EUC-JISX0213.TXT'
    mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-JISX0213.TXT'
class TestSJISX0213Map(test_multibytecodec_support.TestBase_Mapping,
                       unittest.TestCase):
    # Checks the shift_jisx0213 codec against SHIFT_JISX0213.TXT.
    encoding = 'shift_jisx0213'
    mapfilename = 'SHIFT_JISX0213.TXT'
    mapfileurl = 'http://people.freebsd.org/~perky/i18n/SHIFT_JISX0213.TXT'
def test_main():
    # Gather the mapping tests in their original order; the strict
    # shift_jis variant is only tested against the CJKCodecs distribution.
    cases = [TestCP932Map, TestEUCJPCOMPATMap, TestSJISCOMPATMap]
    if test_multibytecodec_support.__cjkcodecs__:
        cases.append(TestSJISSTRICTMap)
    cases.extend([TestEUCJISX0213Map, TestSJISX0213Map])

    suite = unittest.TestSuite()
    for case in cases:
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


# Executed on import: locates the mapping files and records on this module
# whether skipping the tests is expected.
test_multibytecodec_support.register_skip_expected(
    TestCP932Map, TestEUCJPCOMPATMap, TestSJISCOMPATMap,
    TestEUCJISX0213Map, TestSJISX0213Map)

if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecmaps_kr.py
# Codec mapping tests for ROK encodings
#
# $CJKCodecs: test_codecmaps_kr.py,v 1.2 2004/01/17 12:47:19 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class TestCP949Map(test_multibytecodec_support.TestBase_Mapping,
                   unittest.TestCase):
    # Checks the cp949 codec against the CP949.TXT mapping table.
    encoding = 'cp949'
    mapfilename = 'CP949.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT'
                  '/WINDOWS/CP949.TXT')
class TestEUCKRMap(test_multibytecodec_support.TestBase_Mapping,
                   unittest.TestCase):
    # Checks the euc_kr codec against the EUC-KR.TXT mapping table.
    encoding = 'euc_kr'
    mapfilename = 'EUC-KR.TXT'
    mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-KR.TXT'
class TestJOHABMap(test_multibytecodec_support.TestBase_Mapping,
                   unittest.TestCase):
    # Checks the johab codec against the JOHAB.TXT mapping table.
    encoding = 'johab'
    mapfilename = 'JOHAB.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/'
                  'KSC/JOHAB.TXT')

    # The KS X 1001 standard assigns 0x5c to WON SIGN.  However, in the
    # early 90s -- the only period in which johab was widely used -- most
    # software implemented it as REVERSE SOLIDUS, so the standard is
    # deliberately ignored for this one entry.
    pass_enctest = [('\\', u'\u20a9')]
    pass_dectest = [('\\', u'\u20a9')]
def test_main():
    # Build one suite from every mapping test in this module and run it.
    suite = unittest.TestSuite()
    for case in (TestCP949Map, TestEUCKRMap, TestJOHABMap):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


# Executed on import: locates the mapping files and records on this module
# whether skipping the tests is expected.
test_multibytecodec_support.register_skip_expected(
    TestCP949Map, TestEUCKRMap, TestJOHABMap)

if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_codecmaps_tw.py
# Codec mapping tests for ROC encodings
#
# $CJKCodecs: test_codecmaps_tw.py,v 1.2 2004/01/17 12:47:19 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest
class TestBIG5Map(test_multibytecodec_support.TestBase_Mapping,
                  unittest.TestCase):
    # Checks the big5 codec against the BIG5.TXT mapping table.
    encoding = 'big5'
    mapfilename = 'BIG5.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/OBSOLETE/'
                  'EASTASIA/OTHER/BIG5.TXT')
class TestCP950Map(test_multibytecodec_support.TestBase_Mapping,
                   unittest.TestCase):
    # Checks the cp950 codec against the CP950.TXT mapping table.
    encoding = 'cp950'
    mapfilename = 'CP950.TXT'
    mapfileurl = ('http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/'
                  'WINDOWS/CP950.TXT')

    # (encoded bytes, unicode) pairs excluded from the encode check.
    pass_enctest = [
        ('\xa2\xcc', u'\u5341'),
        ('\xa2\xce', u'\u5345'),
    ]
def test_main():
    # Build one suite from every mapping test in this module and run it.
    suite = unittest.TestSuite()
    for case in (TestBIG5Map, TestCP950Map):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


# Executed on import: locates the mapping files and records on this module
# whether skipping the tests is expected.
test_multibytecodec_support.register_skip_expected(TestBIG5Map, TestCP950Map)

if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_multibytecodec.py
# Unit test for multibytecodec itself
#
# $CJKCodecs: test_multibytecodec.py,v 1.5 2004/01/06 02:26:28 perky Exp $
from test import test_support
from test import test_multibytecodec_support
import unittest, StringIO, codecs
class Test_StreamWriter(unittest.TestCase):
    # Stream-writer behaviour around surrogate pairs.  The surrogate tests
    # are only defined when a non-BMP character occupies two code units.
    if len(u'\U00012345') == 2: # UCS2
        def test_gb18030(self):
            # A lone high surrogate produces no output until its partner
            # arrives; reset() with one still pending raises UnicodeError.
            astral = u'\U00012345'
            high, low = astral[0], astral[1]
            sink = StringIO.StringIO()
            writer = codecs.lookup('gb18030')[3](sink)

            writer.write(u'123')
            self.assertEqual(sink.getvalue(), '123')
            writer.write(astral)
            self.assertEqual(sink.getvalue(), '123\x907\x959')
            writer.write(high)
            self.assertEqual(sink.getvalue(), '123\x907\x959')

            writer.write(low + astral + u'\uac00\u00ac')
            complete = ('123\x907\x959\x907\x959\x907\x959'
                        '\x827\xcf5\x810\x851')
            self.assertEqual(sink.getvalue(), complete)
            writer.write(high)
            self.assertEqual(sink.getvalue(), complete)
            self.assertRaises(UnicodeError, writer.reset)
            self.assertEqual(sink.getvalue(), complete)

        # the standard utf-8 codec has a broken StreamReader, so this test
        # is only defined when running against CJKCodecs
        if test_multibytecodec_support.__cjkcodecs__:
            def test_utf_8(self):
                # Same buffering as above, but here reset() writes the
                # pending surrogate out on its own instead of raising.
                astral = u'\U00012345'
                high, low = astral[0], astral[1]
                sink = StringIO.StringIO()
                writer = codecs.lookup('utf-8')[3](sink)

                writer.write(u'123')
                self.assertEqual(sink.getvalue(), '123')
                writer.write(astral)
                self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')
                writer.write(high)
                self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')

                writer.write(low + astral + u'\uac00\u00ac')
                complete = ('123\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                            '\xf0\x92\x8d\x85\xea\xb0\x80\xc2\xac')
                self.assertEqual(sink.getvalue(), complete)
                writer.write(high)
                self.assertEqual(sink.getvalue(), complete)
                writer.reset()
                self.assertEqual(sink.getvalue(), complete + '\xed\xa0\x88')
                writer.write(low)
                self.assertEqual(sink.getvalue(),
                                 complete + '\xed\xa0\x88\xed\xbd\x85')

    else: # UCS4
        pass

    def test_nullcoding(self):
        # Empty input must convert to empty output in both directions.
        self.assertEqual(''.decode('utf-8'), u'')
        self.assertEqual(unicode('', 'utf-8'), u'')
        self.assertEqual(u''.encode('utf-8'), '')

    def test_str_decode(self):
        # Encoding a plain byte string must hand the same bytes back.
        self.assertEqual('abcd'.encode('utf-8'), 'abcd')
def test_main():
    # Build a suite from this module's only test case and run it.
    suite = unittest.TestSuite()
    for case in (Test_StreamWriter,):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)


if __name__ == "__main__":
    test_main()
#!/usr/bin/env python
#
# test_multibytecodec_support.py
# Common Unittest Routines for CJK codecs
#
# $CJKCodecs: test_multibytecodec_support.py,v 1.5 2004/01/17 12:47:19 perky Exp $
import sys, codecs, os.path
import unittest
from test import test_support
from StringIO import StringIO
__cjkcodecs__ = 0 # define this as 0 for python
class TestBase:
    # Mixin holding the checks shared by the per-codec test cases.  The
    # concrete classes combine it with unittest.TestCase and override the
    # attributes below.
    encoding = ''            # codec name
    codec = None             # codec tuple (with 4 elements); looked up
                             # from `encoding` in setUp() when left as None
    tstring = ''             # (encoded sample, utf-8 sample) pair
    codectests = None        # must set. (input, scheme, expected) tuples
    roundtriptest = 1        # set if roundtrip is possible with unicode
    has_iso10646 = 0         # set if this encoding contains whole iso10646 map
    xmlcharnametest = None   # optional (input, expected) pair for
                             # test_customreplace

    def setUp(self):
        # Resolve the codec once and unpack its four entry points.
        if self.codec is None:
            self.codec = codecs.lookup(self.encoding)
        self.encode, self.decode, self.reader, self.writer = self.codec

    def test_chunkcoding(self):
        # Decode the sample line by line, compare with the UTF-8 rendition
        # and, if enabled, check that re-encoding restores the input.
        for native, utf8 in zip(*[StringIO(f).readlines()
                                  for f in self.tstring]):
            u = self.decode(native)[0]
            self.assertEqual(u, utf8.decode('utf-8'))
            if self.roundtriptest:
                self.assertEqual(native, self.encode(u)[0])

    def test_errorhandle(self):
        # Byte-string inputs are decoded, unicode inputs are encoded.
        for source, scheme, expected in self.codectests:
            if isinstance(source, str):
                func = self.decode
            else:
                func = self.encode
            # Only None means "must raise"; an empty expected string is a
            # legitimate result (e.g. everything dropped by "ignore").
            if expected is not None:
                result = func(source, scheme)[0]
                self.assertEqual(result, expected)
            else:
                self.assertRaises(UnicodeError, func, source, scheme)

    if sys.hexversion >= 0x02030000:
        def test_xmlcharrefreplace(self):
            if self.has_iso10646:
                return

            s = u"\u0b13\u0b23\u0b60 nd eggs"
            self.assertEqual(
                self.encode(s, "xmlcharrefreplace")[0],
                "&#2835;&#2851;&#2912; nd eggs"
            )

        def test_customreplace(self):
            if self.has_iso10646:
                return

            import htmlentitydefs

            # Map each character that has an HTML entity to the entity
            # name, decoded with the codec under test.
            names = {}
            for (key, value) in htmlentitydefs.entitydefs.items():
                if len(value)==1:
                    names[value.decode('latin-1')] = self.decode(key)[0]
                else:
                    # value looks like "&#NNN;"
                    names[unichr(int(value[2:-1]))] = self.decode(key)[0]

            def xmlcharnamereplace(exc):
                # Replace unencodable characters by a named entity when one
                # is known, otherwise by a numeric character reference.
                if not isinstance(exc, UnicodeEncodeError):
                    raise TypeError("don't know how to handle %r" % exc)
                l = []
                for c in exc.object[exc.start:exc.end]:
                    try:
                        l.append(u"&%s;" % names[c])
                    except KeyError:
                        l.append(u"&#%d;" % ord(c))
                return (u"".join(l), exc.end)

            codecs.register_error(
                "test.xmlcharnamereplace", xmlcharnamereplace)

            if self.xmlcharnametest:
                sin, sout = self.xmlcharnametest
            else:
                sin = u"\xab\u211c\xbb = \u2329\u1234\u232a"
                sout = "&laquo;&real;&raquo; = &lang;&#4660;&rang;"
            self.assertEqual(self.encode(sin,
                                         "test.xmlcharnamereplace")[0], sout)

    def test_streamreader(self):
        # Read the encoded sample through the codec's reader with every
        # read method and a range of size hints; the UTF-8 re-encoding of
        # what was read must equal the UTF-8 sample.
        UTF8Writer = codecs.getwriter('utf-8')
        for name in ["read", "readline", "readlines"]:
            for sizehint in [None, -1] + range(1, 33) + \
                            [64, 128, 256, 512, 1024]:
                istream = self.reader(StringIO(self.tstring[0]))
                ostream = UTF8Writer(StringIO())
                func = getattr(istream, name)
                while 1:
                    data = func(sizehint)
                    if not data:
                        break
                    if name == "readlines":
                        ostream.writelines(data)
                    else:
                        ostream.write(data)

                self.assertEqual(ostream.getvalue(), self.tstring[1])

    def test_streamwriter(self):
        # Read the UTF-8 sample and push it through the codec's writer; the
        # result must equal the encoded sample.
        if __cjkcodecs__:
            readfuncs = ('read', 'readline', 'readlines')
        else:
            # standard utf8 codec has broken readline and readlines.
            readfuncs = ('read',)
        UTF8Reader = codecs.getreader('utf-8')
        for name in readfuncs:
            for sizehint in [None] + range(1, 33) + \
                            [64, 128, 256, 512, 1024]:
                istream = UTF8Reader(StringIO(self.tstring[1]))
                ostream = self.writer(StringIO())
                func = getattr(istream, name)
                while 1:
                    if sizehint is not None:
                        data = func(sizehint)
                    else:
                        data = func()

                    if not data:
                        break
                    if name == "readlines":
                        ostream.writelines(data)
                    else:
                        ostream.write(data)

                self.assertEqual(ostream.getvalue(), self.tstring[0])
if len(u'\U00012345') == 2: # ucs2 build
    # On builds where a non-BMP character takes two code units, shadow
    # unichr() and ord() with versions that convert between a code point
    # and its surrogate pair.
    _unichr = unichr
    def unichr(v):
        if v >= 0x10000:
            return _unichr(0xd800 + ((v - 0x10000) >> 10)) + \
                   _unichr(0xdc00 + ((v - 0x10000) & 0x3ff))
        else:
            return _unichr(v)

    _ord = ord
    def ord(c):
        if len(c) == 2:
            # Both halves are single code units, so use the saved builtin
            # for each instead of re-entering this wrapper.
            return 0x10000 + ((_ord(c[0]) - 0xd800) << 10) + \
                   (_ord(c[1]) - 0xdc00)
        else:
            return _ord(c)
class TestBase_Mapping(unittest.TestCase):
    # Base class for tests that compare a codec with a mapping table file.
    # Subclasses set `encoding`, `mapfilename` and `mapfileurl`.
    pass_enctest = []   # (bytes, unicode) pairs exempt from the encode check
    pass_dectest = []   # (bytes, unicode) pairs exempt from the decode check
    supmaps = []        # extra (bytes, unicode) pairs checked in addition
                        # to the mapping file

    def __init__(self, *args, **kw):
        unittest.TestCase.__init__(self, *args, **kw)
        # Without the mapping file there is nothing to test.
        if not os.path.exists(self.mapfilename):
            raise test_support.TestSkipped('%s not found, download from %s' %
                    (self.mapfilename, self.mapfileurl))

    def test_mapping_file(self):
        # NOTE(review): eval() is applied to fields read from the mapping
        # file, so this must only be run against trusted mapping tables.
        def unichrs(s):
            # "0xAAAA+0xBBBB" -> the corresponding unicode string
            return u''.join(map(unichr, map(eval, s.split('+'))))

        # First byte sequence seen for each unicode string; later lines
        # that map to an already seen string are skipped.
        urt_wa = {}

        mapfile = open(self.mapfilename)
        try:
            for line in mapfile:
                # Drop the trailing comment; expect "<bytes> <unicode>".
                data = line.split('#')[0].strip().split()
                if len(data) != 2:
                    continue

                # Spell the numeric code out as a big-endian byte string.
                csetval = eval(data[0])
                if csetval <= 0x7F:
                    csetch = chr(csetval & 0xff)
                elif csetval >= 0x1000000:
                    csetch = chr(csetval >> 24) + \
                             chr((csetval >> 16) & 0xff) + \
                             chr((csetval >> 8) & 0xff) + chr(csetval & 0xff)
                elif csetval >= 0x10000:
                    csetch = chr(csetval >> 16) + \
                             chr((csetval >> 8) & 0xff) + chr(csetval & 0xff)
                elif csetval >= 0x100:
                    csetch = chr(csetval >> 8) + chr(csetval & 0xff)
                else:
                    # 0x80..0xFF: single non-ASCII bytes are not checked
                    continue

                unich = unichrs(data[1])
                # Compare as a string so that multi-character mappings do
                # not have to go through ord().
                if unich == u'\ufffd' or unich in urt_wa:
                    continue
                urt_wa[unich] = csetch

                self._testpoint(csetch, unich)
        finally:
            mapfile.close()

    def test_mapping_supplemental(self):
        for mapping in self.supmaps:
            self._testpoint(*mapping)

    def _testpoint(self, csetch, unich):
        # Check one pair in both directions unless it is exempted.
        if (csetch, unich) not in self.pass_enctest:
            self.assertEqual(unich.encode(self.encoding), csetch)
        if (csetch, unich) not in self.pass_dectest:
            self.assertEqual(unicode(csetch, self.encoding), unich)
def load_teststring(encoding):
    """Return the (encoded text, utf-8 text) sample pair for *encoding*.

    When running against the CJKCodecs distribution the pair is read from
    sampletexts/<encoding>.txt and sampletexts/<encoding>.utf8; otherwise
    it is taken from test.cjkencodings_test.teststring.
    """
    if not __cjkcodecs__:
        from test import cjkencodings_test
        return cjkencodings_test.teststring[encoding]

    base = os.path.join('sampletexts', encoding)
    texts = []
    for suffix in ('.txt', '.utf8'):
        sample = open(base + suffix)
        try:
            # Close each file explicitly instead of leaking the handle.
            texts.append(sample.read())
        finally:
            sample.close()
    return tuple(texts)
def register_skip_expected(*cases):
    """Locate the mapping files of *cases* and flag their module.

    Each case's ``mapfilename`` is searched for in the current and the
    parent directory and rewritten to the path that was found.  The module
    defining the cases gets ``skip_expected = True`` as soon as one file
    is missing (the remaining cases are then left untouched), and
    ``skip_expected = False`` if every file was found.  Calling it without
    any case does nothing.
    """
    if not cases:
        # Nothing to register; there is no module to flag either.
        return

    skip_expected = False
    for case in cases:
        for path in [os.path.curdir, os.path.pardir]:
            fn = os.path.join(path, case.mapfilename)
            if os.path.exists(fn):
                case.mapfilename = fn
                break
        else:
            # Not found anywhere: stop at the first missing file.
            skip_expected = True
            break

    # `case` is the case with the missing file, or the last one when all
    # files were found.
    sys.modules[case.__module__].skip_expected = skip_expected
......@@ -478,6 +478,48 @@ GLHACK=-Dclear=__GLclear
#EXPAT_DIR=/usr/local/src/expat-1.95.2
#pyexpat pyexpat.c -DHAVE_EXPAT_H -I$(EXPAT_DIR)/lib -L$(EXPAT_DIR) -lexpat
# Hye-Shik Chang's CJKCodecs
# multibytecodec is required for all the other CJK codec modules
#_multibytecodec cjkcodecs/multibytecodec.c
# mapdata modules are required to support their respective dependent codecs
#_codecs_mapdata_ja_JP cjkcodecs/mapdata_ja_JP.c
#_codecs_mapdata_ko_KR cjkcodecs/mapdata_ko_KR.c
#_codecs_mapdata_zh_CN cjkcodecs/mapdata_zh_CN.c
#_codecs_mapdata_zh_TW cjkcodecs/mapdata_zh_TW.c
# ja_JP codecs
#_codecs_cp932 cjkcodecs/_cp932.c
#_codecs_euc_jisx0213 cjkcodecs/_euc_jisx0213.c
#_codecs_euc_jp cjkcodecs/_euc_jp.c
#_codecs_iso2022_jp cjkcodecs/_iso2022_jp.c
#_codecs_iso2022_jp_1 cjkcodecs/_iso2022_jp_1.c
#_codecs_iso2022_jp_3 cjkcodecs/_iso2022_jp_3.c
#_codecs_iso2022_jp_ext cjkcodecs/_iso2022_jp_ext.c
#_codecs_shift_jis cjkcodecs/_shift_jis.c
#_codecs_shift_jisx0213 cjkcodecs/_shift_jisx0213.c
# ko_KR codecs
#_codecs_cp949 cjkcodecs/_cp949.c
#_codecs_euc_kr cjkcodecs/_euc_kr.c
#_codecs_johab cjkcodecs/_johab.c
# zh_CN codecs
#_codecs_gb18030 cjkcodecs/_gb18030.c
#_codecs_gb2312 cjkcodecs/_gb2312.c
#_codecs_gbk cjkcodecs/_gbk.c
#_codecs_hz cjkcodecs/_hz.c
# zh_TW codecs
#_codecs_big5 cjkcodecs/_big5.c
#_codecs_cp950 cjkcodecs/_cp950.c
# international codecs
#_codecs_iso2022_jp_2 cjkcodecs/_iso2022_jp_2.c # requires ja_JP, ko_KR, zh_CN
# Example -- included for reference only:
# xx xxmodule.c
......
Notes on cjkcodecs
-------------------
This directory contains source files for cjkcodecs extension modules.
They are currently based on CJKCodecs (http://cjkpython.i18n.org/#CJKCodecs)
as of Jan 17 2004.
To generate or modify mapping headers
-------------------------------------
Mapping headers are imported from CJKCodecs in pre-generated form.
If you need to tweak them or add something to them, please look at the
tools/ subdirectory of the CJKCodecs distribution.
Notes on implementation characteristics of each codec
-----------------------------------------------------
1) Big5 codec
The big5 codec maps the following characters the way cp950 does, rather
than following Unicode.org's mapping, which maps them to 0xFFFD.
BIG5 Unicode Description
0xA15A 0x2574 SPACING UNDERSCORE
0xA1C3 0xFFE3 SPACING HEAVY OVERSCORE
0xA1C5 0x02CD SPACING HEAVY UNDERSCORE
0xA1FE 0xFF0F LT DIAG UP RIGHT TO LOW LEFT
0xA240 0xFF3C LT DIAG UP LEFT TO LOW RIGHT
0xA2CC 0x5341 HANGZHOU NUMERAL TEN
0xA2CE 0x5345 HANGZHOU NUMERAL THIRTY
Because Unicode 0x5341, 0x5345, 0xFF0F and 0xFF3C are already mapped to
other big5 codes, roundtrip compatibility is not guaranteed for
them.
2) cp932 codec
To conform to the mapping Windows actually uses, the cp932 codec maps the
following codepoints in addition to the official cp932 mapping.
CP932 Unicode Description
0x80 0x80 UNDEFINED
0xA0 0xF8F0 UNDEFINED
0xFD 0xF8F1 UNDEFINED
0xFE 0xF8F2 UNDEFINED
0xFF 0xF8F3 UNDEFINED
3) euc-jisx0213 codec
The euc-jisx0213 codec maps JIS X 0213 Plane 1 code 0x2140 to
Unicode U+FF3C instead of U+005C as in unicode.org's mapping.
Because euc-jisx0213 already has REVERSE SOLIDUS at 0x5c and A1C0
(the EUC form of 0x2140) is shown as a full-width character, mapping
it to U+FF3C makes more sense.
The euc-jisx0213 codec is also able to decode JIS X 0212 codes in
codeset 2. Because JIS X 0212 and JIS X 0213 Plane 2 do not overlap
each other, this does not break conformance to the standard
(and JIS X 0213 Plane 2 is intended to be used that way). When
encoding, the codec tries to encode kanji characters in this
order:
JIS X 0213 Plane 1 -> JIS X 0213 Plane 2 -> JIS X 0212
4) euc-jp codec
The euc-jp codec is a compatibility variant in these respects:
- U+FF3C FULLWIDTH REVERSE SOLIDUS is mapped to EUC-JP A1C0 (and vice versa)
- U+00A5 YEN SIGN is mapped to EUC-JP 0x5c. (one way)
- U+203E OVERLINE is mapped to EUC-JP 0x7e. (one way)
5) shift-jis codec
The shift-jis codec maps the 0x20-0x7e area directly to U+20-U+7E
instead of using JIS X 0201, for compatibility. The differences are:
- U+005C REVERSE SOLIDUS is mapped to SHIFT-JIS 0x5c.
- U+007E TILDE is mapped to SHIFT-JIS 0x7e.
- U+FF3C FULL-WIDTH REVERSE SOLIDUS is mapped to SHIFT-JIS 815f.
/*
* _big5.c: the Big5 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _big5.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(big5)
DECMAP(big5)
/*
 * Big5 encoder.  Code points below 0x80 are written as one byte; every
 * other character is looked up in the big5 encode map and written as a
 * two-byte code.  Returns 1 as soon as a character has no mapping, and 0
 * once the loop over the input is done.
 * NOTE(review): the helper macros come from codeccommon.h, which is not
 * visible here; NEXT(i, o) presumably advances input by i and output by o.
 */
ENCODER(big5)
{
    while (inleft > 0) {
        Py_UNICODE c = **inbuf;
        DBCHAR code;

        if (c < 0x80) {
            RESERVE_OUTBUF(1)
            **outbuf = c;
            NEXT(1, 1)
            continue;
        }
        UCS4INVALID(c)

        RESERVE_OUTBUF(2)

        /* TRYMAP_ENC acts as an if-head: ';' is its empty success branch */
        TRYMAP_ENC(big5, code, c);
        else return 1;

        (*outbuf)[0] = code >> 8;   /* lead byte */
        (*outbuf)[1] = code & 0xFF; /* trail byte */
        NEXT(1, 2)
    }

    return 0;
}
/*
 * Big5 decoder.  Bytes below 0x80 decode to themselves; any other byte is
 * taken as the lead byte of a two-byte code and looked up, together with
 * the following byte, in the big5 decode map.  Returns 2 when that lookup
 * fails, and 0 once the loop over the input is done.
 */
DECODER(big5)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        RESERVE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        RESERVE_INBUF(2)
        /* on success the decoded character is stored straight in **outbuf */
        TRYMAP_DEC(big5, **outbuf, c, IN2) {
            NEXT(2, 1)
        } else return 2;
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registration block for the big5 codec.
 * NOTE(review): MAPOPEN/IMPORTMAP_* are defined in codecentry.h (not
 * visible here); presumably they fetch the big5 encode and decode maps
 * from the zh_TW map data module -- confirm.
 */
BEGIN_CODEC_REGISTRY(big5)
    MAPOPEN(zh_TW)
        IMPORTMAP_ENCDEC(big5)
    MAPCLOSE()
END_CODEC_REGISTRY(big5)
/*
* _cp932.c: the CP932 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _cp932.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(jisxcommon)
ENCMAP(cp932ext)
DECMAP(jisx0208)
DECMAP(cp932ext)
/*
 * CP932 encoder.  Single-byte output is produced for code points up to
 * U+0080, for half-width katakana (U+FF61..U+FF9F) and for the four
 * Windows-compatibility code points U+F8F0..U+F8F3.  Everything else is
 * written as two bytes, taken from the cp932ext map, converted from the
 * JIS X 0208 entries of the jisxcommon map, or computed for the
 * user-defined area U+E000..U+E757.  Returns 1 for a character that
 * cannot be encoded, and 0 once the loop over the input is done.
 * NOTE(review): the helper macros come from codeccommon.h, which is not
 * visible here.
 */
ENCODER(cp932)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;
        unsigned char c1, c2;

        if (c <= 0x80) {
            WRITE1(c)
            NEXT(1, 1)
            continue;
        } else if (c >= 0xff61 && c <= 0xff9f) {
            /* half-width katakana: U+FF61..U+FF9F -> 0xA1..0xDF */
            WRITE1(c - 0xfec0)
            NEXT(1, 1)
            continue;
        } else if (c >= 0xf8f0 && c <= 0xf8f3) {
            /* Windows compatibility */
            RESERVE_OUTBUF(1)
            if (c == 0xf8f0)
                OUT1(0xa0)
            else
                /* U+F8F1..U+F8F3 -> 0xFD..0xFF.  The base used to be
                 * written as 0xfef1, which only gave the right byte
                 * because the result was truncated to 8 bits. */
                OUT1(c - 0xf8f1 + 0xfd)
            NEXT(1, 1)
            continue;
        }

        UCS4INVALID(c)
        RESERVE_OUTBUF(2)

        TRYMAP_ENC(cp932ext, code, c) {
            OUT1(code >> 8)
            OUT2(code & 0xff)
        } else TRYMAP_ENC(jisxcommon, code, c) {
            if (code & 0x8000) /* MSB set: JIS X 0212 */
                return 1;

            /* JIS X 0208: fold the row/cell pair into lead/trail bytes */
            c1 = code >> 8;
            c2 = code & 0xff;
            c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
            c1 = (c1 - 0x21) >> 1;
            OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
        } else if (c >= 0xe000 && c < 0xe758) {
            /* User-defined area: 188 code points per lead byte from 0xF0 */
            c1 = (Py_UNICODE)(c - 0xe000) / 188;
            c2 = (Py_UNICODE)(c - 0xe000) % 188;
            OUT1(c1 + 0xf0)
            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
        } else
            return 1;

        NEXT(1, 2)
    }

    return 0;
}
/*
 * CP932 decoder.  Bytes up to 0x80 decode to themselves, 0xA0..0xDF and
 * 0xFD..0xFF are single-byte codes, and everything else starts a two-byte
 * code that is resolved through the cp932ext map, the jisx0208 map or the
 * user-defined area (lead bytes 0xF0..0xF9).  Returns 2 for a two-byte
 * code that cannot be decoded, and 0 once the loop over the input is done.
 */
DECODER(cp932)
{
    while (inleft > 0) {
        unsigned char c = IN1, c2;

        RESERVE_OUTBUF(1)
        if (c <= 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        } else if (c >= 0xa0 && c <= 0xdf) {
            if (c == 0xa0)
                OUT1(0xf8f0) /* counterpart of the encoder's U+F8F0 case */
            else
                OUT1(0xfec0 + c) /* half-width katakana */
            NEXT(1, 1)
            continue;
        } else if (c >= 0xfd/* && c <= 0xff*/) {
            /* Windows compatibility */
            OUT1(0xf8f1 - 0xfd + c)
            NEXT(1, 1)
            continue;
        }

        RESERVE_INBUF(2)
        c2 = IN2;

        /* TRYMAP_DEC acts as an if-head: ';' is its empty success branch */
        TRYMAP_DEC(cp932ext, **outbuf, c, c2);
        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)) {
            /* reject trail bytes below 0x40, equal to 0x7f or above 0xfc */
            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                return 2;

            /* unfold lead/trail bytes into a JIS X 0208 row/cell pair */
            c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
            c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;

            TRYMAP_DEC(jisx0208, **outbuf, c, c2);
            else return 2;
        } else if (c >= 0xf0 && c <= 0xf9) {
            /* user-defined area: 188 code points per lead byte, from U+E000 */
            if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
                OUT1(0xe000 + 188 * (c - 0xf0) +
                     (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
            else
                return 2;
        } else
            return 2;

        NEXT(2, 1)
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registration block for the cp932 codec.
 * NOTE(review): MAPOPEN/IMPORTMAP_* are defined in codecentry.h (not
 * visible here); presumably they fetch the listed maps from the ja_JP map
 * data module -- confirm.
 */
BEGIN_CODEC_REGISTRY(cp932)
    MAPOPEN(ja_JP)
        IMPORTMAP_DEC(jisx0208)
        IMPORTMAP_ENCDEC(cp932ext)
        IMPORTMAP_ENC(jisxcommon)
    MAPCLOSE()
END_CODEC_REGISTRY(cp932)
/*
* _cp949.c: the CP949 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _cp949.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(cp949)
DECMAP(ksx1001)
DECMAP(cp949ext)
/*
 * CP949 encoder.  Code points below 0x80 are written as one byte; other
 * characters are looked up in the cp949 encode map and written as two
 * bytes.  The map entry's top bit selects how the trail byte is formed
 * (see the comments below).  Returns 1 when a character has no mapping,
 * and 0 once the loop over the input is done.
 */
ENCODER(cp949)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
            continue;
        }
        UCS4INVALID(c)

        RESERVE_OUTBUF(2)
        /* TRYMAP_ENC acts as an if-head: ';' is its empty success branch */
        TRYMAP_ENC(cp949, code, c);
        else return 1;

        /* the lead byte always has its high bit set */
        OUT1((code >> 8) | 0x80)
        if (code & 0x8000)
            OUT2(code & 0xFF) /* MSB set: CP949 */
        else
            OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
        NEXT(1, 2)
    }

    return 0;
}
/*
 * CP949 decoder.  Bytes below 0x80 decode to themselves; any other byte
 * starts a two-byte code that is tried first against the ksx1001 map
 * (with the high bit of both bytes flipped) and then against the cp949ext
 * map (bytes unchanged).  Returns 2 when neither lookup succeeds, and 0
 * once the loop over the input is done.
 */
DECODER(cp949)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        RESERVE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        RESERVE_INBUF(2)
        /* TRYMAP_DEC acts as an if-head: ';' is its empty success branch */
        TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
        else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
        else return 2;

        NEXT(2, 1)
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registration block for the cp949 codec.
 * NOTE(review): MAPOPEN/IMPORTMAP_* are defined in codecentry.h (not
 * visible here); presumably they fetch the listed maps from the ko_KR map
 * data module -- confirm.
 */
BEGIN_CODEC_REGISTRY(cp949)
    MAPOPEN(ko_KR)
        IMPORTMAP_DEC(ksx1001)
        IMPORTMAP_DEC(cp949ext)
        IMPORTMAP_ENC(cp949)
    MAPCLOSE()
END_CODEC_REGISTRY(cp949)
/*
* _cp950.c: the CP950 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _cp950.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(big5)
ENCMAP(cp950ext)
DECMAP(big5)
DECMAP(cp950ext)
/*
 * CP950 encoder.  Code points below 0x80 are written as one byte; other
 * characters are looked up first in the cp950ext encode map and then in
 * the big5 encode map, and written as a two-byte code.  Returns 1 when
 * neither map has the character, and 0 once the loop over the input is
 * done.
 */
ENCODER(cp950)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
            continue;
        }
        UCS4INVALID(c)

        RESERVE_OUTBUF(2)
        /* TRYMAP_ENC acts as an if-head: ';' is its empty success branch */
        TRYMAP_ENC(cp950ext, code, c);
        else TRYMAP_ENC(big5, code, c);
        else return 1;

        OUT1(code >> 8)   /* lead byte */
        OUT2(code & 0xFF) /* trail byte */
        NEXT(1, 2)
    }

    return 0;
}
/*
 * CP950 decoder.  Bytes below 0x80 decode to themselves; any other byte
 * starts a two-byte code that is looked up first in the cp950ext decode
 * map and then in the big5 decode map.  Returns 2 when neither lookup
 * succeeds, and 0 once the loop over the input is done.
 */
DECODER(cp950)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        RESERVE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        RESERVE_INBUF(2)

        /* TRYMAP_DEC acts as an if-head: ';' is its empty success branch */
        TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
        else TRYMAP_DEC(big5, **outbuf, c, IN2);
        else return 2;

        NEXT(2, 1)
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registration block for the cp950 codec.
 * NOTE(review): MAPOPEN/IMPORTMAP_* are defined in codecentry.h (not
 * visible here); presumably they fetch the listed maps from the zh_TW map
 * data module -- confirm.
 */
BEGIN_CODEC_REGISTRY(cp950)
    MAPOPEN(zh_TW)
        IMPORTMAP_ENCDEC(big5)
        IMPORTMAP_ENCDEC(cp950ext)
    MAPCLOSE()
END_CODEC_REGISTRY(cp950)
/*
* _euc_jisx0213.c: the EUC-JISX0213 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _euc_jisx0213.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#define USING_BINARY_PAIR_SEARCH
#include "codeccommon.h"
#include "map_jisx0213_pairs.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)
#define EMPBASE 0x20000
/*
 * EUC-JISX0213 encoder.  Code points below 0x80 are written as one byte.
 * BMP characters are tried against the jisx0213_bmp map, then the
 * jisxcommon map, then a few special cases; characters in the plane
 * selected by EMPBASE are tried against the jisx0213_emp map.  The
 * resulting code is written as two bytes, or as 0x8F plus two bytes when
 * its top bit is set.
 *
 * A jisx0213_bmp result of MULTIC means the character has to be resolved
 * through the pair table together with the character that follows it.
 *
 * Returns 1 or insize for a character that cannot be encoded,
 * MBERR_TOOFEW when a MULTIC character is the last one available and
 * MBENC_FLUSH is not set, and 0 once the loop over the input is done.
 * NOTE(review): DECODE_SURROGATE/GET_INSIZE come from codeccommon.h (not
 * visible here); presumably insize is the number of input units consumed.
 */
ENCODER(euc_jisx0213)
{
    while (inleft > 0) {
        ucs4_t c = IN1;
        DBCHAR code;
        int insize;

        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
            continue;
        }

        DECODE_SURROGATE(c)
        insize = GET_INSIZE(c);

        if (c <= 0xFFFF) {
            /* try 0213 first because it might have MULTIC */
            TRYMAP_ENC(jisx0213_bmp, code, c) {
                if (code == MULTIC) {
                    if (inleft < 2) {
                        /* no following character is available */
                        if (flags & MBENC_FLUSH) {
                            /* look the character up on its own */
                            code = find_pairencmap(c, 0, jisx0213_pairencmap,
                                                   JISX0213_ENCPAIRS);
                            if (code == DBCINV)
                                return 1;
                        } else
                            return MBERR_TOOFEW;
                    } else {
                        /* try the pair first, then the character alone */
                        code = find_pairencmap(c, (*inbuf)[1],
                                    jisx0213_pairencmap, JISX0213_ENCPAIRS);
                        if (code == DBCINV) {
                            code = find_pairencmap(c, 0, jisx0213_pairencmap,
                                                   JISX0213_ENCPAIRS);
                            if (code == DBCINV)
                                return 1;
                        } else
                            insize = 2; /* the pair uses both characters */
                    }
                }
            } else TRYMAP_ENC(jisxcommon, code, c);
            else if (c >= 0xff61 && c <= 0xff9f) {
                /* JIS X 0201 half-width katakana */
                WRITE2(0x8e, c - 0xfec0)
                NEXT(1, 2)
                continue;
            } else if (c == 0xff3c)
                /* F/W REVERSE SOLIDUS (see NOTES.euc-jisx0213) */
                code = 0x2140;
            else if (c == 0xff5e)
                /* F/W TILDE (see NOTES.euc-jisx0213) */
                code = 0x2232;
            else
                return 1;
        } else if (c >> 16 == EMPBASE >> 16) {
            /* same plane as EMPBASE: look up the low 16 bits */
            TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
            else return insize;
        } else
            return insize;

        if (code & 0x8000) {
            /* Codeset 2 */
            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
            NEXT(insize, 3)
        } else {
            /* Codeset 1 */
            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
            NEXT(insize, 2)
        }
    }

    return 0;
}
/*
 * EUC-JISX0213 decoder.  Bytes below 0x80 decode to themselves.  0x8E
 * introduces a half-width katakana byte, 0x8F introduces a three-byte
 * code (JIS X 0213 Plane 2 or JIS X 0212), and any other byte starts a
 * two-byte code.  Returns 2 or 3 (the length of the offending sequence)
 * when a code cannot be decoded, and 0 once the loop over the input is
 * done.
 * NOTE(review): PUTUCS4/NEXT_IN come from codeccommon.h (not visible
 * here); presumably PUTUCS4 stores the character and advances the output
 * itself, which is why only NEXT_IN follows it.
 */
DECODER(euc_jisx0213)
{
    while (inleft > 0) {
        unsigned char c = IN1;
        ucs4_t code;

        RESERVE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        if (c == 0x8e) {
            /* JIS X 0201 half-width katakana */
            unsigned char c2;

            RESERVE_INBUF(2)
            c2 = IN2;
            if (c2 >= 0xa1 && c2 <= 0xdf) {
                OUT1(0xfec0 + c2)
                NEXT(2, 1)
            } else
                return 2;
        } else if (c == 0x8f) {
            unsigned char c2, c3;

            RESERVE_INBUF(3)
            /* flip the high bit of both trailing bytes for the lookups */
            c2 = IN2 ^ 0x80;
            c3 = IN3 ^ 0x80;

            /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES.euc-jisx0213) */
            TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
            else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
                PUTUCS4(EMPBASE | code)
                NEXT_IN(3)
                continue;
            } else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
            else return 3;
            NEXT(3, 1)
        } else {
            unsigned char c2;

            RESERVE_INBUF(2)
            /* flip the high bit of both bytes for the lookups */
            c ^= 0x80;
            c2 = IN2 ^ 0x80;

            /* JIS X 0213 Plane 1 */
            /* 0x2140 and 0x2232 are special-cased to U+FF3C and U+FF5E */
            if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
            else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
            else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
            else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
            else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
                PUTUCS4(EMPBASE | code)
                NEXT_IN(2)
                continue;
            } else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
                /* one code decodes to two characters */
                WRITE2(code >> 16, code & 0xffff)
                NEXT(2, 2)
                continue;
            } else return 2;
            NEXT(2, 1)
        }
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registration block for the euc_jisx0213 codec.
 * NOTE(review): MAPOPEN/IMPORTMAP_* are defined in codecentry.h (not
 * visible here); presumably they fetch the listed maps from the ja_JP map
 * data module -- confirm.
 */
BEGIN_CODEC_REGISTRY(euc_jisx0213)
    MAPOPEN(ja_JP)
        IMPORTMAP_ENC(jisxcommon)
        IMPORTMAP_DEC(jisx0208)
        IMPORTMAP_DEC(jisx0212)
        IMPORTMAP_ENC(jisx0213_bmp)
        IMPORTMAP_DEC(jisx0213_1_bmp)
        IMPORTMAP_DEC(jisx0213_2_bmp)
        IMPORTMAP_ENC(jisx0213_emp)
        IMPORTMAP_DEC(jisx0213_1_emp)
        IMPORTMAP_DEC(jisx0213_2_emp)
    MAPCLOSE()
END_CODEC_REGISTRY(euc_jisx0213)
/*
* _euc_jp.c: the EUC-JP codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _euc_jp.c,v 1.5 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
/*
 * EUC-JP encoder.
 *
 * Characters below 0x80 are written as one byte.  Other characters are
 * looked up in the jisxcommon table; if that fails, U+FF61..U+FF9F is
 * written as 0x8e plus one byte, and (unless STRICT_BUILD is defined)
 * U+FF3C, U+00A5 and U+203E get dedicated fallbacks.  A table code with
 * bit 15 set is written as a three-byte 0x8f sequence, otherwise as two
 * bytes with bit 7 set.  Returns 0 when the input is exhausted and 1 when
 * a character cannot be encoded.
 */
ENCODER(euc_jp)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
DBCHAR code;
if (c < 0x80) {
WRITE1(c)
NEXT(1, 1)
continue;
}
UCS4INVALID(c)
/* The lookup macro opens an if/else chain that continues through the
 * preprocessor block below and ends at the final "else return 1". */
TRYMAP_ENC(jisxcommon, code, c);
else if (c >= 0xff61 && c <= 0xff9f) {
/* JIS X 0201 half-width katakana */
WRITE2(0x8e, c - 0xfec0)
NEXT(1, 2)
continue;
}
#ifndef STRICT_BUILD
else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
code = 0x2140;
else if (c == 0xa5) { /* YEN SIGN */
WRITE1(0x5c);
NEXT(1, 1)
continue;
} else if (c == 0x203e) { /* OVERLINE */
WRITE1(0x7e);
NEXT(1, 1)
continue;
}
#endif
else
return 1;
/* Bit 15 of the table code selects the three-byte form. */
if (code & 0x8000) {
/* JIS X 0212 */
WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
NEXT(1, 3)
} else {
/* JIS X 0208 */
WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
NEXT(1, 2)
}
}
return 0;
}
/*
 * EUC-JP decoder.
 *
 * Bytes below 0x80 are emitted unchanged.  Lead byte 0x8e starts a
 * two-byte half-width katakana sequence, 0x8f a three-byte JIS X 0212
 * sequence, and any other lead byte a two-byte JIS X 0208 sequence.
 * Returns 0 when the input is exhausted, or 2 / 3 when a two- / three-byte
 * sequence cannot be mapped.
 */
DECODER(euc_jp)
{
while (inleft > 0) {
unsigned char c = IN1;
RESERVE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
continue;
}
if (c == 0x8e) {
/* JIS X 0201 half-width katakana */
unsigned char c2;
RESERVE_INBUF(2)
c2 = IN2;
if (c2 >= 0xa1 && c2 <= 0xdf) {
/* 0xa1..0xdf is shifted to 0xff61..0xff9f. */
OUT1(0xfec0 + c2)
NEXT(2, 1)
} else
return 2;
} else if (c == 0x8f) {
unsigned char c2, c3;
RESERVE_INBUF(3)
c2 = IN2;
c3 = IN3;
/* JIS X 0212 */
/* Both trail bytes have bit 7 toggled for the table lookup. */
TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
NEXT(3, 1)
} else
return 3;
} else {
unsigned char c2;
RESERVE_INBUF(2)
c2 = IN2;
/* JIS X 0208 */
#ifndef STRICT_BUILD
/* Non-strict builds map 0xa1 0xc0 directly, ahead of the table. */
if (c == 0xa1 && c2 == 0xc0) /* FULL-WIDTH REVERSE SOLIDUS */
**outbuf = 0xff3c;
else
#endif
TRYMAP_DEC(jisx0208, **outbuf, c ^ 0x80, c2 ^ 0x80) ;
else return 2;
NEXT(2, 1)
}
}
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the euc_jp codec: imports the jisx0208 and jisx0212
 * decode maps and the jisxcommon encode map under the ja_JP map group.
 */
BEGIN_CODEC_REGISTRY(euc_jp)
MAPOPEN(ja_JP)
IMPORTMAP_DEC(jisx0208)
IMPORTMAP_DEC(jisx0212)
IMPORTMAP_ENC(jisxcommon)
MAPCLOSE()
END_CODEC_REGISTRY(euc_jp)
/*
* _euc_kr.c: the EUC-KR codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _euc_kr.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(cp949)
DECMAP(ksx1001)
/*
 * EUC-KR encoder.
 *
 * Characters below 0x80 are written as one byte.  Every other character
 * is looked up in the cp949 encode table; a hit whose code has bit 15
 * clear is written as two bytes with bit 7 set.  Returns 0 when the input
 * is exhausted and 1 when a character has no mapping or maps to a code
 * with bit 15 set (a CP949-only code).
 */
ENCODER(euc_kr)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
        } else {
            UCS4INVALID(c)
            RESERVE_OUTBUF(2)
            TRYMAP_ENC(cp949, code, c) {
                if (code & 0x8000) /* MSB set: CP949 */
                    return 1;
                OUT1((code >> 8) | 0x80)
                OUT2((code & 0xFF) | 0x80)
                NEXT(1, 2)
            } else
                return 1;
        }
    }

    return 0;
}
/*
 * EUC-KR decoder.
 *
 * Bytes below 0x80 are emitted unchanged.  Any other byte starts a
 * two-byte sequence; both bytes have bit 7 toggled and are looked up in
 * the ksx1001 decode table.  Returns 0 when the input is exhausted and 2
 * when the two-byte sequence has no mapping.
 */
DECODER(euc_kr)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        RESERVE_OUTBUF(1)
        if (c >= 0x80) {
            RESERVE_INBUF(2)
            TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
                NEXT(2, 1)
            } else
                return 2;
        } else {
            OUT1(c)
            NEXT(1, 1)
        }
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registry section for the euc_kr codec: imports the ksx1001 decode map
 * and the cp949 encode map under the ko_KR map group.
 */
BEGIN_CODEC_REGISTRY(euc_kr)
MAPOPEN(ko_KR)
IMPORTMAP_DEC(ksx1001)
IMPORTMAP_ENC(cp949)
MAPCLOSE()
END_CODEC_REGISTRY(euc_kr)
/*
* _gb18030.c: the GB18030 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _gb18030.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
#include "tweak_gbk.h"
#include "map_gb18030uni.h"
ENCMAP(gbcommon)
ENCMAP(gb18030ext)
DECMAP(gb2312)
DECMAP(gbkext)
DECMAP(gb18030ext)
/*
 * GB18030 encoder.
 *
 * Characters below 0x80 are written as one byte.  Code points from
 * 0x10000 to 0x10FFFF are written as a computed four-byte sequence.
 * Other characters are tried against GBK_PREENCODE, the gbcommon table
 * and the gb18030ext table (two-byte output); if all fail, the
 * gb18030_to_unibmp_ranges table is scanned and a four-byte sequence is
 * computed from the matching range.  Returns 0 when the input is
 * exhausted; returns 1 (or 2 for a surrogate pair on 16-bit Py_UNICODE
 * builds) for a code point above 0x10FFFF; sets RuntimeError and returns
 * 1 when no range matches.
 */
ENCODER(gb18030)
{
while (inleft > 0) {
ucs4_t c = IN1;
DBCHAR code;
if (c < 0x80) {
WRITE1(c)
NEXT(1, 1)
continue;
}
DECODE_SURROGATE(c)
if (c > 0x10FFFF)
#if Py_UNICODE_SIZE == 2
return 2; /* surrogate pair */
#else
return 1;
#endif
else if (c >= 0x10000) {
/* Four-byte form: the offset from 0x10000 is split, least significant
 * first, into digits of radix 10, 126, 10 and a lead byte from 0x90. */
ucs4_t tc = c - 0x10000;
RESERVE_OUTBUF(4)
OUT4((unsigned char)(tc % 10) + 0x30)
tc /= 10;
OUT3((unsigned char)(tc % 126) + 0x81)
tc /= 126;
OUT2((unsigned char)(tc % 10) + 0x30)
tc /= 10;
OUT1((unsigned char)(tc + 0x90))
#if Py_UNICODE_SIZE == 2
NEXT(2, 4) /* surrogate pair */
#else
NEXT(1, 4)
#endif
continue;
}
RESERVE_OUTBUF(2)
GBK_PREENCODE(c, code)
else TRYMAP_ENC(gbcommon, code, c);
else TRYMAP_ENC(gb18030ext, code, c);
else {
/* No two-byte mapping: look for the range containing c.  The table is
 * terminated by an entry whose "first" field is 0. */
const struct _gb18030_to_unibmp_ranges *utrrange;
RESERVE_OUTBUF(4)
for (utrrange = gb18030_to_unibmp_ranges;
utrrange->first != 0;
utrrange++)
if (utrrange->first <= c && c <= utrrange->last) {
Py_UNICODE tc;
/* Linear index within the four-byte space, then the same digit
 * split as above but with a lead byte starting at 0x81. */
tc = c - utrrange->first + utrrange->base;
OUT4((unsigned char)(tc % 10) + 0x30)
tc /= 10;
OUT3((unsigned char)(tc % 126) + 0x81)
tc /= 126;
OUT2((unsigned char)(tc % 10) + 0x30)
tc /= 10;
OUT1((unsigned char)tc + 0x81)
NEXT(1, 4)
break;
}
/* Reaching the terminator means no range matched. */
if (utrrange->first == 0) {
PyErr_SetString(PyExc_RuntimeError,
"unicode mapping invalid");
return 1;
}
continue;
}
/* Two-byte output: the lead byte always gets bit 7 set; the trail byte
 * is left as-is when bit 15 of the code is set. */
OUT1((code >> 8) | 0x80)
if (code & 0x8000)
OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
else
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
NEXT(1, 2)
}
return 0;
}
/*
 * GB18030 decoder.
 *
 * Bytes below 0x80 are emitted unchanged.  When the second byte is an
 * ASCII digit (0x30..0x39) the input is treated as a four-byte sequence,
 * which is validated, converted to a linear index and mapped either
 * through gb18030_to_unibmp_ranges (BMP) or arithmetically (U+10000 and
 * above).  Otherwise the two bytes are tried against GBK_PREDECODE and
 * the gb2312, gbkext and gb18030ext decode tables.
 *
 * Returns 0 when the input is exhausted, 4 for an invalid or unmappable
 * four-byte sequence and 2 for an unmappable two-byte sequence.
 */
DECODER(gb18030)
{
    while (inleft > 0) {
        unsigned char c = IN1, c2;

        RESERVE_OUTBUF(1)
        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        RESERVE_INBUF(2)
        c2 = IN2;
        if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
            const struct _gb18030_to_unibmp_ranges *utr;
            unsigned char c3, c4;
            ucs4_t lseq;

            RESERVE_INBUF(4)
            c3 = IN3;
            c4 = IN4;
            /* Bytes 1 and 3 must lie in 0x81..0xFE and bytes 2 and 4 in
             * 0x30..0x39.  Without the upper bounds a third byte of 0xFF
             * becomes digit 126 below and silently aliases a different,
             * valid sequence. */
            if (c < 0x81 || c > 0xfe ||
                c3 < 0x81 || c3 > 0xfe ||
                c4 < 0x30 || c4 > 0x39)
                return 4;
            c -= 0x81;  c2 -= 0x30;
            c3 -= 0x81; c4 -= 0x30;

            if (c < 4) { /* U+0080 - U+FFFF */
                lseq = ((ucs4_t)c * 10 + c2) * 1260 +
                       (ucs4_t)c3 * 10 + c4;
                if (lseq < 39420) {
                    /* Advance to the last range whose base is <= lseq. */
                    for (utr = gb18030_to_unibmp_ranges;
                         lseq >= (utr + 1)->base;
                         utr++) ;
                    OUT1(utr->first - utr->base + lseq)
                    NEXT(4, 1)
                    continue;
                }
            }
            else if (c >= 15) { /* U+10000 - U+10FFFF */
                lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2) * 1260 +
                       (ucs4_t)c3 * 10 + c4;
                if (lseq <= 0x10FFFF) {
                    PUTUCS4(lseq);
                    NEXT_IN(4)
                    continue;
                }
            }
            return 4;
        }

        GBK_PREDECODE(c, c2, **outbuf)
        else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, c2 ^ 0x80);
        else TRYMAP_DEC(gbkext, **outbuf, c, c2);
        else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
        else return 2;

        NEXT(2, 1)
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registry section for the gb18030 codec: imports the gb2312 and gbkext
 * decode maps, the gbcommon encode map and the gb18030ext map in both
 * directions, all under the zh_CN map group.
 */
BEGIN_CODEC_REGISTRY(gb18030)
MAPOPEN(zh_CN)
IMPORTMAP_DEC(gb2312)
IMPORTMAP_DEC(gbkext)
IMPORTMAP_ENC(gbcommon)
IMPORTMAP_ENCDEC(gb18030ext)
MAPCLOSE()
END_CODEC_REGISTRY(gb18030)
/*
* _gb2312.c: the GB2312 codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _gb2312.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(gbcommon)
DECMAP(gb2312)
/*
 * GB2312 encoder.
 *
 * Characters below 0x80 are written as one byte.  Every other character
 * is looked up in the gbcommon encode table; a hit whose code has bit 15
 * clear is written as two bytes with bit 7 set.  Returns 0 when the input
 * is exhausted and 1 when a character has no mapping or maps to a code
 * with bit 15 set (a GBK-only code).
 */
ENCODER(gb2312)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            WRITE1(c)
            NEXT(1, 1)
        } else {
            UCS4INVALID(c)
            RESERVE_OUTBUF(2)
            TRYMAP_ENC(gbcommon, code, c) {
                if (code & 0x8000) /* MSB set: GBK */
                    return 1;
                OUT1((code >> 8) | 0x80)
                OUT2((code & 0xFF) | 0x80)
                NEXT(1, 2)
            } else
                return 1;
        }
    }

    return 0;
}
/*
 * GB2312 decoder.
 *
 * Bytes below 0x80 are emitted unchanged.  Any other byte starts a
 * two-byte sequence; both bytes have bit 7 toggled and are looked up in
 * the gb2312 decode table.  Returns 0 when the input is exhausted and 2
 * when the two-byte sequence has no mapping.
 */
DECODER(gb2312)
{
    while (inleft > 0) {
        unsigned char c = **inbuf;

        RESERVE_OUTBUF(1)
        if (c >= 0x80) {
            RESERVE_INBUF(2)
            TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
                NEXT(2, 1)
            } else
                return 2;
        } else {
            OUT1(c)
            NEXT(1, 1)
        }
    }

    return 0;
}
#include "codecentry.h"
/*
 * Registry section for the gb2312 codec: imports the gb2312 decode map and
 * the gbcommon encode map under the zh_CN map group.
 */
BEGIN_CODEC_REGISTRY(gb2312)
MAPOPEN(zh_CN)
IMPORTMAP_DEC(gb2312)
IMPORTMAP_ENC(gbcommon)
MAPCLOSE()
END_CODEC_REGISTRY(gb2312)
/*
* _gbk.c: the GBK codec
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _gbk.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
#include "tweak_gbk.h"
ENCMAP(gbcommon)
DECMAP(gb2312)
DECMAP(gbkext)
/*
 * GBK encoder.
 *
 * Characters below 0x80 are written as one byte.  Other characters are
 * tried against GBK_PREENCODE (from tweak_gbk.h) and then the gbcommon
 * encode table, and written as two bytes.  Returns 0 when the input is
 * exhausted and 1 when a character cannot be encoded.
 */
ENCODER(gbk)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
DBCHAR code;
if (c < 0x80) {
WRITE1(c)
NEXT(1, 1)
continue;
}
UCS4INVALID(c)
RESERVE_OUTBUF(2)
/* GBK_PREENCODE opens an if/else chain continued by the lookups below. */
GBK_PREENCODE(c, code)
else TRYMAP_ENC(gbcommon, code, c);
else return 1;
/* The lead byte always gets bit 7 set; the trail byte is left as-is
 * when bit 15 of the code is set. */
OUT1((code >> 8) | 0x80)
if (code & 0x8000)
OUT2((code & 0xFF)) /* MSB set: GBK */
else
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
NEXT(1, 2)
}
return 0;
}
/*
 * GBK decoder.
 *
 * Bytes below 0x80 are emitted unchanged.  Any other byte starts a
 * two-byte sequence that is tried against GBK_PREDECODE (from
 * tweak_gbk.h), the gb2312 table (with bit 7 of both bytes toggled) and
 * the gbkext table (raw bytes).  Returns 0 when the input is exhausted
 * and 2 when the sequence has no mapping.
 */
DECODER(gbk)
{
while (inleft > 0) {
unsigned char c = IN1;
RESERVE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
continue;
}
RESERVE_INBUF(2)
/* GBK_PREDECODE opens an if/else chain continued by the lookups below. */
GBK_PREDECODE(c, IN2, **outbuf)
else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(gbkext, **outbuf, c, IN2);
else return 2;
NEXT(2, 1)
}
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the gbk codec: imports the gb2312 and gbkext decode
 * maps and the gbcommon encode map under the zh_CN map group.
 */
BEGIN_CODEC_REGISTRY(gbk)
MAPOPEN(zh_CN)
IMPORTMAP_DEC(gb2312)
IMPORTMAP_DEC(gbkext)
IMPORTMAP_ENC(gbcommon)
MAPCLOSE()
END_CODEC_REGISTRY(gbk)
/*
* _hz.c: the HZ codec (RFC1843)
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _hz.c,v 1.2 2003/12/31 05:46:55 perky Exp $
*/
#include "codeccommon.h"
ENCMAP(gbcommon)
DECMAP(gb2312)
#define HAVE_ENCODER_INIT
/* Initializes the HZ encoder state: state->i = 0 is the ASCII mode the
 * encoder starts in (it is set to 1 after "~{" is written).  Returns 0. */
ENCODER_INIT(hz)
{
state->i = 0;
return 0;
}
#define HAVE_ENCODER_RESET
/* Resets the HZ encoder: if the state is not ASCII mode (state->i != 0),
 * writes the two-byte "~}" sequence and returns to ASCII mode.  Returns 0. */
ENCODER_RESET(hz)
{
if (state->i != 0) {
WRITE2('~', '}')
state->i = 0;
NEXT_OUT(2)
}
return 0;
}
/*
 * HZ encoder.
 *
 * state->i tracks the current mode: 0 for ASCII, 1 for GB.  Characters
 * below 0x80 are written directly in ASCII mode, or preceded by "~}" when
 * leaving GB mode.  Other characters are looked up in the gbcommon table
 * and written as two bytes, preceded by "~{" when entering GB mode.
 * Returns 0 when the input is exhausted and 1 when a character has no
 * mapping or maps to a code with bit 15 set.
 * NOTE(review): a literal '~' character takes the plain single-byte path
 * here and is not doubled to "~~" -- confirm whether that is intended.
 */
ENCODER(hz)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
DBCHAR code;
if (c < 0x80) {
if (state->i == 0) {
WRITE1(c)
NEXT(1, 1)
} else {
/* Leave GB mode and emit the character in one write. */
WRITE3('~', '}', c)
NEXT(1, 3)
state->i = 0;
}
continue;
}
UCS4INVALID(c)
TRYMAP_ENC(gbcommon, code, c);
else return 1;
if (code & 0x8000) /* MSB set: GBK */
return 1;
if (state->i == 0) {
/* Enter GB mode and emit the two code bytes in one write. */
WRITE4('~', '{', code >> 8, code & 0xff)
NEXT(1, 4)
state->i = 1;
} else {
WRITE2(code >> 8, code & 0xff)
NEXT(1, 2)
}
}
return 0;
}
#define HAVE_DECODER_INIT
/* Initializes the HZ decoder state to ASCII mode (state->i = 0).
 * Returns 0. */
DECODER_INIT(hz)
{
state->i = 0;
return 0;
}
#define HAVE_DECODER_RESET
/* Resets the HZ decoder state to ASCII mode (state->i = 0).  Returns 0. */
DECODER_RESET(hz)
{
state->i = 0;
return 0;
}
DECODER(hz)
{
while (inleft > 0) {
unsigned char c = IN1;
if (c == '~') {
unsigned char c2 = IN2;
RESERVE_INBUF(2)
if (c2 == '~') {
WRITE1('~')
NEXT(2, 1)
continue;
} else if (c2 == '{' && state->i == 0)
state->i = 1; /* set GB */
else if (c2 == '}' && state->i == 1)
state->i = 0; /* set ASCII */
else if (c2 == '\n')
; /* line-continuation */
else
return 2;
NEXT(2, 0);
continue;
}
if (c & 0x80)
return 1;
if (state->i == 0) { /* ASCII mode */
WRITE1(c)
NEXT(1, 1)
} else { /* GB mode */
RESERVE_INBUF(2)
RESERVE_OUTBUF(1)
TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
NEXT(2, 1)
} else
return 2;
}
}
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the hz codec: imports the gb2312 decode map and the
 * gbcommon encode map under the zh_CN map group.
 */
BEGIN_CODEC_REGISTRY(hz)
MAPOPEN(zh_CN)
IMPORTMAP_DEC(gb2312)
IMPORTMAP_ENC(gbcommon)
MAPCLOSE()
END_CODEC_REGISTRY(hz)
/*
* _iso2022_jp.c: the ISO-2022-JP codec (RFC1468)
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _iso2022_jp.c,v 1.7 2003/12/31 05:46:55 perky Exp $
*/
#define ISO2022_DESIGNATIONS \
CHARSET_ASCII, CHARSET_JISX0201_R, CHARSET_JISX0208, CHARSET_JISX0208_O
#define ISO2022_NO_SHIFT
#define ISO2022_USE_JISX0208EXT
#include "codeccommon.h"
#include "iso2022common.h"
#include "alg_jisx0201.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
#define HAVE_ENCODER_INIT
/* Initializes the ISO-2022-JP encoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
ENCODER_INIT(iso2022_jp)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_ENCODER_RESET
/* Resets the ISO-2022-JP encoder: if G0 is not CHARSET_ASCII, writes the
 * three-byte ESC ( B sequence and sets G0 back to CHARSET_ASCII.
 * Returns 0. */
ENCODER_RESET(iso2022_jp)
{
if (STATE_GETG0(state) != CHARSET_ASCII) {
RESERVE_OUTBUF(3)
WRITE3(ESC, '(', 'B')
STATE_SETG0(state, CHARSET_ASCII)
NEXT_OUT(3)
}
return 0;
}
/* ISO-2022-JP changes designations instead of shifting-out */
/*
 * ISO-2022-JP encoder.
 *
 * Characters below 0x80 are written in the current G0 set when that is
 * ASCII (or JIS X 0201 Roman and the macro maps them); otherwise ESC ( B
 * is written first and G0 becomes ASCII.  Other characters are tried, in
 * order, against JIS X 0201 Roman (only when it is already designated),
 * the jisxcommon table (JIS X 0208 codes only), the U+FF3C special case
 * and finally JIS X 0201 Roman with a new ESC ( J designation.
 * Returns 0 when the input is exhausted and 1 when a character cannot be
 * encoded.
 */
ENCODER(iso2022_jp)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
DBCHAR code;
if (c < 0x80) {
switch (STATE_GETG0(state)) {
case CHARSET_ASCII:
WRITE1(c)
NEXT(1, 1)
break;
case CHARSET_JISX0201_R:
/* The "default:" label sits inside this else-branch on purpose, so that
 * a failed JIS X 0201 lookup and every other G0 set share the switch
 * back to ASCII below. */
JISX0201_R_ENCODE(c, code)
else { /* FALLTHROUGH (yay!) */
default:
WRITE3(ESC, '(', 'B')
NEXT_OUT(3)
STATE_SETG0(state, CHARSET_ASCII)
code = c;
}
WRITE1(code)
NEXT(1, 1)
break;
}
if (c == '\n')
STATE_CLEARFLAG(state, F_SHIFTED)
}
else UCS4INVALID(c)
else {
unsigned char charset;
charset = STATE_GETG0(state);
if (charset == CHARSET_JISX0201_R) {
/* DBCINV is used as a sentinel to detect whether the macro assigned. */
code = DBCINV;
JISX0201_R_ENCODE(c, code)
if (code != DBCINV) {
WRITE1(code)
NEXT(1, 1)
continue;
}
}
TRYMAP_ENC(jisxcommon, code, c) {
if (code & 0x8000) /* MSB set: JIS X 0212 */
return 1;
/* Jump target for the U+FF3C special case below. */
jisx0208encode: if (charset != CHARSET_JISX0208) {
WRITE3(ESC, '$', 'B')
STATE_SETG0(state, CHARSET_JISX0208)
NEXT_OUT(3)
}
WRITE2(code >> 8, code & 0xff)
NEXT(1, 2)
} else if (c == 0xff3c) { /* FULL-WIDTH REVERSE SOLIDUS */
code = 0x2140;
goto jisx0208encode;
} else {
JISX0201_R_ENCODE(c, code)
else
return 1;
/* if (charset == CHARSET_JISX0201_R) : already checked */
WRITE4(ESC, '(', 'J', code)
STATE_SETG0(state, CHARSET_JISX0201_R)
NEXT(1, 4)
}
}
}
return 0;
}
#define HAVE_DECODER_INIT
/* Initializes the ISO-2022-JP decoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
DECODER_INIT(iso2022_jp)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_DECODER_RESET
/* Resets the ISO-2022-JP decoder by clearing the F_SHIFTED state flag;
 * the G0/G1 designations are not touched here.  Returns 0. */
DECODER_RESET(iso2022_jp)
{
STATE_CLEARFLAG(state, F_SHIFTED)
return 0;
}
/*
 * ISO-2022-JP decoder.
 *
 * The loop scaffolding and the current byte "c" come from the
 * ISO2022_LOOP_BEGIN / ISO2022_LOOP_END macros (iso2022common.h, not
 * visible here; presumably they also handle escape sequences).  Each
 * byte is decoded by the charset reported by ISO2022_GETCHARSET:
 * double-byte sets through the jisx0208 table, ASCII unchanged, and
 * JIS X 0201 Roman through JISX0201_R_DECODE.  Returns 0 when the loop
 * finishes, 2 / 1 for an unmappable double- / single-byte sequence and
 * MBERR_INTERNAL for an unexpected charset.
 */
DECODER(iso2022_jp)
{
ISO2022_LOOP_BEGIN
unsigned char charset, c2;
ISO2022_GETCHARSET(charset, c)
if (charset & CHARSET_DOUBLEBYTE) {
/* all double byte character sets are in JIS X 0208 here.
 * this means that we don't distinguish :1978 from :1983. */
RESERVE_INBUF(2)
RESERVE_OUTBUF(1)
c2 = IN2;
if (c == 0x21 && c2 == 0x40) { /* FULL-WIDTH REVERSE SOLIDUS */
**outbuf = 0xff3c;
NEXT(2, 1)
} else TRYMAP_DEC(jisx0208, **outbuf, c, c2) {
NEXT(2, 1)
} else
return 2;
} else if (charset == CHARSET_ASCII) {
RESERVE_OUTBUF(1)
OUT1(c)
NEXT(1, 1)
} else if (charset == CHARSET_JISX0201_R) {
RESERVE_OUTBUF(1)
JISX0201_R_DECODE(c, **outbuf)
else
return 1;
NEXT(1, 1)
} else
return MBERR_INTERNAL;
ISO2022_LOOP_END
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the iso2022_jp codec: imports the jisx0208 decode
 * map and the jisxcommon encode map under the ja_JP map group.
 */
BEGIN_CODEC_REGISTRY(iso2022_jp)
MAPOPEN(ja_JP)
IMPORTMAP_DEC(jisx0208)
IMPORTMAP_ENC(jisxcommon)
MAPCLOSE()
END_CODEC_REGISTRY(iso2022_jp)
/*
* _iso2022_jp_1.c: the ISO-2022-JP-1 codec (RFC2237)
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _iso2022_jp_1.c,v 1.8 2003/12/31 05:46:55 perky Exp $
*/
#define ISO2022_DESIGNATIONS \
CHARSET_ASCII, CHARSET_JISX0201_R, CHARSET_JISX0208, \
CHARSET_JISX0208_O, CHARSET_JISX0212
#define ISO2022_NO_SHIFT
#define ISO2022_USE_JISX0208EXT
#include "codeccommon.h"
#include "iso2022common.h"
#include "alg_jisx0201.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
#define HAVE_ENCODER_INIT
/* Initializes the ISO-2022-JP-1 encoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
ENCODER_INIT(iso2022_jp_1)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_ENCODER_RESET
/* Resets the ISO-2022-JP-1 encoder: if G0 is not CHARSET_ASCII, writes the
 * three-byte ESC ( B sequence and sets G0 back to CHARSET_ASCII.
 * Returns 0. */
ENCODER_RESET(iso2022_jp_1)
{
if (STATE_GETG0(state) != CHARSET_ASCII) {
RESERVE_OUTBUF(3)
WRITE3(ESC, '(', 'B')
STATE_SETG0(state, CHARSET_ASCII)
NEXT_OUT(3)
}
return 0;
}
/* ISO-2022-JP-1 changes designations instead of shifting-out */
/*
 * ISO-2022-JP-1 encoder.
 *
 * Characters below 0x80 are written in the current G0 set when that is
 * ASCII (or JIS X 0201 Roman and the macro maps them); otherwise ESC ( B
 * is written first and G0 becomes ASCII.  Other characters are tried, in
 * order, against JIS X 0201 Roman (only when it is already designated),
 * the jisxcommon table (JIS X 0212 when bit 15 of the code is set,
 * JIS X 0208 otherwise), the U+FF3C special case and finally
 * JIS X 0201 Roman with a new ESC ( J designation.
 * Returns 0 when the input is exhausted and 1 when a character cannot be
 * encoded.
 */
ENCODER(iso2022_jp_1)
{
while (inleft > 0) {
Py_UNICODE c = **inbuf;
DBCHAR code;
if (c < 0x80) {
switch (STATE_GETG0(state)) {
case CHARSET_ASCII:
WRITE1(c)
NEXT(1, 1)
break;
case CHARSET_JISX0201_R:
/* The "default:" label sits inside this else-branch on purpose, so that
 * a failed JIS X 0201 lookup and every other G0 set share the switch
 * back to ASCII below. */
JISX0201_R_ENCODE(c, code)
else { /* FALLTHROUGH (yay!) */
default:
WRITE3(ESC, '(', 'B')
NEXT_OUT(3)
STATE_SETG0(state, CHARSET_ASCII)
code = c;
}
WRITE1(code)
NEXT(1, 1)
break;
}
if (c == '\n')
STATE_CLEARFLAG(state, F_SHIFTED)
}
else UCS4INVALID(c)
else {
unsigned char charset;
charset = STATE_GETG0(state);
if (charset == CHARSET_JISX0201_R) {
/* DBCINV is used as a sentinel to detect whether the macro assigned. */
code = DBCINV;
JISX0201_R_ENCODE(c, code)
if (code != DBCINV) {
WRITE1(code)
NEXT(1, 1)
continue;
}
}
TRYMAP_ENC(jisxcommon, code, c) {
if (code & 0x8000) { /* MSB set: JIS X 0212 */
if (charset != CHARSET_JISX0212) {
WRITE4(ESC, '$', '(', 'D')
STATE_SETG0(state, CHARSET_JISX0212)
NEXT_OUT(4)
}
/* Both bytes are masked to 7 bits, which drops the marker bit. */
WRITE2((code >> 8) & 0x7f, code & 0x7f)
} else { /* MSB unset: JIS X 0208 */
/* Jump target for the U+FF3C special case below. */
jisx0208encode: if (charset != CHARSET_JISX0208) {
WRITE3(ESC, '$', 'B')
STATE_SETG0(state, CHARSET_JISX0208)
NEXT_OUT(3)
}
WRITE2(code >> 8, code & 0xff)
}
NEXT(1, 2)
} else if (c == 0xff3c) { /* FULL-WIDTH REVERSE SOLIDUS */
code = 0x2140;
goto jisx0208encode;
} else {
JISX0201_R_ENCODE(c, code)
else
return 1;
/* if (charset == CHARSET_JISX0201_R) : already checked */
WRITE4(ESC, '(', 'J', code)
STATE_SETG0(state, CHARSET_JISX0201_R)
NEXT(1, 4)
}
}
}
return 0;
}
#define HAVE_DECODER_INIT
/* Initializes the ISO-2022-JP-1 decoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
DECODER_INIT(iso2022_jp_1)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_DECODER_RESET
/* Resets the ISO-2022-JP-1 decoder by clearing the F_SHIFTED state flag;
 * the G0/G1 designations are not touched here.  Returns 0. */
DECODER_RESET(iso2022_jp_1)
{
STATE_CLEARFLAG(state, F_SHIFTED)
return 0;
}
/*
 * ISO-2022-JP-1 decoder.
 *
 * The loop scaffolding and the current byte "c" come from the
 * ISO2022_LOOP_BEGIN / ISO2022_LOOP_END macros (iso2022common.h, not
 * visible here).  Each byte is decoded by the charset reported by
 * ISO2022_GETCHARSET: JIS X 0208 (either designation) and JIS X 0212
 * through their decode tables, ASCII unchanged, and JIS X 0201 Roman
 * through JISX0201_R_DECODE.  Returns 0 when the loop finishes, 2 / 1 for
 * an unmappable double- / single-byte sequence and MBERR_INTERNAL for an
 * unexpected charset.
 */
DECODER(iso2022_jp_1)
{
ISO2022_LOOP_BEGIN
unsigned char charset, c2;
ISO2022_GETCHARSET(charset, c)
if (charset & CHARSET_DOUBLEBYTE) {
RESERVE_INBUF(2)
RESERVE_OUTBUF(1)
c2 = IN2;
if (charset == CHARSET_JISX0208 || charset == CHARSET_JISX0208_O) {
if (c == 0x21 && c2 == 0x40) /* FULL-WIDTH REVERSE SOLIDUS */
**outbuf = 0xff3c;
else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else return 2;
} else if (charset == CHARSET_JISX0212) {
TRYMAP_DEC(jisx0212, **outbuf, c, c2);
else return 2;
} else
return MBERR_INTERNAL;
/* Shared advance for every successful double-byte lookup above. */
NEXT(2, 1)
} else if (charset == CHARSET_ASCII) {
RESERVE_OUTBUF(1)
OUT1(c)
NEXT(1, 1)
} else if (charset == CHARSET_JISX0201_R) {
RESERVE_OUTBUF(1)
JISX0201_R_DECODE(c, **outbuf)
else
return 1;
NEXT(1, 1)
} else
return MBERR_INTERNAL;
ISO2022_LOOP_END
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the iso2022_jp_1 codec: imports the jisx0208 and
 * jisx0212 decode maps and the jisxcommon encode map under the ja_JP map
 * group.
 */
BEGIN_CODEC_REGISTRY(iso2022_jp_1)
MAPOPEN(ja_JP)
IMPORTMAP_DEC(jisx0208)
IMPORTMAP_DEC(jisx0212)
IMPORTMAP_ENC(jisxcommon)
MAPCLOSE()
END_CODEC_REGISTRY(iso2022_jp_1)
/*
* _iso2022_jp_2.c: the ISO-2022-JP-2 codec (RFC1554)
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _iso2022_jp_2.c,v 1.8 2003/12/31 05:46:55 perky Exp $
*/
#define ISO2022_DESIGNATIONS \
CHARSET_ASCII, CHARSET_JISX0201_R, CHARSET_JISX0208, \
CHARSET_JISX0208_O, CHARSET_JISX0212, CHARSET_GB2312, \
CHARSET_KSX1001, CHARSET_JISX0212, \
CHARSET_ISO8859_1, CHARSET_ISO8859_7
#define ISO2022_USE_G2_DESIGNATION yo!
#define ISO2022_USE_JISX0208EXT
#include "codeccommon.h"
#include "iso2022common.h"
#include "alg_jisx0201.h"
#include "alg_iso8859_1.h"
#include "alg_iso8859_7.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(cp949)
DECMAP(ksx1001)
ENCMAP(gbcommon)
DECMAP(gb2312)
#define HAVE_ENCODER_INIT
/* Initializes the ISO-2022-JP-2 encoder state: clears the state flags and
 * sets G0, G1 and G2 to CHARSET_ASCII.  Returns 0. */
ENCODER_INIT(iso2022_jp_2)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
STATE_SETG2(state, CHARSET_ASCII)
return 0;
}
#define HAVE_ENCODER_RESET
/* Resets the ISO-2022-JP-2 encoder: if G0 is not CHARSET_ASCII, writes the
 * three-byte ESC ( B sequence and sets G0 back to CHARSET_ASCII.
 * Returns 0. */
ENCODER_RESET(iso2022_jp_2)
{
if (STATE_GETG0(state) != CHARSET_ASCII) {
WRITE3(ESC, '(', 'B')
STATE_SETG0(state, CHARSET_ASCII)
NEXT_OUT(3)
}
return 0;
}
/*
 * ISO-2022-JP-2 encoder.
 *
 * Characters below 0x80 are written in the current G0 set when that is
 * ASCII (or JIS X 0201 Roman and the macro maps them); otherwise ESC ( B
 * is written first and G0 becomes ASCII.  Other characters are tried, in
 * order, against JIS X 0201 Roman (only when it is already designated),
 * the jisxcommon table (JIS X 0212 / JIS X 0208), the cp949 table
 * (KS X 1001 codes only), the gbcommon table (GB 2312 codes only), the
 * U+FF3C special case and finally JIS X 0201 Roman with a new ESC ( J
 * designation.
 *
 * Returns 0 when the input is exhausted and 1 when the current character
 * cannot be encoded.  The value is the number of input units in error;
 * every failure here concerns exactly one unit, including a table hit
 * whose code lies outside KS X 1001 / GB 2312.
 */
ENCODER(iso2022_jp_2)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            switch (STATE_GETG0(state)) {
            case CHARSET_ASCII:
                WRITE1(c)
                NEXT(1, 1)
                break;
            case CHARSET_JISX0201_R:
                /* The "default:" label sits inside this else-branch on
                 * purpose, so that a failed JIS X 0201 lookup and every
                 * other G0 set share the switch back to ASCII. */
                JISX0201_R_ENCODE(c, code)
                else { /* FALLTHROUGH (yay!) */
            default:
                    WRITE3(ESC, '(', 'B')
                    NEXT_OUT(3)
                    STATE_SETG0(state, CHARSET_ASCII)
                    code = c;
                }
                WRITE1(code)
                NEXT(1, 1)
                break;
            }
            if (c == '\n')
                STATE_CLEARFLAG(state, F_SHIFTED)
        }
        else UCS4INVALID(c)
        else {
            unsigned char charset;

            charset = STATE_GETG0(state);
            if (charset == CHARSET_JISX0201_R) {
                /* DBCINV is a sentinel to detect whether the macro
                 * assigned a code. */
                code = DBCINV;
                JISX0201_R_ENCODE(c, code)
                if (code != DBCINV) {
                    WRITE1(code)
                    NEXT(1, 1)
                    continue;
                }
            }

            TRYMAP_ENC(jisxcommon, code, c) {
                if (code & 0x8000) { /* MSB set: JIS X 0212 */
                    if (charset != CHARSET_JISX0212) {
                        WRITE4(ESC, '$', '(', 'D')
                        STATE_SETG0(state, CHARSET_JISX0212)
                        NEXT_OUT(4)
                    }
                    WRITE2((code >> 8) & 0x7f, code & 0x7f)
                } else { /* MSB unset: JIS X 0208 */
                    /* Jump target for the U+FF3C special case below. */
jisx0208encode:     if (charset != CHARSET_JISX0208) {
                        WRITE3(ESC, '$', 'B')
                        STATE_SETG0(state, CHARSET_JISX0208)
                        NEXT_OUT(3)
                    }
                    WRITE2(code >> 8, code & 0xff)
                }
                NEXT(1, 2)
            } else TRYMAP_ENC(cp949, code, c) {
                if (code & 0x8000) /* MSB set: CP949 */
                    return 1; /* one input unit is unencodable */
                if (charset != CHARSET_KSX1001) {
                    WRITE4(ESC, '$', '(', 'C')
                    STATE_SETG0(state, CHARSET_KSX1001)
                    NEXT_OUT(4)
                }
                WRITE2(code >> 8, code & 0xff)
                NEXT(1, 2)
            } else TRYMAP_ENC(gbcommon, code, c) {
                if (code & 0x8000) /* MSB set: GBK */
                    return 1; /* one input unit is unencodable */
                if (charset != CHARSET_GB2312) {
                    WRITE4(ESC, '$', '(', 'A')
                    STATE_SETG0(state, CHARSET_GB2312)
                    NEXT_OUT(4)
                }
                WRITE2(code >> 8, code & 0xff)
                NEXT(1, 2)
            } else if (c == 0xff3c) { /* FULL-WIDTH REVERSE SOLIDUS */
                code = 0x2140;
                goto jisx0208encode;
            } else {
                JISX0201_R_ENCODE(c, code)
                else {
                    /* There's no need to try to encode as ISO-8859-1 or
                     * ISO-8859-7 because JIS X 0212 includes them already.
                     */
                    return 1;
                }
                /* if (charset == CHARSET_JISX0201_R) : already checked */
                WRITE4(ESC, '(', 'J', code)
                STATE_SETG0(state, CHARSET_JISX0201_R)
                NEXT(1, 4)
            }
        }
    }

    return 0;
}
#define HAVE_DECODER_INIT
/* Initializes the ISO-2022-JP-2 decoder state: clears the state flags and
 * sets G0, G1 and G2 to CHARSET_ASCII.  Returns 0. */
DECODER_INIT(iso2022_jp_2)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
STATE_SETG2(state, CHARSET_ASCII)
return 0;
}
#define HAVE_DECODER_RESET
/* Resets the ISO-2022-JP-2 decoder by clearing the F_SHIFTED state flag;
 * the designations are not touched here.  Returns 0. */
DECODER_RESET(iso2022_jp_2)
{
STATE_CLEARFLAG(state, F_SHIFTED)
return 0;
}
/*
 * ISO-2022-JP-2 decoder.
 *
 * The loop scaffolding and the current byte "c" come from the
 * ISO2022_LOOP_BEGIN / ISO2022_LOOP_END macros (iso2022common.h, not
 * visible here).  Each byte is decoded by the charset reported by
 * ISO2022_GETCHARSET: JIS X 0208 (either designation), JIS X 0212,
 * KS X 1001 and GB 2312 through their decode tables, ASCII unchanged, and
 * JIS X 0201 Roman through JISX0201_R_DECODE.  Returns 0 when the loop
 * finishes, 2 / 1 for an unmappable double- / single-byte sequence and
 * MBERR_INTERNAL for an unexpected charset.
 * NOTE(review): the ISO-8859-1 / ISO-8859-7 designations listed for this
 * codec are not handled in this body -- presumably the loop macros deal
 * with them; confirm in iso2022common.h.
 */
DECODER(iso2022_jp_2)
{
ISO2022_LOOP_BEGIN
unsigned char charset, c2;
ISO2022_GETCHARSET(charset, c)
if (charset & CHARSET_DOUBLEBYTE) {
RESERVE_INBUF(2)
RESERVE_OUTBUF(1)
c2 = IN2;
if (charset == CHARSET_JISX0208 || charset == CHARSET_JISX0208_O) {
if (c == 0x21 && c2 == 0x40) /* FULL-WIDTH REVERSE SOLIDUS */
**outbuf = 0xff3c;
else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else return 2;
} else if (charset == CHARSET_JISX0212) {
TRYMAP_DEC(jisx0212, **outbuf, c, c2);
else return 2;
} else if (charset == CHARSET_KSX1001) {
TRYMAP_DEC(ksx1001, **outbuf, c, c2);
else return 2;
} else if (charset == CHARSET_GB2312) {
TRYMAP_DEC(gb2312, **outbuf, c, c2);
else return 2;
} else
return MBERR_INTERNAL;
/* Shared advance for every successful double-byte lookup above. */
NEXT(2, 1)
} else if (charset == CHARSET_ASCII) {
RESERVE_OUTBUF(1)
OUT1(c)
NEXT(1, 1)
} else if (charset == CHARSET_JISX0201_R) {
RESERVE_OUTBUF(1)
JISX0201_R_DECODE(c, **outbuf)
else
return 1;
NEXT(1, 1)
} else
return MBERR_INTERNAL;
ISO2022_LOOP_END
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the iso2022_jp_2 codec.  Maps are imported from
 * three map groups: ja_JP (jisx0208, jisx0212, jisxcommon), ko_KR (cp949,
 * ksx1001) and zh_CN (gbcommon, gb2312).
 */
BEGIN_CODEC_REGISTRY(iso2022_jp_2)
MAPOPEN(ja_JP)
IMPORTMAP_DEC(jisx0208)
IMPORTMAP_DEC(jisx0212)
IMPORTMAP_ENC(jisxcommon)
MAPCLOSE()
MAPOPEN(ko_KR)
IMPORTMAP_ENC(cp949)
IMPORTMAP_DEC(ksx1001)
MAPCLOSE()
MAPOPEN(zh_CN)
IMPORTMAP_ENC(gbcommon)
IMPORTMAP_DEC(gb2312)
MAPCLOSE()
END_CODEC_REGISTRY(iso2022_jp_2)
/*
* _iso2022_jp_3.c: the ISO-2022-JP-3 codec (JIS X 0213)
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: _iso2022_jp_3.c,v 1.7 2003/12/31 05:46:55 perky Exp $
*/
#define USING_BINARY_PAIR_SEARCH
#define ISO2022_DESIGNATIONS \
CHARSET_ASCII, CHARSET_JISX0208, CHARSET_JISX0213_1, CHARSET_JISX0213_2
#define ISO2022_NO_SHIFT
#define ISO2022_USE_JISX0208EXT
#include "codeccommon.h"
#include "iso2022common.h"
#include "map_jisx0213_pairs.h"
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)
#define EMPBASE 0x20000
#define HAVE_ENCODER_INIT
/* Initializes the ISO-2022-JP-3 encoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
ENCODER_INIT(iso2022_jp_3)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_ENCODER_RESET
/* Resets the ISO-2022-JP-3 encoder: if G0 is not CHARSET_ASCII, writes the
 * three-byte ESC ( B sequence and sets G0 back to CHARSET_ASCII.
 * Returns 0. */
ENCODER_RESET(iso2022_jp_3)
{
if (STATE_GETG0(state) != CHARSET_ASCII) {
WRITE3(ESC, '(', 'B')
STATE_SETG0(state, CHARSET_ASCII)
NEXT_OUT(3)
}
return 0;
}
/*
 * ISO-2022-JP-3 encoder.
 *
 * Characters below 0x80 are written directly when G0 is ASCII, otherwise
 * together with an ESC ( B designation.  Other characters up to 0xffff
 * are tried against the jisx0213_bmp table (with a pair lookup when the
 * table yields MULTIC), then the jisxcommon table (JIS X 0208 codes
 * only), then the U+FF3C special case.  Code points whose upper bits
 * match EMPBASE are tried against the jisx0213_emp table.  The resulting
 * code is written as two bytes in plane 2 (bit 15 set) or plane 1,
 * preceded by the matching designation when G0 differs.
 *
 * Returns 0 when the input is exhausted, 1 or insize when a character
 * cannot be encoded, and MBERR_TOOFEW when a possible pair needs one more
 * input unit and MBENC_FLUSH is not set.
 */
ENCODER(iso2022_jp_3)
{
while (inleft > 0) {
unsigned char charset;
ucs4_t c = IN1;
DBCHAR code;
size_t insize;
if (c < 0x80) {
switch (STATE_GETG0(state)) {
case CHARSET_ASCII:
WRITE1(c)
NEXT(1, 1)
break;
default:
WRITE4(ESC, '(', 'B', c)
STATE_SETG0(state, CHARSET_ASCII)
NEXT(1, 4)
break;
}
if (c == '\n')
STATE_CLEARFLAG(state, F_SHIFTED)
continue;
}
DECODE_SURROGATE(c)
/* Number of input units this character occupies (macro not visible
 * here); it is raised to 2 below when a pair is consumed. */
insize = GET_INSIZE(c);
if (c <= 0xffff) {
TRYMAP_ENC(jisx0213_bmp, code, c) {
if (code == MULTIC) {
/* The character may combine with the following one. */
if (inleft < 2) {
if (flags & MBENC_FLUSH) {
/* No more input will come: encode it on its own. */
code = find_pairencmap(c, 0, jisx0213_pairencmap,
JISX0213_ENCPAIRS);
if (code == DBCINV)
return 1;
} else
return MBERR_TOOFEW;
} else {
code = find_pairencmap(c, IN2,
jisx0213_pairencmap, JISX0213_ENCPAIRS);
if (code == DBCINV) {
/* Not a known pair: fall back to the stand-alone entry. */
code = find_pairencmap(c, 0, jisx0213_pairencmap,
JISX0213_ENCPAIRS);
if (code == DBCINV)
return 1;
} else
insize = 2;
}
}
} else TRYMAP_ENC(jisxcommon, code, c) {
if (code & 0x8000)
return 1; /* avoid JIS X 0212 codes */
} else if (c == 0xff3c) /* F/W REVERSE SOLIDUS */
code = 0x2140;
else
return 1;
} else if (c >> 16 == EMPBASE >> 16) {
TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
else return insize;
} else
return insize;
charset = STATE_GETG0(state);
if (code & 0x8000) { /* MSB set: Plane 2 */
if (charset != CHARSET_JISX0213_2) {
WRITE4(ESC, '$', '(', 'P')
STATE_SETG0(state, CHARSET_JISX0213_2)
NEXT_OUT(4)
}
/* Both bytes are masked to 7 bits, which drops the plane marker. */
WRITE2((code >> 8) & 0x7f, code & 0x7f)
} else { /* MSB unset: Plane 1 */
if (charset != CHARSET_JISX0213_1) {
WRITE4(ESC, '$', '(', 'O')
STATE_SETG0(state, CHARSET_JISX0213_1)
NEXT_OUT(4)
}
WRITE2(code >> 8, code & 0xff)
}
NEXT(insize, 2)
}
return 0;
}
#define HAVE_DECODER_INIT
/* Initializes the ISO-2022-JP-3 decoder state: clears the state flags and
 * sets both G0 and G1 to CHARSET_ASCII.  Returns 0. */
DECODER_INIT(iso2022_jp_3)
{
STATE_CLEARFLAGS(state)
STATE_SETG0(state, CHARSET_ASCII)
STATE_SETG1(state, CHARSET_ASCII)
return 0;
}
#define HAVE_DECODER_RESET
/* Resets the ISO-2022-JP-3 decoder by clearing the F_SHIFTED state flag;
 * the G0/G1 designations are not touched here.  Returns 0. */
DECODER_RESET(iso2022_jp_3)
{
STATE_CLEARFLAG(state, F_SHIFTED)
return 0;
}
/*
 * ISO-2022-JP-3 decoder.
 *
 * The loop scaffolding and the current byte "c" come from the
 * ISO2022_LOOP_BEGIN / ISO2022_LOOP_END macros (iso2022common.h, not
 * visible here).  Double-byte input is decoded by the charset reported by
 * ISO2022_GETCHARSET: JIS X 0213 plane 1 through the jisx0208,
 * jisx0213_1_bmp, jisx0213_1_emp and jisx0213_pair lookups, and plane 2
 * through jisx0213_2_bmp and jisx0213_2_emp.  ASCII bytes are emitted
 * unchanged.  Returns 0 when the loop finishes, 2 for an unmappable
 * double-byte sequence and MBERR_INTERNAL for an unexpected charset.
 * NOTE(review): CHARSET_JISX0208 is listed among this codec's
 * designations but has no branch here, so it would reach MBERR_INTERNAL
 * unless the macros remap it -- confirm in iso2022common.h.
 */
DECODER(iso2022_jp_3)
{
ISO2022_LOOP_BEGIN
unsigned char charset, c2;
ucs4_t code;
ISO2022_GETCHARSET(charset, c)
if (charset & CHARSET_DOUBLEBYTE) {
RESERVE_INBUF(2)
RESERVE_OUTBUF(1)
c2 = IN2;
if (charset == CHARSET_JISX0213_1) {
if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
/* Non-BMP hit: the table value is OR-ed with EMPBASE and emitted
 * through PUTUCS4, which is why only the input is advanced here. */
PUTUCS4(EMPBASE | code)
NEXT_IN(2)
continue;
} else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
/* Pair hit: the table value packs two output units, high half first. */
WRITE2(code >> 16, code & 0xffff)
NEXT(2, 2)
continue;
} else return 2;
} else if (charset == CHARSET_JISX0213_2) {
TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c, c2);
else TRYMAP_DEC(jisx0213_2_emp, code, c, c2) {
PUTUCS4(EMPBASE | code)
NEXT_IN(2)
continue;
} else return 2;
} else
return MBERR_INTERNAL;
/* Shared advance for the lookups that wrote straight to **outbuf. */
NEXT(2, 1)
} else if (charset == CHARSET_ASCII) {
RESERVE_OUTBUF(1)
OUT1(c)
NEXT(1, 1)
} else
return MBERR_INTERNAL;
ISO2022_LOOP_END
return 0;
}
#include "codecentry.h"
/*
 * Registry section for the iso2022_jp_3 codec: lists the encode and decode
 * maps imported under the ja_JP map group.
 * NOTE(review): jisx0212 is imported here although the decoder in this
 * file never looks it up -- confirm whether the import is still needed.
 */
BEGIN_CODEC_REGISTRY(iso2022_jp_3)
MAPOPEN(ja_JP)
IMPORTMAP_DEC(jisx0208)
IMPORTMAP_DEC(jisx0212)
IMPORTMAP_ENC(jisxcommon)
IMPORTMAP_ENC(jisx0213_bmp)
IMPORTMAP_DEC(jisx0213_1_bmp)
IMPORTMAP_DEC(jisx0213_2_bmp)
IMPORTMAP_ENC(jisx0213_emp)
IMPORTMAP_DEC(jisx0213_1_emp)
IMPORTMAP_DEC(jisx0213_2_emp)
MAPCLOSE()
END_CODEC_REGISTRY(iso2022_jp_3)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/*
 * alg_iso8859_1.h: Encoder/Decoder macros for ISO8859-1
*
* Written by Hye-Shik Chang <perky@FreeBSD.org>
* $CJKCodecs: alg_iso8859_1.h,v 1.3 2003/12/31 05:46:55 perky Exp $
*/
/* Assigns c to assi when c is at most 0xff.  Expands to a complete "if"
 * statement with no "else", so the caller may follow it with one. */
#define ISO8859_1_ENCODE(c, assi) \
if ((c) <= 0xff) (assi) = (c);
/* Always assigns c to assi: the range check is commented out, leaving a
 * constant-true condition that keeps the same "if" shape as the encoder. */
#define ISO8859_1_DECODE(c, assi) \
if (1/*(c) <= 0xff*/) (assi) = (c);
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment