Commit 6e390806 authored by Walter Dörwald

Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.

parent 437e6a3b
......@@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
object. Error handling is "strict". Return *NULL* if an exception was raised
by the codec.
These are the UTF-32 codec APIs:
.. % --- UTF-32 Codecs ------------------------------------------------------ */
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
Decode *size* bytes from a UTF-32 encoded buffer string and return the
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
handling. It defaults to "strict".
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
order::
*byteorder == -1: little endian
*byteorder == 0: native order
*byteorder == 1: big endian
and then switches if the first four bytes of the input data are a byte order mark
(BOM) and the specified byte order is native order. This BOM is not copied into
the resulting Unicode string. After completion, *\*byteorder* is set to the
current byte order at the end of input data.
In a narrow build, codepoints outside the BMP will be decoded as surrogate pairs.
If *byteorder* is *NULL*, the codec starts in native order mode.
Return *NULL* if an exception was raised by the codec.
.. versionadded:: 2.6
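   A minimal decoding sketch (the *buf* and *buflen* names are illustrative,
   not part of the API)::

      int byteorder = 0;  /* native order; a leading BOM may switch it */
      PyObject *u = PyUnicode_DecodeUTF32(buf, buflen, "strict", &byteorder);
      if (u == NULL)
          return NULL;    /* the codec has set an exception */
      /* byteorder now holds -1, 0 or 1: the order in effect at the end
         of the input data */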
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
by four) as an error. Those bytes will not be decoded and the number of bytes
that have been decoded will be stored in *consumed*.
.. versionadded:: 2.6
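   A sketch of feeding data incrementally (the *chunk* and *chunklen* names
   are illustrative)::

      int byteorder = 0;
      Py_ssize_t consumed = 0;
      PyObject *u = PyUnicode_DecodeUTF32Stateful(chunk, chunklen, "strict",
                                                  &byteorder, &consumed);
      /* If chunklen is not a multiple of four, the trailing bytes are not
         treated as an error: consumed reports how many bytes were decoded,
         so the remainder can be re-fed together with the next chunk. */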
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
Return a Python bytes object holding the UTF-32 encoded value of the Unicode
data in *s*. If *byteorder* is not ``0``, output is written according to the
following byte order::
byteorder == -1: little endian
byteorder == 0: native byte order (writes a BOM mark)
byteorder == 1: big endian
If *byteorder* is ``0``, the output string will always start with the Unicode BOM
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
as a single codepoint.
Return *NULL* if an exception was raised by the codec.
.. versionadded:: 2.6
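   For example, to produce a big endian encoding without a BOM (a sketch,
   assuming *u* is an existing Unicode object; error checking omitted)::

      PyObject *bytes = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(u),
                                              PyUnicode_GET_SIZE(u),
                                              "strict",
                                              1);  /* 1 = big endian, no BOM */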
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
Return a Python string using the UTF-32 encoding in native byte order. The
string always starts with a BOM mark. Error handling is "strict". Return
*NULL* if an exception was raised by the codec.
.. versionadded:: 2.6
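   As the implementation shows, this is the same as calling
   :cfunc:`PyUnicode_EncodeUTF32` with *byteorder* ``0`` and "strict" error
   handling (a sketch, assuming *u* is a Unicode object)::

      PyObject *bytes = PyUnicode_AsUTF32String(u);  /* native order, BOM first */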
These are the UTF-16 codec APIs:
.. % --- UTF-16 Codecs ------------------------------------------------------ */
......
......@@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
| | s_jisx0213 | |
+-----------------+--------------------------------+--------------------------------+
| utf_32 | U32, utf32 | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_32_be | UTF-32BE | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_32_le | UTF-32LE | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_16 | U16, utf16 | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_16_be | UTF-16BE | all languages (BMP only) |
......
......@@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
......@@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
......@@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
......@@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
......@@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
......@@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
......@@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
const char *errors /* error handling */
);
/* --- UTF-32 Codecs ------------------------------------------------------ */
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
the corresponding Unicode object.
errors (if non-NULL) defines the error handling. It defaults
to "strict".
If byteorder is non-NULL, the decoder starts decoding using the
given byte order:
*byteorder == -1: little endian
*byteorder == 0: native order
*byteorder == 1: big endian
In native mode, the first four bytes of the stream are checked for a
BOM mark. If found, the BOM mark is analysed, the byte order
adjusted and the BOM skipped. In the other modes, no BOM mark
interpretation is done. After completion, *byteorder is set to the
current byte order at the end of input data.
If byteorder is NULL, the codec starts in native order mode.
*/
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
const char *string, /* UTF-32 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
int *byteorder /* pointer to byteorder to use
0=native;-1=LE,1=BE; updated on
exit */
);
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
const char *string, /* UTF-32 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
int *byteorder, /* pointer to byteorder to use
0=native;-1=LE,1=BE; updated on
exit */
Py_ssize_t *consumed /* bytes consumed */
);
/* Returns a Python string using the UTF-32 encoding in native byte
order. The string always starts with a BOM mark. */
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
PyObject *unicode /* Unicode object */
);
/* Returns a Python string object holding the UTF-32 encoded value of
the Unicode data.
If byteorder is not 0, output is written according to the following
byte order:
byteorder == -1: little endian
byteorder == 0: native byte order (writes a BOM mark)
byteorder == 1: big endian
If byteorder is 0, the output string will always start with the
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
prepended.
*/
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors, /* error handling */
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
);
/* --- UTF-16 Codecs ------------------------------------------------------ */
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
......
......@@ -490,6 +490,16 @@ aliases = {
'unicodelittleunmarked' : 'utf_16_le',
'utf_16le' : 'utf_16_le',
# utf_32 codec
'u32' : 'utf_32',
'utf32' : 'utf_32',
# utf_32_be codec
'utf_32be' : 'utf_32_be',
# utf_32_le codec
'utf_32le' : 'utf_32_le',
# utf_7 codec
'u7' : 'utf_7',
'utf7' : 'utf_7',
......
"""
Python 'utf-32' Codec
"""
import codecs, sys
### Codec APIs
encode = codecs.utf_32_encode
def decode(input, errors='strict'):
return codecs.utf_32_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def __init__(self, errors='strict'):
codecs.IncrementalEncoder.__init__(self, errors)
self.encoder = None
def encode(self, input, final=False):
if self.encoder is None:
result = codecs.utf_32_encode(input, self.errors)[0]
if sys.byteorder == 'little':
self.encoder = codecs.utf_32_le_encode
else:
self.encoder = codecs.utf_32_be_encode
return result
return self.encoder(input, self.errors)[0]
def reset(self):
codecs.IncrementalEncoder.reset(self)
self.encoder = None
def getstate(self):
# state info we return to the caller:
# 0: stream is in natural order for this platform
# 2: endianness hasn't been determined yet
# (we're never writing in unnatural order)
return (2 if self.encoder is None else 0)
def setstate(self, state):
if state:
self.encoder = None
else:
if sys.byteorder == 'little':
self.encoder = codecs.utf_32_le_encode
else:
self.encoder = codecs.utf_32_be_encode
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
self.decoder = None
def _buffer_decode(self, input, errors, final):
if self.decoder is None:
(output, consumed, byteorder) = \
codecs.utf_32_ex_decode(input, errors, 0, final)
if byteorder == -1:
self.decoder = codecs.utf_32_le_decode
elif byteorder == 1:
self.decoder = codecs.utf_32_be_decode
elif consumed >= 4:
raise UnicodeError("UTF-32 stream does not start with BOM")
return (output, consumed)
return self.decoder(input, self.errors, final)
def reset(self):
codecs.BufferedIncrementalDecoder.reset(self)
self.decoder = None
def getstate(self):
        # additional state info from the base class must be None here,
# as it isn't passed along to the caller
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
# additional state info we pass to the caller:
# 0: stream is in natural order for this platform
# 1: stream is in unnatural order
# 2: endianness hasn't been determined yet
if self.decoder is None:
return (state, 2)
addstate = int((sys.byteorder == "big") !=
(self.decoder is codecs.utf_32_be_decode))
return (state, addstate)
def setstate(self, state):
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
codecs.BufferedIncrementalDecoder.setstate(self, state)
state = state[1]
if state == 0:
self.decoder = (codecs.utf_32_be_decode
if sys.byteorder == "big"
else codecs.utf_32_le_decode)
elif state == 1:
self.decoder = (codecs.utf_32_le_decode
if sys.byteorder == "big"
else codecs.utf_32_be_decode)
else:
self.decoder = None
class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
self.bom_written = False
codecs.StreamWriter.__init__(self, stream, errors)
def encode(self, input, errors='strict'):
self.bom_written = True
result = codecs.utf_32_encode(input, errors)
if sys.byteorder == 'little':
self.encode = codecs.utf_32_le_encode
else:
self.encode = codecs.utf_32_be_encode
return result
class StreamReader(codecs.StreamReader):
def reset(self):
codecs.StreamReader.reset(self)
try:
del self.decode
except AttributeError:
pass
def decode(self, input, errors='strict'):
(object, consumed, byteorder) = \
codecs.utf_32_ex_decode(input, errors, 0, False)
if byteorder == -1:
self.decode = codecs.utf_32_le_decode
elif byteorder == 1:
self.decode = codecs.utf_32_be_decode
        elif consumed >= 4:
            raise UnicodeError("UTF-32 stream does not start with BOM")
return (object, consumed)
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
"""
Python 'utf-32-be' Codec
"""
import codecs
### Codec APIs
encode = codecs.utf_32_be_encode
def decode(input, errors='strict'):
return codecs.utf_32_be_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.utf_32_be_encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
_buffer_decode = codecs.utf_32_be_decode
class StreamWriter(codecs.StreamWriter):
encode = codecs.utf_32_be_encode
class StreamReader(codecs.StreamReader):
decode = codecs.utf_32_be_decode
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32-be',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
"""
Python 'utf-32-le' Codec
"""
import codecs
### Codec APIs
encode = codecs.utf_32_le_encode
def decode(input, errors='strict'):
return codecs.utf_32_le_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.utf_32_le_encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
_buffer_decode = codecs.utf_32_le_decode
class StreamWriter(codecs.StreamWriter):
encode = codecs.utf_32_le_encode
class StreamReader(codecs.StreamReader):
decode = codecs.utf_32_le_decode
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32-le',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
......@@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase):
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
......@@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase):
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
"utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
......
......@@ -244,6 +244,137 @@ class ReadTest(unittest.TestCase):
self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), u"")
class UTF32Test(ReadTest):
encoding = "utf-32"
spamle = ('\xff\xfe\x00\x00'
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
spambe = ('\x00\x00\xfe\xff'
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
def test_only_one_bom(self):
_,_,reader,writer = codecs.lookup(self.encoding)
# encode some stream
s = StringIO.StringIO()
f = writer(s)
f.write(u"spam")
f.write(u"spam")
d = s.getvalue()
# check whether there is exactly one BOM in it
self.assert_(d == self.spamle or d == self.spambe)
# try to read it back
s = StringIO.StringIO(d)
f = reader(s)
self.assertEquals(f.read(), u"spamspam")
def test_badbom(self):
s = StringIO.StringIO(4*"\xff")
f = codecs.getreader(self.encoding)(s)
self.assertRaises(UnicodeError, f.read)
s = StringIO.StringIO(8*"\xff")
f = codecs.getreader(self.encoding)(s)
self.assertRaises(UnicodeError, f.read)
def test_partial(self):
self.check_partial(
u"\x00\xff\u0100\uffff",
[
u"", # first byte of BOM read
u"", # second byte of BOM read
u"", # third byte of BOM read
u"", # fourth byte of BOM read => byteorder known
u"",
u"",
u"",
u"\x00",
u"\x00",
u"\x00",
u"\x00",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100\uffff",
]
)
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
"\xff", "strict", True)
class UTF32LETest(ReadTest):
encoding = "utf-32-le"
def test_partial(self):
self.check_partial(
u"\x00\xff\u0100\uffff",
[
u"",
u"",
u"",
u"\x00",
u"\x00",
u"\x00",
u"\x00",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100\uffff",
]
)
def test_simple(self):
self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
"\xff", "strict", True)
class UTF32BETest(ReadTest):
encoding = "utf-32-be"
def test_partial(self):
self.check_partial(
u"\x00\xff\u0100\uffff",
[
u"",
u"",
u"",
u"\x00",
u"\x00",
u"\x00",
u"\x00",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100",
u"\x00\xff\u0100\uffff",
]
)
def test_simple(self):
self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
"\xff", "strict", True)
class UTF16Test(ReadTest):
encoding = "utf-16"
......@@ -1278,6 +1409,9 @@ class WithStmtTest(unittest.TestCase):
def test_main():
test_support.run_unittest(
UTF32Test,
UTF32LETest,
UTF32BETest,
UTF16Test,
UTF16LETest,
UTF16BETest,
......
......@@ -243,6 +243,8 @@ Library
- GB18030 codec now can encode additional two-byte characters that
are missing in GBK.
- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
represent the result in a single character.
......
......@@ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self,
return tuple;
}
static PyObject *
utf_32_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 0;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
static PyObject *
utf_32_le_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = -1;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded = NULL;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
&byteorder, final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
static PyObject *
utf_32_be_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 1;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded = NULL;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
&byteorder, final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
/* This non-standard version also provides access to the byteorder
parameter of the builtin UTF-32 codec.
It returns a tuple (unicode, bytesread, byteorder) with byteorder
being the value in effect at the end of data.
*/
static PyObject *
utf_32_ex_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 0;
PyObject *unicode, *tuple;
int final = 0;
Py_ssize_t consumed;
if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
&data, &size, &errors, &byteorder, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
final ? NULL : &consumed);
if (unicode == NULL)
return NULL;
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
Py_DECREF(unicode);
return tuple;
}
static PyObject *
unicode_escape_decode(PyObject *self,
PyObject *args)
......@@ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self,
return v;
}
/* This version provides access to the byteorder parameter of the
builtin UTF-32 codecs as optional third argument. It defaults to 0
which means: use the native byte order and prepend the data with a
BOM mark.
*/
static PyObject *
utf_32_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
int byteorder = 0;
if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
&str, &errors, &byteorder))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
byteorder),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
utf_32_le_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
&str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
-1),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
utf_32_be_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
&str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
+1),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
unicode_escape_encode(PyObject *self,
PyObject *args)
......@@ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = {
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
{"utf_32_encode", utf_32_encode, METH_VARARGS},
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
{"utf_32_decode", utf_32_decode, METH_VARARGS},
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
......
......@@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
NULL);
}
/* --- UTF-32 Codec ------------------------------------------------------- */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder)
{
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder,
Py_ssize_t *consumed)
{
const char *starts = s;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
    const unsigned char *q, *e;
    int bo = 0; /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
#else
    int iorder[] = {3, 2, 1, 0};
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* On narrow builds we split characters outside the BMP into two
       codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
    for (i = pairs = 0; i < size/4; i++)
        if (((Py_UCS4 *)s)[i] >= 0x10000)
            pairs++;
#endif
    /* This might be one too many, because of a BOM */
unicode = _PyUnicode_New((size+3)/4+pairs);
if (!unicode)
return NULL;
if (size == 0)
return (PyObject *)unicode;
/* Unpack UTF-32 encoded data */
p = unicode->str;
q = (unsigned char *)s;
e = q + size;
if (byteorder)
bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (size >= 4) {
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
}
}
if (bo == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (q < e) {
Py_UCS4 ch;
/* remaining bytes at the end? (size should be divisible by 4) */
if (e-q<4) {
if (consumed)
break;
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
endinpos = ((const char *)e)-starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
if (ch >= 0x110000)
{
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts;
endinpos = startinpos+4;
goto utf32Error;
}
#ifndef Py_UNICODE_WIDE
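        /* Narrow build: represent the non-BMP code point as a UTF-16
           surrogate pair (the high surrogate carries the top 10 bits of
           ch - 0x10000, the low surrogate the remaining 10 bits). */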
if (ch >= 0x10000)
{
*p++ = 0xD800 | ((ch-0x10000) >> 10);
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
}
else
#endif
*p++ = ch;
q += 4;
continue;
utf32Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf32", errmsg,
starts, size, &startinpos, &endinpos, &exc, &s,
(PyObject **)&unicode, &outpos, &p))
goto onError;
}
if (byteorder)
*byteorder = bo;
if (consumed)
*consumed = (const char *)q-starts;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}
PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Py_ssize_t size,
const char *errors,
int byteorder)
{
PyObject *v;
unsigned char *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
/* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
#define STORECHAR(CH) \
do { \
p[iorder[3]] = ((CH) >> 24) & 0xff; \
p[iorder[2]] = ((CH) >> 16) & 0xff; \
p[iorder[1]] = ((CH) >> 8) & 0xff; \
p[iorder[0]] = (CH) & 0xff; \
p += 4; \
} while(0)
/* In narrow builds we can output surrogate pairs as one codepoint,
so we need less space. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size-1; i++)
if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
pairs++;
#endif
v = PyString_FromStringAndSize(NULL,
4 * (size - pairs + (byteorder == 0)));
if (v == NULL)
return NULL;
p = (unsigned char *)PyString_AS_STRING(v);
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
return v;
if (byteorder == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (byteorder == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (size-- > 0) {
Py_UCS4 ch = *s++;
#ifndef Py_UNICODE_WIDE
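        /* Narrow build: a high surrogate followed by a low surrogate is
           recombined into a single code point and written as one UTF-32
           unit. */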
if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
Py_UCS4 ch2 = *s;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
s++;
size--;
}
}
#endif
STORECHAR(ch);
}
return v;
#undef STORECHAR
}
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
}
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
NULL,
0);
}
/* --- UTF-16 Codec ------------------------------------------------------- */
PyObject *
......