Commit b8a36ee9 authored by Walter Dörwald's avatar Walter Dörwald

Part of SF patch #1313939: Speedup charmap decoding by extending

PyUnicode_DecodeCharmap() to accept a unicode string as the mapping
argument which is used as a mapping table.

This code isn't used by any of the codecs yet.
parent 7d76641e
...@@ -1322,7 +1322,12 @@ points. ...@@ -1322,7 +1322,12 @@ points.
const char *errors} const char *errors}
Create a Unicode object by decoding \var{size} bytes of the encoded Create a Unicode object by decoding \var{size} bytes of the encoded
string \var{s} using the given \var{mapping} object. Return string \var{s} using the given \var{mapping} object. Return
\NULL{} if an exception was raised by the codec. \NULL{} if an exception was raised by the codec. If \var{mapping} is \NULL{}
latin-1 decoding will be done. Otherwise it can be either a dictionary keyed by
byte value or a unicode string, which is treated as a lookup table. Byte values
greater than or equal to the length of the string, and table entries equal to
U+FFFE, are treated as "undefined mapping".
\versionchanged[Allowed unicode string as mapping argument]{2.4}
\end{cfuncdesc} \end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s, \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
......
...@@ -924,6 +924,40 @@ class BasicStrTest(unittest.TestCase): ...@@ -924,6 +924,40 @@ class BasicStrTest(unittest.TestCase):
(chars, size) = codecs.getdecoder(encoding)(bytes) (chars, size) = codecs.getdecoder(encoding)(bytes)
self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding)) self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
class CharmapTest(unittest.TestCase):
def test_decode_with_string_map(self):
self.assertEquals(
codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
(u"abc", 3)
)
self.assertEquals(
codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
(u"ab\ufffd", 3)
)
self.assertEquals(
codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
(u"ab\ufffd", 3)
)
self.assertEquals(
codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
(u"ab", 3)
)
self.assertEquals(
codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
(u"ab", 3)
)
allbytes = "".join(chr(i) for i in xrange(256))
self.assertEquals(
codecs.charmap_decode(allbytes, "ignore", u""),
(u"", len(allbytes))
)
def test_main(): def test_main():
test_support.run_unittest( test_support.run_unittest(
UTF16Test, UTF16Test,
...@@ -940,7 +974,8 @@ def test_main(): ...@@ -940,7 +974,8 @@ def test_main():
StreamReaderTest, StreamReaderTest,
Str2StrTest, Str2StrTest,
BasicUnicodeTest, BasicUnicodeTest,
BasicStrTest BasicStrTest,
CharmapTest
) )
......
...@@ -563,6 +563,11 @@ C API ...@@ -563,6 +563,11 @@ C API
- Removed PyRange_New(). - Removed PyRange_New().
- Patch #1313939: PyUnicode_DecodeCharmap() accepts a unicode string as the
mapping argument now. This string is used as a mapping table. Byte values
greater than or equal to the length of the string, and table entries equal
to 0xFFFE, are treated as undefined mappings.
Tests Tests
----- -----
......
...@@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, ...@@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
int extrachars = 0; int extrachars = 0;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
Py_UNICODE *mapstring = NULL;
int maplen = 0;
/* Default to Latin-1 */ /* Default to Latin-1 */
if (mapping == NULL) if (mapping == NULL)
...@@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, ...@@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
return (PyObject *)v; return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v); p = PyUnicode_AS_UNICODE(v);
e = s + size; e = s + size;
while (s < e) { if (PyUnicode_CheckExact(mapping)) {
unsigned char ch = *s; mapstring = PyUnicode_AS_UNICODE(mapping);
PyObject *w, *x; maplen = PyUnicode_GET_SIZE(mapping);
while (s < e) {
/* Get mapping (char ordinal -> integer, Unicode char or None) */ unsigned char ch = *s;
w = PyInt_FromLong((long)ch); Py_UNICODE x = 0xfffe; /* illegal value */
if (w == NULL)
goto onError; if (ch < maplen)
x = PyObject_GetItem(mapping, w); x = mapstring[ch];
Py_DECREF(w);
if (x == NULL) { if (x == 0xfffe) {
if (PyErr_ExceptionMatches(PyExc_LookupError)) { /* undefined mapping */
/* No mapping found means: mapping is undefined. */ outpos = p-PyUnicode_AS_UNICODE(v);
PyErr_Clear(); startinpos = s-starts;
x = Py_None; endinpos = startinpos+1;
Py_INCREF(x); if (unicode_decode_call_errorhandler(
} else errors, &errorHandler,
goto onError; "charmap", "character maps to <undefined>",
starts, size, &startinpos, &endinpos, &exc, &s,
(PyObject **)&v, &outpos, &p)) {
goto onError;
}
continue;
}
*p++ = x;
++s;
} }
}
else {
while (s < e) {
unsigned char ch = *s;
PyObject *w, *x;
/* Apply mapping */ /* Get mapping (char ordinal -> integer, Unicode char or None) */
if (PyInt_Check(x)) { w = PyInt_FromLong((long)ch);
long value = PyInt_AS_LONG(x); if (w == NULL)
if (value < 0 || value > 65535) {
PyErr_SetString(PyExc_TypeError,
"character mapping must be in range(65536)");
Py_DECREF(x);
goto onError; goto onError;
x = PyObject_GetItem(mapping, w);
Py_DECREF(w);
if (x == NULL) {
if (PyErr_ExceptionMatches(PyExc_LookupError)) {
/* No mapping found means: mapping is undefined. */
PyErr_Clear();
x = Py_None;
Py_INCREF(x);
} else
goto onError;
} }
*p++ = (Py_UNICODE)value;
} /* Apply mapping */
else if (x == Py_None) { if (PyInt_Check(x)) {
/* undefined mapping */ long value = PyInt_AS_LONG(x);
outpos = p-PyUnicode_AS_UNICODE(v); if (value < 0 || value > 65535) {
startinpos = s-starts; PyErr_SetString(PyExc_TypeError,
endinpos = startinpos+1; "character mapping must be in range(65536)");
if (unicode_decode_call_errorhandler( Py_DECREF(x);
errors, &errorHandler, goto onError;
"charmap", "character maps to <undefined>", }
starts, size, &startinpos, &endinpos, &exc, &s, *p++ = (Py_UNICODE)value;
(PyObject **)&v, &outpos, &p)) {
Py_DECREF(x);
goto onError;
} }
continue; else if (x == Py_None) {
} /* undefined mapping */
else if (PyUnicode_Check(x)) { outpos = p-PyUnicode_AS_UNICODE(v);
int targetsize = PyUnicode_GET_SIZE(x); startinpos = s-starts;
endinpos = startinpos+1;
if (targetsize == 1) if (unicode_decode_call_errorhandler(
/* 1-1 mapping */ errors, &errorHandler,
*p++ = *PyUnicode_AS_UNICODE(x); "charmap", "character maps to <undefined>",
starts, size, &startinpos, &endinpos, &exc, &s,
else if (targetsize > 1) { (PyObject **)&v, &outpos, &p)) {
/* 1-n mapping */ Py_DECREF(x);
if (targetsize > extrachars) { goto onError;
/* resize first */ }
int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); continue;
int needed = (targetsize - extrachars) + \ }
(targetsize << 2); else if (PyUnicode_Check(x)) {
extrachars += needed; int targetsize = PyUnicode_GET_SIZE(x);
if (_PyUnicode_Resize(&v,
PyUnicode_GET_SIZE(v) + needed) < 0) { if (targetsize == 1)
Py_DECREF(x); /* 1-1 mapping */
goto onError; *p++ = *PyUnicode_AS_UNICODE(x);
else if (targetsize > 1) {
/* 1-n mapping */
if (targetsize > extrachars) {
/* resize first */
int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
int needed = (targetsize - extrachars) + \
(targetsize << 2);
extrachars += needed;
if (_PyUnicode_Resize(&v,
PyUnicode_GET_SIZE(v) + needed) < 0) {
Py_DECREF(x);
goto onError;
}
p = PyUnicode_AS_UNICODE(v) + oldpos;
} }
p = PyUnicode_AS_UNICODE(v) + oldpos; Py_UNICODE_COPY(p,
PyUnicode_AS_UNICODE(x),
targetsize);
p += targetsize;
extrachars -= targetsize;
} }
Py_UNICODE_COPY(p, /* 1-0 mapping: skip the character */
PyUnicode_AS_UNICODE(x), }
targetsize); else {
p += targetsize; /* wrong return value */
extrachars -= targetsize; PyErr_SetString(PyExc_TypeError,
"character mapping must return integer, None or unicode");
Py_DECREF(x);
goto onError;
} }
/* 1-0 mapping: skip the character */
}
else {
/* wrong return value */
PyErr_SetString(PyExc_TypeError,
"character mapping must return integer, None or unicode");
Py_DECREF(x); Py_DECREF(x);
goto onError; ++s;
} }
Py_DECREF(x);
++s;
} }
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment