Commit 2e0b18af authored by Walter Dörwald

Change the treatment of positions returned by PEP293

error handlers in the Unicode codecs: Negative
positions are treated as being relative to the end of
the input and out of bounds positions result in an
IndexError.

Also update the PEP and include an explanation of
this in the documentation for codecs.register_error.

Fixes a small bug in iconv_codecs: if the position
from the callback is negative *add* it to the size
instead of subtracting it.

From SF patch #677429.
parent f7f4517f
......@@ -103,11 +103,22 @@ Raises a \exception{LookupError} in case the encoding cannot be found.
Register the error handling function \var{error_handler} under the
name \var{name}. \var{error_handler} will be called during encoding
and decoding in case of an error, when \var{name} is specified as the
errors parameter. \var{error_handler} will be called with an
\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or
\exception{UnicodeTranslateError} instance and must return a tuple
with a replacement for the unencodable/undecodable part of the input
and a position where encoding/decoding should continue.
errors parameter.
For encoding \var{error_handler} will be called with a
\exception{UnicodeEncodeError} instance, which contains information about
the location of the error. The error handler must either raise this or
a different exception or return a tuple with a replacement for the
unencodable part of the input and a position where encoding should
continue. The encoder will encode the replacement and continue encoding
the original input at the specified position. Negative position values
will be treated as being relative to the end of the input string. If the
resulting position is out of bounds, an \exception{IndexError} will be raised.
Decoding and translating work similarly, except that \exception{UnicodeDecodeError}
or \exception{UnicodeTranslateError} will be passed to the handler and
the replacement from the error handler will be put into the output
directly.
\end{funcdesc}
\begin{funcdesc}{lookup_error}{name}
......
......@@ -572,7 +572,7 @@ class C:
\var{classinfo} argument, or of a (direct or indirect) subclass
thereof. Also return true if \var{classinfo} is a type object and
\var{object} is an object of that type. If \var{object} is not a
class instance or a object of the given type, the function always
class instance or an object of the given type, the function always
returns false. If \var{classinfo} is neither a class object nor a
type object, it may be a tuple of class or type objects, or may
recursively contain other such tuples (other sequence types are not
......
import test.test_support, unittest
import sys, codecs, htmlentitydefs, unicodedata
class PosReturn:
    """Configurable codec error-handler callback for position tests.

    Set ``pos`` to the restart position the handler should report, then
    register ``handle`` with ``codecs.register_error``.
    """

    def __init__(self):
        # Position that the next call to handle() will return.
        self.pos = 0

    def handle(self, exc):
        """Return the replacement ``u"<?>"`` and the configured position.

        ``exc`` must provide ``object`` (the input being processed) and
        ``start`` (index where the error begins).  The position is
        returned exactly as configured, including negative values; it is
        only normalized locally to decide whether the codec would advance.
        """
        oldpos = self.pos
        realpos = oldpos
        if realpos < 0:
            # Negative positions count from the end of the input.
            realpos = len(exc.object) + realpos
        # if we don't advance this time, terminate on the next call
        # otherwise we'd get an endless loop
        if realpos <= exc.start:
            self.pos = len(exc.object)
        return (u"<?>", oldpos)
class CodecCallbackTest(unittest.TestCase):
def test_xmlcharrefreplace(self):
......@@ -543,18 +560,36 @@ class CodecCallbackTest(unittest.TestCase):
codecs.register_error("test.baddecodereturn2", baddecodereturn2)
self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
pos = [-42]
def negposreturn(exc):
pos[0] += 1 # use list to work around scoping problem
return (u"?", pos[0])
codecs.register_error("test.negposreturn", negposreturn)
"\xff".decode("ascii", "test.negposreturn")
handler = PosReturn()
codecs.register_error("test.posreturn", handler.handle)
# Valid negative position
handler.pos = -1
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
# Valid negative position
handler.pos = -2
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")
# Negative position out of bounds
handler.pos = -3
self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
# Valid positive position
handler.pos = 1
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
# Largest valid positive position (one beyond end of input
handler.pos = 2
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>")
# Invalid positive position
handler.pos = 3
self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
def hugeposreturn(exc):
return (u"?", 424242)
codecs.register_error("test.hugeposreturn", hugeposreturn)
"\xff".decode("ascii", "test.hugeposreturn")
"\\uyyyy".decode("raw-unicode-escape", "test.hugeposreturn")
# Restart at the "0"
handler.pos = 6
self.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")
class D(dict):
def __getitem__(self, key):
......@@ -579,22 +614,39 @@ class CodecCallbackTest(unittest.TestCase):
codecs.register_error("test.badencodereturn2", badencodereturn2)
self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
pos = [-42]
def negposreturn(exc):
pos[0] += 1 # use list to work around scoping problem
return (u"?", pos[0])
codecs.register_error("test.negposreturn", negposreturn)
u"\xff".encode("ascii", "test.negposreturn")
handler = PosReturn()
codecs.register_error("test.posreturn", handler.handle)
# Valid negative position
handler.pos = -1
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
# Valid negative position
handler.pos = -2
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
# Negative position out of bounds
handler.pos = -3
self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
# Valid positive position
handler.pos = 1
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
# Largest valid positive position (one beyond end of input
handler.pos = 2
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
# Invalid positive position
handler.pos = 3
self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
def hugeposreturn(exc):
return (u"?", 424242)
codecs.register_error("test.hugeposreturn", hugeposreturn)
u"\xff".encode("ascii", "test.hugeposreturn")
handler.pos = 0
class D(dict):
def __getitem__(self, key):
raise ValueError
for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.hugeposreturn"):
for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
......
......@@ -247,8 +247,13 @@ errorexit_cbpad: Py_XDECREF(retobj);
Py_DECREF(retobj);
if (newpos < 0)
newpos = inputlen - newpos;
if (newpos < 0 || newpos >= inputlen)
newpos = inputlen + newpos;
if (newpos < 0 || newpos > inputlen) {
PyErr_Format(PyExc_IndexError, "position %ld from error handler"
" out of bounds", newpos);
goto errorexit;
}
if (newpos == inputlen)
break;
inp = inp_top + Py_UNICODE_SIZE * newpos;
inplen = inplen_total - Py_UNICODE_SIZE * newpos;
......@@ -471,8 +476,13 @@ errorexit_cbpad: Py_DECREF(retobj);
Py_DECREF(retobj);
if (newpos < 0)
newpos = inplen_total - newpos;
if (newpos < 0 || newpos >= inplen_total)
newpos = inplen_total + newpos;
if (newpos < 0 || newpos > inplen_total) {
PyErr_Format(PyExc_IndexError, "position %ld from error handler"
" out of bounds", newpos);
goto errorexit;
}
if (newpos == inplen_total)
break;
inp = inp_top + newpos;
inplen = inplen_total - newpos;
......
......@@ -728,9 +728,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
goto onError;
if (newpos<0)
newpos = 0;
else if (newpos>insize)
newpos = insize;
newpos = insize+newpos;
if (newpos<0 || newpos>insize) {
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
goto onError;
}
/* need more space? (at least enough for what we
have+the replacement+the rest of the string (starting
......@@ -2246,9 +2248,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
return NULL;
}
if (*newpos<0)
*newpos = 0;
else if (*newpos>size)
*newpos = size;
*newpos = size+*newpos;
if (*newpos<0 || *newpos>size) {
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
Py_DECREF(restuple);
return NULL;
}
Py_INCREF(resunicode);
Py_DECREF(restuple);
return resunicode;
......@@ -3084,9 +3089,12 @@ static PyObject *unicode_translate_call_errorhandler(const char *errors,
return NULL;
}
if (*newpos<0)
*newpos = 0;
else if (*newpos>size)
*newpos = size;
*newpos = size+*newpos;
if (*newpos<0 || *newpos>size) {
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
Py_DECREF(restuple);
return NULL;
}
Py_INCREF(resunicode);
Py_DECREF(restuple);
return resunicode;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment