Commit 87e66730 authored by Serhiy Storchaka

Issue #19619: Blacklist non-text codecs in method API

str.encode, bytes.decode and bytearray.decode now use an
internal API to throw LookupError for known non-text encodings,
rather than attempting the encoding or decoding operation and
then throwing a TypeError for an unexpected output type.

The latter mechanism remains in place for third party non-text
encodings.

Backported changeset d68df99d7a57.
parent 671156a1
...@@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( ...@@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
const char *errors const char *errors
); );
/* The macro that selects the limited API is spelled Py_LIMITED_API.
   Testing any other spelling is always true and would expose these
   private declarations to limited-API builds as well. */
#ifndef Py_LIMITED_API
/* Text codec specific encoding and decoding API.

   Checks the encoding against a list of codecs which do not
   implement a str<->bytes encoding before attempting the
   operation.

   Please note that these APIs are internal and should not
   be used in Python C extensions.

 */

/* Encode `object` using the codec named `encoding`; `errors` is the
   error handling name forwarded to the codec. */
PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
       PyObject *object,
       const char *encoding,
       const char *errors
       );

/* Decode `object` using the codec named `encoding`; `errors` is the
   error handling name forwarded to the codec. */
PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
       PyObject *object,
       const char *encoding,
       const char *errors
       );
#endif
/* --- Codec Lookup APIs -------------------------------------------------- /* --- Codec Lookup APIs --------------------------------------------------
All APIs return a codec object with incremented refcount and are All APIs return a codec object with incremented refcount and are
......
...@@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE ...@@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API) ### Codec base classes (defining the API)
class CodecInfo(tuple): class CodecInfo(tuple):
"""Codec details when looking up the codec registry"""
# Private API to allow Python 3.4 to blacklist the known non-Unicode
# codecs in the standard library. A more general mechanism to
# reliably distinguish text encodings from other codecs will hopefully
# be defined for Python 3.5
#
# See http://bugs.python.org/issue19619
_is_text_encoding = True # Assume codecs are text encodings by default
def __new__(cls, encode, decode, streamreader=None, streamwriter=None, def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None): incrementalencoder=None, incrementaldecoder=None, name=None,
*, _is_text_encoding=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name self.name = name
self.encode = encode self.encode = encode
...@@ -84,6 +94,8 @@ class CodecInfo(tuple): ...@@ -84,6 +94,8 @@ class CodecInfo(tuple):
self.incrementaldecoder = incrementaldecoder self.incrementaldecoder = incrementaldecoder
self.streamwriter = streamwriter self.streamwriter = streamwriter
self.streamreader = streamreader self.streamreader = streamreader
if _is_text_encoding is not None:
self._is_text_encoding = _is_text_encoding
return self return self
def __repr__(self): def __repr__(self):
......
...@@ -52,4 +52,5 @@ def getregentry(): ...@@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter, streamwriter=StreamWriter,
streamreader=StreamReader, streamreader=StreamReader,
_is_text_encoding=False,
) )
...@@ -74,4 +74,5 @@ def getregentry(): ...@@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter, streamwriter=StreamWriter,
streamreader=StreamReader, streamreader=StreamReader,
_is_text_encoding=False,
) )
...@@ -52,4 +52,5 @@ def getregentry(): ...@@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter, streamwriter=StreamWriter,
streamreader=StreamReader, streamreader=StreamReader,
_is_text_encoding=False,
) )
...@@ -53,4 +53,5 @@ def getregentry(): ...@@ -53,4 +53,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter, streamwriter=StreamWriter,
streamreader=StreamReader, streamreader=StreamReader,
_is_text_encoding=False,
) )
...@@ -43,6 +43,7 @@ def getregentry(): ...@@ -43,6 +43,7 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter, streamwriter=StreamWriter,
streamreader=StreamReader, streamreader=StreamReader,
_is_text_encoding=False,
) )
### Map ### Map
......
...@@ -96,4 +96,5 @@ def getregentry(): ...@@ -96,4 +96,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader, streamreader=StreamReader,
streamwriter=StreamWriter, streamwriter=StreamWriter,
_is_text_encoding=False,
) )
...@@ -74,4 +74,5 @@ def getregentry(): ...@@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder, incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader, streamreader=StreamReader,
streamwriter=StreamWriter, streamwriter=StreamWriter,
_is_text_encoding=False,
) )
...@@ -4,6 +4,7 @@ import locale ...@@ -4,6 +4,7 @@ import locale
import sys import sys
import unittest import unittest
import warnings import warnings
import encodings
from test import support from test import support
...@@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase): ...@@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase):
sout = reader.readline() sout = reader.readline()
self.assertEqual(sout, b"\x80") self.assertEqual(sout, b"\x80")
def test_text_to_binary_blacklists_binary_transforms(self):
    # str.encode must reject every binary -> binary codec with a
    # LookupError that points the caller at codecs.encode().
    # Only the codec name varies, so the message template is built once.
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    text = "bad input type"
    for codec_name in bytes_transform_encodings:
        expected = template.format(codec_name)
        with self.assertRaisesRegex(LookupError, expected) as ctx:
            text.encode(codec_name)
        # The LookupError must not be chained from another exception.
        self.assertIsNone(ctx.exception.__cause__)
def test_text_to_binary_blacklists_text_transforms(self):
# Check str.encode gives a good error message for str -> str codecs
msg = (r"^'rot_13' is not a text encoding; "
r"use codecs.encode\(\) to handle arbitrary codecs")
with self.assertRaisesRegex(LookupError, msg):
"just an example message".encode("rot_13")
def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode and bytearray.decode must both reject every
    # binary -> binary codec with a LookupError that points the caller
    # at codecs.decode().
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    payload = b"encode first to ensure we meet any format restrictions"
    for codec_name in bytes_transform_encodings:
        # Run the payload through the codec first so the data handed to
        # decode() is in whatever format that codec expects.
        encoded = codecs.encode(payload, codec_name)
        expected = template.format(codec_name)
        for candidate in (encoded, bytearray(encoded)):
            with self.assertRaisesRegex(LookupError, expected):
                candidate.decode(codec_name)
def test_binary_to_text_blacklists_text_transforms(self):
# Check str -> str codec gives a good error for binary input
for bad_input in (b"immutable", bytearray(b"mutable")):
msg = (r"^'rot_13' is not a text encoding; "
r"use codecs.decode\(\) to handle arbitrary codecs")
with self.assertRaisesRegex(LookupError, msg) as failure:
bad_input.decode("rot_13")
self.assertIsNone(failure.exception.__cause__)
@unittest.skipUnless(sys.platform == 'win32', @unittest.skipUnless(sys.platform == 'win32',
'code pages are specific to Windows') 'code pages are specific to Windows')
......
...@@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1? ...@@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
internal API to throw LookupError for known non-text encodings, rather
than attempting the encoding or decoding operation and then throwing a
TypeError for an unexpected output type. (The latter mechanism remains
in place for third party non-text encodings)
- Issue #20588: Make Python-ast.c C89 compliant. - Issue #20588: Make Python-ast.c C89 compliant.
- Issue #20437: Fixed 21 potential bugs when deleting objects references. - Issue #20437: Fixed 21 potential bugs when deleting objects references.
......
...@@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s, ...@@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s,
buffer = PyMemoryView_FromBuffer(&info); buffer = PyMemoryView_FromBuffer(&info);
if (buffer == NULL) if (buffer == NULL)
goto onError; goto onError;
unicode = PyCodec_Decode(buffer, encoding, errors); unicode = _PyCodec_DecodeText(buffer, encoding, errors);
if (unicode == NULL) if (unicode == NULL)
goto onError; goto onError;
if (!PyUnicode_Check(unicode)) { if (!PyUnicode_Check(unicode)) {
...@@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, ...@@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
} }
/* Encode via the codec registry */ /* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors); v = _PyCodec_EncodeText(unicode, encoding, errors);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
......
...@@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding, ...@@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
errors is passed to the encoder factory as argument if non-NULL. */ errors is passed to the encoder factory as argument if non-NULL. */
PyObject *PyCodec_Encode(PyObject *object, static PyObject *
const char *encoding, _PyCodec_EncodeInternal(PyObject *object,
const char *errors) PyObject *encoder,
const char *encoding,
const char *errors)
{ {
PyObject *encoder = NULL;
PyObject *args = NULL, *result = NULL; PyObject *args = NULL, *result = NULL;
PyObject *v = NULL; PyObject *v = NULL;
encoder = PyCodec_Encoder(encoding);
if (encoder == NULL)
goto onError;
args = args_tuple(object, errors); args = args_tuple(object, errors);
if (args == NULL) if (args == NULL)
goto onError; goto onError;
...@@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object, ...@@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object,
errors is passed to the decoder factory as argument if non-NULL. */ errors is passed to the decoder factory as argument if non-NULL. */
PyObject *PyCodec_Decode(PyObject *object, static PyObject *
const char *encoding, _PyCodec_DecodeInternal(PyObject *object,
const char *errors) PyObject *decoder,
const char *encoding,
const char *errors)
{ {
PyObject *decoder = NULL;
PyObject *args = NULL, *result = NULL; PyObject *args = NULL, *result = NULL;
PyObject *v; PyObject *v;
decoder = PyCodec_Decoder(encoding);
if (decoder == NULL)
goto onError;
args = args_tuple(object, errors); args = args_tuple(object, errors);
if (args == NULL) if (args == NULL)
goto onError; goto onError;
...@@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object, ...@@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object,
return NULL; return NULL;
} }
/* Generic encoding/decoding API */
/* Encode `object` with the codec registered under `encoding`.

   Returns NULL when PyCodec_Encoder() cannot supply an encoder;
   otherwise returns the result of _PyCodec_EncodeInternal().
   NOTE(review): ownership of the `encoder` reference is presumably taken
   over by _PyCodec_EncodeInternal() -- confirm against its body. */
PyObject *
PyCodec_Encode(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
/* Decode `object` with the codec registered under `encoding`.

   Returns NULL when PyCodec_Decoder() cannot supply a decoder;
   otherwise returns the result of _PyCodec_DecodeInternal().
   NOTE(review): ownership of the `decoder` reference is presumably taken
   over by _PyCodec_DecodeInternal() -- confirm against its body. */
PyObject *
PyCodec_Decode(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
/* Text encoding/decoding API */
/* Look up `encoding` in the codec registry and return a new reference to
   item `index` of the codec tuple.

   If the codec object is not an exact tuple and has a false
   `_is_text_encoding` attribute, a LookupError is raised instead;
   `operation_name` ("encode" or "decode") is used in the message to name
   the codecs module function the caller should use.

   Returns NULL with an exception set on failure. */
static PyObject *
codec_getitem_checked(const char *encoding,
                      const char *operation_name,
                      int index)
{
    _Py_IDENTIFIER(_is_text_encoding);
    PyObject *codec;
    PyObject *attr;
    PyObject *v;
    int is_text_codec;

    codec = _PyCodec_Lookup(encoding);
    if (codec == NULL)
        return NULL;

    /* Backwards compatibility: assume any raw tuple describes a text
     * encoding, and the same for anything lacking the private
     * attribute.
     */
    if (!PyTuple_CheckExact(codec)) {
        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
        if (attr == NULL) {
            if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
                /* A real failure while reading the attribute. */
                Py_DECREF(codec);
                return NULL;
            }
            /* Attribute missing: treat the codec as a text encoding. */
            PyErr_Clear();
        }
        else {
            is_text_codec = PyObject_IsTrue(attr);
            Py_DECREF(attr);
            if (is_text_codec <= 0) {
                Py_DECREF(codec);
                /* PyObject_IsTrue() returns -1 on error with an exception
                 * already set; propagate that one unchanged.  Only a
                 * clean "false" answer means the codec is blacklisted.
                 */
                if (is_text_codec == 0) {
                    PyErr_Format(PyExc_LookupError,
                                 "'%.400s' is not a text encoding; "
                                 "use codecs.%s() to handle arbitrary codecs",
                                 encoding, operation_name);
                }
                return NULL;
            }
        }
    }

    /* `v` is borrowed from `codec`, so take our own reference before
     * releasing the tuple that keeps it alive.
     */
    v = PyTuple_GET_ITEM(codec, index);
    Py_INCREF(v);
    Py_DECREF(codec);
    return v;
}
/* Return the encode function of the text codec named `encoding`, or NULL
   with an exception set (see codec_getitem_checked()). */
static PyObject *
_PyCodec_TextEncoder(const char *encoding)
{
    /* Item 0 of the codec tuple is the encode function. */
    const int encoder_slot = 0;

    return codec_getitem_checked(encoding, "encode", encoder_slot);
}
/* Return the decode function of the text codec named `encoding`, or NULL
   with an exception set (see codec_getitem_checked()). */
static PyObject *
_PyCodec_TextDecoder(const char *encoding)
{
    /* Item 1 of the codec tuple is the decode function. */
    const int decoder_slot = 1;

    return codec_getitem_checked(encoding, "decode", decoder_slot);
}
/* Text-only variant of PyCodec_Encode(): the encoder is obtained through
   _PyCodec_TextEncoder(), so a codec flagged as a non-text encoding fails
   with LookupError before any encoding is attempted.

   Returns NULL with an exception set when no encoder is available. */
PyObject *
_PyCodec_EncodeText(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
/* Text-only variant of PyCodec_Decode(): the decoder is obtained through
   _PyCodec_TextDecoder(), so a codec flagged as a non-text encoding fails
   with LookupError before any decoding is attempted.

   Returns NULL with an exception set when no decoder is available. */
PyObject *
_PyCodec_DecodeText(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
/* Register the error handling callback function error under the name /* Register the error handling callback function error under the name
name. This function will be called by the codec when it encounters name. This function will be called by the codec when it encounters
an unencodable characters/undecodable bytes and doesn't know the an unencodable characters/undecodable bytes and doesn't know the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment