Commit 3d4226a8 authored by Victor Stinner, committed by GitHub

bpo-34523: Support surrogatepass in locale codecs (GH-8995)

Add support for the "surrogatepass" error handler in
PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault()
for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the
  surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use
  the _Py_error_handler enum instead of "int surrogateescape" to pass
  the error handler. These functions now return -3 if the error
  handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx()
  in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it
  as a private function.
* _freeze_importlib doesn't need config.filesystem_errors="strict"
  workaround anymore.
parent c5989cd8
...@@ -5,6 +5,24 @@ ...@@ -5,6 +5,24 @@
extern "C" { extern "C" {
#endif #endif
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
/* Identifiers for codec error handlers.

   A value of this enum is passed as the 'errors' argument of
   _Py_DecodeUTF8Ex(), _Py_EncodeUTF8Ex(), _Py_DecodeLocaleEx() and
   _Py_EncodeLocaleEx(), replacing the former "int surrogateescape" flag.

   NOTE(review): each member presumably corresponds to the error handler of
   the same name ("strict", "surrogateescape", ...) -- confirm against the
   implementation of _Py_GetErrorHandler(). */
typedef enum {
/* Zero value; the UTF-8 encoder treats it as "not resolved yet" and calls
   _Py_GetErrorHandler() to obtain the real handler. */
_Py_ERROR_UNKNOWN=0,
_Py_ERROR_STRICT,
_Py_ERROR_SURROGATEESCAPE,
_Py_ERROR_REPLACE,
_Py_ERROR_IGNORE,
_Py_ERROR_BACKSLASHREPLACE,
_Py_ERROR_SURROGATEPASS,
_Py_ERROR_XMLCHARREFREPLACE,
/* NOTE(review): presumably any handler name not listed above -- confirm. */
_Py_ERROR_OTHER
} _Py_error_handler;
/* Convert an error handler name given as a C string into its
   _Py_error_handler value. */
PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
#endif
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
PyAPI_FUNC(wchar_t *) Py_DecodeLocale( PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
const char *arg, const char *arg,
...@@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex( ...@@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
wchar_t **wstr, wchar_t **wstr,
size_t *wlen, size_t *wlen,
const char **reason, const char **reason,
int surrogateescape); _Py_error_handler errors);
PyAPI_FUNC(int) _Py_EncodeUTF8Ex( PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
const wchar_t *text, const wchar_t *text,
...@@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex( ...@@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
size_t *error_pos, size_t *error_pos,
const char **reason, const char **reason,
int raw_malloc, int raw_malloc,
int surrogateescape); _Py_error_handler errors);
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
const char *arg, const char *arg,
Py_ssize_t arglen); Py_ssize_t arglen);
#endif
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
PyAPI_FUNC(int) _Py_DecodeLocaleEx( PyAPI_FUNC(int) _Py_DecodeLocaleEx(
const char *arg, const char *arg,
wchar_t **wstr, wchar_t **wstr,
size_t *wlen, size_t *wlen,
const char **reason, const char **reason,
int current_locale, int current_locale,
int surrogateescape); _Py_error_handler errors);
PyAPI_FUNC(int) _Py_EncodeLocaleEx( PyAPI_FUNC(int) _Py_EncodeLocaleEx(
const wchar_t *text, const wchar_t *text,
...@@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx( ...@@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
size_t *error_pos, size_t *error_pos,
const char **reason, const char **reason,
int current_locale, int current_locale,
int surrogateescape); _Py_error_handler errors);
#endif #endif
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
......
...@@ -9,6 +9,11 @@ from unittest import mock ...@@ -9,6 +9,11 @@ from unittest import mock
from test import support from test import support
try:
import _testcapi
except ImportError as exc:
_testcapi = None
try: try:
import ctypes import ctypes
except ImportError: except ImportError:
...@@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): ...@@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
@support.cpython_only @support.cpython_only
def test_basics_capi(self): def test_basics_capi(self):
from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
s = "abc123" # all codecs should be able to encode these s = "abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings: for encoding in all_unicode_encodings:
if encoding not in broken_unicode_with_stateful: if encoding not in broken_unicode_with_stateful:
# check incremental decoder/encoder (fetched via the C API) # check incremental decoder/encoder (fetched via the C API)
try: try:
cencoder = codec_incrementalencoder(encoding) cencoder = _testcapi.codec_incrementalencoder(encoding)
except LookupError: # no IncrementalEncoder except LookupError: # no IncrementalEncoder
pass pass
else: else:
...@@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): ...@@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
for c in s: for c in s:
encodedresult += cencoder.encode(c) encodedresult += cencoder.encode(c)
encodedresult += cencoder.encode("", True) encodedresult += cencoder.encode("", True)
cdecoder = codec_incrementaldecoder(encoding) cdecoder = _testcapi.codec_incrementaldecoder(encoding)
decodedresult = "" decodedresult = ""
for c in encodedresult: for c in encodedresult:
decodedresult += cdecoder.decode(bytes([c])) decodedresult += cdecoder.decode(bytes([c]))
...@@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): ...@@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
if encoding not in ("idna", "mbcs"): if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument # check incremental decoder/encoder with errors argument
try: try:
cencoder = codec_incrementalencoder(encoding, "ignore") cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
except LookupError: # no IncrementalEncoder except LookupError: # no IncrementalEncoder
pass pass
else: else:
encodedresult = b"".join(cencoder.encode(c) for c in s) encodedresult = b"".join(cencoder.encode(c) for c in s)
cdecoder = codec_incrementaldecoder(encoding, "ignore") cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
decodedresult = "".join(cdecoder.decode(bytes([c])) decodedresult = "".join(cdecoder.decode(bytes([c]))
for c in encodedresult) for c in encodedresult)
self.assertEqual(decodedresult, s, self.assertEqual(decodedresult, s,
...@@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase): ...@@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase):
self.assertEqual(data.decode('latin1'), expected) self.assertEqual(data.decode('latin1'), expected)
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test _Py_EncodeLocaleEx() and _Py_DecodeLocaleEx() through the
    _testcapi.EncodeLocaleEx() and _testcapi.DecodeLocaleEx() helpers, and
    thereby indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    The C helpers are expected to agree with Python's own codec for the
    filesystem encoding: the same bytes/str on success, and a RuntimeError
    whenever the Python codec raises a Unicode error.
    """
    ENCODING = sys.getfilesystemencoding()
    # Text samples: pure ASCII, Latin-1, U+00FF, BMP and non-BMP characters,
    # and lone surrogates.
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    # Raw byte samples containing non-ASCII bytes.
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with the C helper (current_locale=0)."""
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Compare the C encoder with str.encode() for every sample string."""
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    # The C encoder must reject the very same string with
                    # the very same error handler.  The error position
                    # depends on the string, so only the message layout is
                    # checked.
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(
                        errmsg, r"^encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string: the helper raises ValueError when the
        # encoder does not implement this error handler.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with the C helper (current_locale=0)."""
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Compare the C decoder with bytes.decode() for every sample."""
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        # Build the byte strings to decode: the raw samples plus every
        # encodable text sample.
        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                # Also cover the surrogatepass encoding when it differs.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty input: the helper raises ValueError when the
        # decoder does not implement this error handler.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args) ...@@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
} }
/* Test helper exposed as _testcapi.EncodeLocaleEx(text[, current_locale[, errors]]).

   Convert the str object 'text' to a wide character string, encode it with
   _Py_EncodeLocaleEx() and return the result as a bytes object.

   Arguments (format "U|is"):
     text:           str to encode
     current_locale: int forwarded to _Py_EncodeLocaleEx(), default 0
     errors:         error handler name, converted with _Py_GetErrorHandler();
                     NULL is passed when the argument is omitted

   Exceptions, by _Py_EncodeLocaleEx() return code:
     -1: MemoryError
     -2: RuntimeError("encode error: pos=..., reason=...")
     -3: ValueError("unsupported error handler")
     anything else: ValueError("unknown error code") */
static PyObject *
encode_locale_ex(PyObject *self, PyObject *args)
{
    PyObject *unicode;
    int current_locale = 0;
    wchar_t *wstr;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
        return NULL;
    }
    wstr = PyUnicode_AsWideCharString(unicode, NULL);
    if (wstr == NULL) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    char *str = NULL;
    size_t error_pos;
    const char *reason = NULL;
    int ret = _Py_EncodeLocaleEx(wstr,
                                 &str, &error_pos, &reason,
                                 current_locale, error_handler);
    /* The wide string is no longer needed, whatever the outcome. */
    PyMem_Free(wstr);

    switch(ret) {
    case 0:
        res = PyBytes_FromString(str);
        /* _Py_EncodeLocaleEx() allocates its result with the raw allocator. */
        PyMem_RawFree(str);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
                     error_pos, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        /* Message typo fixed: was "unknow error code". */
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}
/* Test helper exposed as _testcapi.DecodeLocaleEx(data[, current_locale[, errors]]).

   Decode the bytes object 'data' with _Py_DecodeLocaleEx() and return the
   result as a str object.

   Arguments (format "y|is"):
     data:           bytes to decode
     current_locale: int forwarded to _Py_DecodeLocaleEx(), default 0
     errors:         error handler name, converted with _Py_GetErrorHandler();
                     NULL is passed when the argument is omitted

   Exceptions, by _Py_DecodeLocaleEx() return code:
     -1: MemoryError
     -2: RuntimeError("decode error: pos=..., reason=...")
     -3: ValueError("unsupported error handler")
     anything else: ValueError("unknown error code") */
static PyObject *
decode_locale_ex(PyObject *self, PyObject *args)
{
    char *str;
    int current_locale = 0;
    PyObject *res = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
        return NULL;
    }
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    wchar_t *wstr = NULL;
    size_t wlen = 0;
    const char *reason = NULL;
    int ret = _Py_DecodeLocaleEx(str,
                                 &wstr, &wlen, &reason,
                                 current_locale, error_handler);

    switch(ret) {
    case 0:
        res = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
        break;
    case -1:
        PyErr_NoMemory();
        break;
    case -2:
        /* On a decoding error, wlen is reported as the error position. */
        PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
                     wlen, reason);
        break;
    case -3:
        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
        break;
    default:
        /* Message typo fixed: was "unknow error code". */
        PyErr_SetString(PyExc_ValueError, "unknown error code");
        break;
    }
    return res;
}
static PyMethodDef TestMethods[] = { static PyMethodDef TestMethods[] = {
{"raise_exception", raise_exception, METH_VARARGS}, {"raise_exception", raise_exception, METH_VARARGS},
{"raise_memoryerror", raise_memoryerror, METH_NOARGS}, {"raise_memoryerror", raise_memoryerror, METH_NOARGS},
...@@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = { ...@@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
{"get_mapping_items", get_mapping_items, METH_O}, {"get_mapping_items", get_mapping_items, METH_O},
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS}, {"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
{"hamt", new_hamt, METH_NOARGS}, {"hamt", new_hamt, METH_NOARGS},
{"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
{NULL, NULL} /* sentinel */ {NULL, NULL} /* sentinel */
}; };
......
...@@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, ...@@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_ssize_t startpos, endpos, newpos; Py_ssize_t startpos, endpos, newpos;
Py_ssize_t k; Py_ssize_t k;
if (error_handler == _Py_ERROR_UNKNOWN) { if (error_handler == _Py_ERROR_UNKNOWN) {
error_handler = get_error_handler(errors); error_handler = _Py_GetErrorHandler(errors);
} }
startpos = i-1; startpos = i-1;
......
This diff is collapsed.
...@@ -82,14 +82,6 @@ main(int argc, char *argv[]) ...@@ -82,14 +82,6 @@ main(int argc, char *argv[])
/* Don't install importlib, since it could execute outdated bytecode. */ /* Don't install importlib, since it could execute outdated bytecode. */
config._install_importlib = 0; config._install_importlib = 0;
config._frozen = 1; config._frozen = 1;
#ifdef MS_WINDOWS
/* bpo-34523: initfsencoding() is not called if _install_importlib=0,
so interp->fscodec_initialized value remains 0.
PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error
handler in such case, whereas it's the default error handler on Windows.
Force the "strict" error handler to work around this bootstrap issue. */
config.filesystem_errors = "strict";
#endif
_PyInitError err = _Py_InitializeFromConfig(&config); _PyInitError err = _Py_InitializeFromConfig(&config);
/* No need to call _PyCoreConfig_Clear() since we didn't allocate any /* No need to call _PyCoreConfig_Clear() since we didn't allocate any
......
...@@ -32,6 +32,24 @@ extern int winerror_to_errno(int); ...@@ -32,6 +32,24 @@ extern int winerror_to_errno(int);
int _Py_open_cloexec_works = -1; int _Py_open_cloexec_works = -1;
#endif #endif
/* Translate an error handler into the boolean "surrogateescape" flag.

   Store 0 into *surrogateescape for _Py_ERROR_STRICT, or 1 for
   _Py_ERROR_SURROGATEESCAPE, and return 0.  For any other handler, return -1
   and leave *surrogateescape untouched. */
static int
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
{
    if (errors == _Py_ERROR_STRICT) {
        *surrogateescape = 0;
        return 0;
    }
    if (errors == _Py_ERROR_SURROGATEESCAPE) {
        *surrogateescape = 1;
        return 0;
    }
    /* every other error handler is unsupported here */
    return -1;
}
PyObject * PyObject *
_Py_device_encoding(int fd) _Py_device_encoding(int fd)
{ {
...@@ -215,12 +233,17 @@ _Py_GetForceASCII(void) ...@@ -215,12 +233,17 @@ _Py_GetForceASCII(void)
static int static int
encode_ascii(const wchar_t *text, char **str, encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason, size_t *error_pos, const char **reason,
int raw_malloc, int surrogateescape) int raw_malloc, _Py_error_handler errors)
{ {
char *result = NULL, *out; char *result = NULL, *out;
size_t len, i; size_t len, i;
wchar_t ch; wchar_t ch;
int surrogateescape;
if (get_surrogateescape(errors, &surrogateescape) < 0) {
return -3;
}
len = wcslen(text); len = wcslen(text);
/* +1 for NULL byte */ /* +1 for NULL byte */
...@@ -278,13 +301,18 @@ _Py_GetForceASCII(void) ...@@ -278,13 +301,18 @@ _Py_GetForceASCII(void)
#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
static int static int
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
const char **reason, int surrogateescape) const char **reason, _Py_error_handler errors)
{ {
wchar_t *res; wchar_t *res;
unsigned char *in; unsigned char *in;
wchar_t *out; wchar_t *out;
size_t argsize = strlen(arg) + 1; size_t argsize = strlen(arg) + 1;
int surrogateescape;
if (get_surrogateescape(errors, &surrogateescape) < 0) {
return -3;
}
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
return -1; return -1;
} }
...@@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, ...@@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
static int static int
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
const char **reason, int surrogateescape) const char **reason, _Py_error_handler errors)
{ {
wchar_t *res; wchar_t *res;
size_t argsize; size_t argsize;
...@@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, ...@@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
mbstate_t mbs; mbstate_t mbs;
#endif #endif
int surrogateescape;
if (get_surrogateescape(errors, &surrogateescape) < 0) {
return -3;
}
#ifdef HAVE_BROKEN_MBSTOWCS #ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of /* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that * mbstowcs which does not count the characters that
...@@ -456,7 +489,7 @@ decode_error: ...@@ -456,7 +489,7 @@ decode_error:
/* Cannot use C locale for escaping; manually escape as if charset /* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes > 128. This will still roundtrip is ASCII (i.e. escape all bytes > 128. This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */ correctly in the locale's charset, which must be an ASCII superset. */
return decode_ascii(arg, wstr, wlen, reason, surrogateescape); return decode_ascii(arg, wstr, wlen, reason, errors);
#endif /* HAVE_MBRTOWC */ #endif /* HAVE_MBRTOWC */
} }
...@@ -479,33 +512,35 @@ decode_error: ...@@ -479,33 +512,35 @@ decode_error:
invalid byte sequence in the input string into *wlen. If reason is not NULL, invalid byte sequence in the input string into *wlen. If reason is not NULL,
write the decoding error message into *reason. write the decoding error message into *reason.
Return -3 if the error handler 'errors' is not supported.
Use the Py_EncodeLocaleEx() function to encode the character string back to Use the Py_EncodeLocaleEx() function to encode the character string back to
a byte string. */ a byte string. */
int int
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
const char **reason, const char **reason,
int current_locale, int surrogateescape) int current_locale, _Py_error_handler errors)
{ {
if (current_locale) { if (current_locale) {
#ifdef __ANDROID__ #ifdef __ANDROID__
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
surrogateescape); errors);
#else #else
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); return decode_current_locale(arg, wstr, wlen, reason, errors);
#endif #endif
} }
#if defined(__APPLE__) || defined(__ANDROID__) #if defined(__APPLE__) || defined(__ANDROID__)
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
surrogateescape); errors);
#else #else
int use_utf8 = (Py_UTF8Mode == 1); int use_utf8 = (Py_UTF8Mode == 1);
#ifdef MS_WINDOWS #ifdef MS_WINDOWS
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
#endif #endif
if (use_utf8) { if (use_utf8) {
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
reason, surrogateescape); errors);
} }
#ifdef USE_FORCE_ASCII #ifdef USE_FORCE_ASCII
...@@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, ...@@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
if (force_ascii) { if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */ /* force ASCII encoding to workaround mbstowcs() issue */
return decode_ascii(arg, wstr, wlen, reason, surrogateescape); return decode_ascii(arg, wstr, wlen, reason, errors);
} }
#endif #endif
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); return decode_current_locale(arg, wstr, wlen, reason, errors);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
...@@ -547,8 +582,11 @@ wchar_t* ...@@ -547,8 +582,11 @@ wchar_t*
Py_DecodeLocale(const char* arg, size_t *wlen) Py_DecodeLocale(const char* arg, size_t *wlen)
{ {
wchar_t *wstr; wchar_t *wstr;
int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); int res = _Py_DecodeLocaleEx(arg, &wstr, wlen,
NULL, 0,
_Py_ERROR_SURROGATEESCAPE);
if (res != 0) { if (res != 0) {
assert(res != -3);
if (wlen != NULL) { if (wlen != NULL) {
*wlen = (size_t)res; *wlen = (size_t)res;
} }
...@@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen) ...@@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen)
static int static int
encode_current_locale(const wchar_t *text, char **str, encode_current_locale(const wchar_t *text, char **str,
size_t *error_pos, const char **reason, size_t *error_pos, const char **reason,
int raw_malloc, int surrogateescape) int raw_malloc, _Py_error_handler errors)
{ {
const size_t len = wcslen(text); const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL; char *result = NULL, *bytes = NULL;
size_t i, size, converted; size_t i, size, converted;
wchar_t c, buf[2]; wchar_t c, buf[2];
int surrogateescape;
if (get_surrogateescape(errors, &surrogateescape) < 0) {
return -3;
}
/* The function works in two steps: /* The function works in two steps:
1. compute the length of the output buffer in bytes (size) 1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */ 2. outputs the bytes */
...@@ -646,32 +689,50 @@ encode_error: ...@@ -646,32 +689,50 @@ encode_error:
return -2; return -2;
} }
/* Encode a string to the locale encoding.
Parameters:
* raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead
of PyMem_Malloc().
* current_locale: if non-zero, use the current LC_CTYPE, otherwise use
Python filesystem encoding.
* errors: error handler like "strict" or "surrogateescape".
Return value:
0: success, *str is set to a newly allocated encoded string.
-1: memory allocation failure
-2: encoding error, set *error_pos and *reason (if set).
-3: the error handler 'errors' is not supported.
*/
static int static int
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
const char **reason, const char **reason,
int raw_malloc, int current_locale, int surrogateescape) int raw_malloc, int current_locale, _Py_error_handler errors)
{ {
if (current_locale) { if (current_locale) {
#ifdef __ANDROID__ #ifdef __ANDROID__
return _Py_EncodeUTF8Ex(text, str, error_pos, reason, return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
#else #else
return encode_current_locale(text, str, error_pos, reason, return encode_current_locale(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
#endif #endif
} }
#if defined(__APPLE__) || defined(__ANDROID__) #if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeUTF8Ex(text, str, error_pos, reason, return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
#else /* __APPLE__ */ #else
int use_utf8 = (Py_UTF8Mode == 1); int use_utf8 = (Py_UTF8Mode == 1);
#ifdef MS_WINDOWS #ifdef MS_WINDOWS
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
#endif #endif
if (use_utf8) { if (use_utf8) {
return _Py_EncodeUTF8Ex(text, str, error_pos, reason, return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
} }
#ifdef USE_FORCE_ASCII #ifdef USE_FORCE_ASCII
...@@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, ...@@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
if (force_ascii) { if (force_ascii) {
return encode_ascii(text, str, error_pos, reason, return encode_ascii(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
} }
#endif #endif
return encode_current_locale(text, str, error_pos, reason, return encode_current_locale(text, str, error_pos, reason,
raw_malloc, surrogateescape); raw_malloc, errors);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
...@@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos, ...@@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos,
{ {
char *str; char *str;
int res = encode_locale_ex(text, &str, error_pos, NULL, int res = encode_locale_ex(text, &str, error_pos, NULL,
raw_malloc, current_locale, 1); raw_malloc, current_locale,
_Py_ERROR_SURROGATEESCAPE);
if (res != -2 && error_pos) { if (res != -2 && error_pos) {
*error_pos = (size_t)-1; *error_pos = (size_t)-1;
} }
...@@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) ...@@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
int int
_Py_EncodeLocaleEx(const wchar_t *text, char **str, _Py_EncodeLocaleEx(const wchar_t *text, char **str,
size_t *error_pos, const char **reason, size_t *error_pos, const char **reason,
int current_locale, int surrogateescape) int current_locale, _Py_error_handler errors)
{ {
return encode_locale_ex(text, str, error_pos, reason, 1, return encode_locale_ex(text, str, error_pos, reason, 1,
current_locale, surrogateescape); current_locale, errors);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment