Commit f2ea71fc authored by Victor Stinner's avatar Victor Stinner

Issue #13560: Add PyUnicode_EncodeLocale()

 * Use PyUnicode_EncodeLocale() in time.strftime() if wcsftime() is not
   available
 * Document my last changes in Misc/NEWS
parent 9987d935
......@@ -713,7 +713,7 @@ system.
bytes. If a byte sequence can be decoded as a surrogate character and
*surrogateescape* is not equal to zero, the byte sequence is escaped using
the ``'surrogateescape'`` error handler instead of being decoded. *str*
must end with a null character but cannot contain embedded null character.
must end with a null character but cannot contain embedded null characters.
.. seealso::
......@@ -732,6 +732,22 @@ system.
.. versionadded:: 3.3
.. c:function:: PyObject* PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
Encode a Unicode object to the current locale encoding. The encoder is
strict if *surrogateescape* is equal to zero, otherwise it uses the
``'surrogateescape'`` error handler (:pep:`383`). Return a :class:`bytes`
object. *unicode* cannot contain embedded null characters.
.. seealso::
Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
.. versionadded:: 3.3
File System Encoding
""""""""""""""""""""
......@@ -806,6 +822,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
.. seealso::
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
locale encoding and cannot be modified later. If you need to encode a
string to the current locale encoding, use
:c:func:`PyUnicode_EncodeLocale`.
.. versionadded:: 3.2
......
......@@ -1603,7 +1603,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
be decoded as a surrogate character and *surrogateescape* is not equal to
zero, the byte sequence is escaped using the 'surrogateescape' error handler
instead of being decoded. *str* must end with a null character but cannot
contain embedded null character. */
contain embedded null characters. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
const char *str,
......@@ -1617,6 +1617,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
const char *str,
int surrogateescape);
/* Encode a Unicode object to the current locale encoding. The encoder is
strict if *surrogateescape* is equal to zero, otherwise the
"surrogateescape" error handler is used. Return a bytes object. The string
cannot contain embedded null characters. */
PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
PyObject *unicode,
int surrogateescape
);
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
......
......@@ -419,6 +419,10 @@ Core and Builtins
Library
-------
- Issue #13560: Add PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize()
and PyUnicode_EncodeLocale() functions to the C API to decode/encode from/to
the current locale encoding.
- Issue #8373: The filesystem path of AF_UNIX sockets now uses the filesystem
encoding and the surrogateescape error handler, rather than UTF-8. Patch
by David Watson.
......@@ -451,8 +455,8 @@ Library
'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath',
and private attributes of 'smtpd.SMTPChannel'.
- Issue #5905: time.strftime() is now using the locale encoding, instead of
UTF-8, if the wcsftime() function is not available.
- Issue #5905, #13560: time.strftime() is now using the current locale
encoding, instead of UTF-8, if the wcsftime() function is not available.
- Issue #8641: Update IDLE 3 syntax coloring to recognize b".." and not u"..".
Patch by Tal Einat.
......
......@@ -486,7 +486,7 @@ time_strftime(PyObject *self, PyObject *args)
fmt = format;
#else
/* Encode the unicode format string to bytes using the current locale encoding */
format = PyUnicode_EncodeFSDefault(format_arg);
format = PyUnicode_EncodeLocale(format_arg, 1);
if (format == NULL)
return NULL;
fmt = PyBytes_AS_STRING(format);
......
......@@ -3073,6 +3073,140 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL;
}
/* Find the offset (in wchar_t units, counted from the start of wstr) of the
   first wide character that wcstombs() refuses to encode.

   The string is fed to wcstombs() one character at a time; when wchar_t is
   16 bits wide, a high/low surrogate pair is submitted together as a single
   unit. errno is saved on entry and restored before returning, so the value
   left by an earlier failing conversion is still available to the caller.

   Returns 0 if no unencodable character is found (which is indistinguishable
   from a failure at offset 0). */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds one unit to convert, followed by a terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const int saved_errno = errno;
    const wchar_t *cursor = wstr;
    size_t position = 0;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        /* Keep a surrogate pair together; otherwise convert a single unit. */
        int is_pair = Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
                      && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]);
        chunk[0] = cursor[0];
        chunk[1] = is_pair ? cursor[1] : 0;
        cursor += is_pair ? 2 : 1;
#else
        chunk[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1) {
            /* This unit is the culprit: report where it started. */
            position = unit_start - wstr;
            break;
        }
    }

    /* The probing conversions above may have modified errno. */
    errno = saved_errno;
    return position;
}
/* Encode a Unicode object to a bytes object using the current locale
   encoding.

   unicode: the object to encode; it is first converted to a wide character
       string, which must not contain an embedded null character
       (TypeError is set otherwise).
   surrogateescape: if zero, the conversion is done with wcstombs();
       otherwise _Py_wchar2char() is used (the "surrogateescape" mode).

   Returns the bytes object, or NULL with an exception set. When a character
   cannot be encoded, the error is reported through raise_encode_exception()
   with the "locale" encoding name and a message taken from strerror(errno). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
{
    PyObject *result = NULL;
    PyObject *exc = NULL;
    wchar_t *wide;
    Py_ssize_t wide_len;
    size_t error_pos;
    char *errmsg;

    wide = PyUnicode_AsWideCharString(unicode, &wide_len);
    if (wide == NULL)
        return NULL;

    /* wcslen() stops at the first null: a shorter length than the one
       reported by the conversion means there is a null inside the string. */
    if ((Py_ssize_t)wcslen(wide) != wide_len) {
        PyMem_Free(wide);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (!surrogateescape) {
        /* strict mode: measure, allocate, then convert in place */
        size_t needed, written;

        needed = wcstombs(NULL, wide, 0);
        if (needed == (size_t)-1) {
            error_pos = wcstombs_errorpos(wide);
            goto encode_error;
        }

        result = PyBytes_FromStringAndSize(NULL, needed);
        if (result == NULL) {
            PyMem_Free(wide);
            return NULL;
        }

        /* needed+1 leaves room for the trailing null byte */
        written = wcstombs(PyBytes_AS_STRING(result), wide, needed + 1);
        if (written == (size_t)-1 || written > needed) {
            error_pos = wcstombs_errorpos(wide);
            goto encode_error;
        }

        PyMem_Free(wide);
        return result;
    }
    else {
        /* locale encoding with surrogateescape */
        char *encoded = _Py_wchar2char(wide, &error_pos);

        if (encoded == NULL) {
            /* (size_t)-1 is handled as a memory error; any other value is
               the position of the character that could not be encoded */
            if (error_pos != (size_t)-1)
                goto encode_error;
            PyErr_NoMemory();
            PyMem_Free(wide);
            return NULL;
        }

        PyMem_Free(wide);
        result = PyBytes_FromString(encoded);
        PyMem_Free(encoded);
        return result;
    }

encode_error:
    /* Read errno before anything else gets a chance to overwrite it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);
    if (errmsg == NULL)
        errmsg = "wcstombs() encountered an unencodable wide character";

    PyMem_Free(wide);
    Py_XDECREF(result);

    raise_encode_exception(&exc,
                           "locale", unicode,
                           error_pos, error_pos+1,
                           errmsg);
    Py_XDECREF(exc);
    return NULL;
}
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
......@@ -3097,38 +3231,7 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
"surrogateescape");
}
else {
/* locale encoding with surrogateescape */
wchar_t *wchar;
char *bytes;
PyObject *bytes_obj;
size_t error_pos;
wchar = PyUnicode_AsWideCharString(unicode, NULL);
if (wchar == NULL)
return NULL;
bytes = _Py_wchar2char(wchar, &error_pos);
if (bytes == NULL) {
if (error_pos != (size_t)-1) {
char *errmsg = strerror(errno);
PyObject *exc = NULL;
if (errmsg == NULL)
errmsg = "Py_wchar2char() failed";
raise_encode_exception(&exc,
"filesystemencoding", unicode,
error_pos, error_pos+1,
errmsg);
Py_XDECREF(exc);
}
else
PyErr_NoMemory();
PyMem_Free(wchar);
return NULL;
}
PyMem_Free(wchar);
bytes_obj = PyBytes_FromString(bytes);
PyMem_Free(bytes);
return bytes_obj;
return PyUnicode_EncodeLocale(unicode, 1);
}
#endif
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment