(Merge 3.2) Issue #16416: On Mac OS X, operating system data are now always encoded/decoded to/from UTF-8/surrogateescape, instead of the locale encoding (which may be ASCII if no locale environment variable is set), to avoid inconsistencies with the os.fsencode() and os.fsdecode() functions, which already use UTF-8/surrogateescape.

(Merge 3.2) Issue #16416: On Mac OS X, operating system data are now always
encoded/decoded to/from UTF-8/surrogateescape, instead of the locale encoding (which may be ASCII if no locale environment variable is set), to avoid inconsistencies with os.fsencode() and os.fsdecode() functions which are already using UTF-8/surrogateescape.
2660e427 · Victor Stinner · a2816c2b · 27b1ca29 · 2660e427 · 2660e427
Commit 2660e427 authored Dec 03, 2012 by Victor Stinner
Show whitespace changes
Inline Side-by-side

Showing with 65 additions and 18 deletions

Misc/NEWS Misc/NEWS +6 -0

Modules/python.c Modules/python.c +0 -8

Objects/unicodeobject.c Objects/unicodeobject.c +5 -4

Python/fileutils.c Python/fileutils.c +54 -6

No files found.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,12 @@ What's New in Python 3.3.1?
 Core and Builtins
 -----------------
+- Issue #16416: On Mac OS X, operating system data are now always
+  encoded/decoded to/from UTF-8/surrogateescape, instead of the locale encoding
+  (which may be ASCII if no locale environment variable is set), to avoid
+  inconsistencies with os.fsencode() and os.fsdecode() functions which are
+  already using UTF-8/surrogateescape.
 - Issue #16588: Silence unused-but-set warnings in Python/thread_pthread
 - Issue #16546: Fix: ast.YieldFrom argument is now mandatory.

--- a/Modules/python.c
+++ b/Modules/python.c
@@ -15,10 +15,6 @@ wmain(int argc, wchar_t **argv)
 }
 #else
-#ifdef __APPLE__
-extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
-#endif
 int
 main(int argc, char **argv)
 {
@@ -45,11 +41,7 @@ main(int argc, char **argv)
    oldloc = strdup(setlocale(LC_ALL, NULL));
    setlocale(LC_ALL, "");
    for (i = 0; i < argc; i++) {
-#ifdef __APPLE__
-        argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
-#else
        argv_copy[i] = _Py_char2wchar(argv[i], NULL);
-#endif
        if (!argv_copy[i]) {
            free(oldloc);
            fprintf(stderr, "Fatal Python error: "

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4809,7 +4809,10 @@ onError:
 #ifdef __APPLE__
 /* Simplified UTF-8 decoder using surrogateescape error handler,
-   used to decode the command line arguments on Mac OS X. */
+   used to decode the command line arguments on Mac OS X.
+   Return a pointer to a newly allocated wide character string (use
+   PyMem_Free() to free the memory), or NULL on memory allocation error. */
 wchar_t*
 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
@@ -4820,10 +4823,8 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
    /* Note: size will always be longer than the resulting Unicode
       character count */
-    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
-        PyErr_NoMemory();
        return NULL;
-    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -8,6 +8,10 @@
 #include <langinfo.h>
 #endif
+#ifdef __APPLE__
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
+#endif
 PyObject *
 _Py_device_encoding(int fd)
 {
@@ -60,6 +64,17 @@ _Py_device_encoding(int fd)
 wchar_t*
 _Py_char2wchar(const char* arg, size_t *size)
 {
+#ifdef __APPLE__
+    wchar_t *wstr;
+    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
+    if (size != NULL) {
+        if (wstr != NULL)
+            *size = wcslen(wstr);
+        else
+            *size = (size_t)-1;
+    }
+    return wstr;
+#else
    wchar_t *res;
 #ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
@@ -145,7 +160,7 @@ _Py_char2wchar(const char* arg, size_t *size)
        argsize -= converted;
        out++;
    }
-#else
+#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
@@ -160,7 +175,7 @@ _Py_char2wchar(const char* arg, size_t *size)
        else
            *out++ = 0xdc00 + *in++;
    *out = 0;
-#endif
+#endif   /* HAVE_MBRTOWC */
    if (size != NULL)
        *size = out - res;
    return res;
@@ -168,6 +183,7 @@ oom:
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
+#endif   /* __APPLE__ */
 }
 /* Encode a (wide) character string to the locale encoding with the
@@ -184,14 +200,42 @@ oom:
 char*
 _Py_wchar2char(const wchar_t *text, size_t *error_pos)
 {
+#ifdef __APPLE__
+    Py_ssize_t len;
+    PyObject *unicode, *bytes = NULL;
+    char *cpath;
+    unicode = PyUnicode_FromWideChar(text, wcslen(text));
+    if (unicode == NULL)
+        return NULL;
+    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
+    Py_DECREF(unicode);
+    if (bytes == NULL) {
+        PyErr_Clear();
+        if (error_pos != NULL)
+            *error_pos = (size_t)-1;
+        return NULL;
+    }
+    len = PyBytes_GET_SIZE(bytes);
+    cpath = PyMem_Malloc(len+1);
+    if (cpath == NULL) {
+        PyErr_Clear();
+        Py_DECREF(bytes);
+        if (error_pos != NULL)
+            *error_pos = (size_t)-1;
+        return NULL;
+    }
+    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
+    Py_DECREF(bytes);
+    return cpath;
+#else   /* __APPLE__ */
    const size_t len = wcslen(text);
    char *result = NULL, *bytes = NULL;
    size_t i, size, converted;
    wchar_t c, buf[2];
-    if (error_pos != NULL)
-        *error_pos = (size_t)-1;
    /* The function works in two steps:
       1. compute the length of the output buffer in bytes (size)
       2. outputs the bytes */
@@ -238,11 +282,15 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos)
        size += 1; /* nul byte at the end */
        result = PyMem_Malloc(size);
-        if (result == NULL)
+        if (result == NULL) {
+            if (error_pos != NULL)
+                *error_pos = (size_t)-1;
            return NULL;
+        }
        bytes = result;
    }
    return result;
+#endif   /* __APPLE__ */
 }
 /* In principle, this should use HAVE__WSTAT, and _wstat