Commit f933e1ab authored by Victor Stinner

Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead of
the locale encoding. If the LANG, LC_ALL and LC_CTYPE environment variables
are not set, the locale encoding is ISO-8859-1, whereas most programs (including
Python) expect UTF-8. Python already uses UTF-8 for the filesystem encoding and
to encode command line arguments on this OS.
parent 073f759d
...@@ -148,6 +148,38 @@ class CmdLineTest(unittest.TestCase): ...@@ -148,6 +148,38 @@ class CmdLineTest(unittest.TestCase):
if not stdout.startswith(pattern): if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern)) raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
@unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
def test_osx_utf8(self):
    """Check that argv is decoded from UTF-8 on Mac OS X.

    A child interpreter is started with LC_ALL=C and asked to print the
    ascii() form of its first argument.  The output must match the
    argument decoded as UTF-8 with the surrogateescape error handler,
    for both well-formed and malformed input.
    """
    def check_output(text):
        # What the child is expected to print for this argument.
        expected = ascii(text.decode('utf8', 'surrogateescape'))
        expected = expected.encode('ascii') + b'\n'

        # C locale gives ASCII locale encoding, but Python uses UTF-8
        # to parse the command line arguments on Mac OS X
        child_env = os.environ.copy()
        child_env['LC_ALL'] = 'C'

        command = (
            sys.executable,
            "-c",
            "import sys; print(ascii(sys.argv[1]))",
            text,
        )
        child = subprocess.Popen(
            command, stdout=subprocess.PIPE, env=child_env)
        stdout, _ = child.communicate()
        self.assertEqual(stdout, expected)
        self.assertEqual(child.returncode, 0)

    valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
    invalid_utf8 = (
        b'\xff'          # invalid byte
        b'\xc3\xa9'      # valid utf-8 character
        b'\xc3\xff'      # invalid byte sequence
        b'\xed\xa0\x80'  # lone surrogate character (invalid)
    )
    # Same order as before: valid input first, then the malformed one.
    for sample in (valid_utf8, invalid_utf8):
        check_output(sample)
def test_unbuffered_output(self): def test_unbuffered_output(self):
# Test expected operation of the '-u' switch # Test expected operation of the '-u' switch
for stream in ('stdout', 'stderr'): for stream in ('stdout', 'stderr'):
......
...@@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1? ...@@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
of the locale encoding. If the LANG, LC_ALL and LC_CTYPE environment
variables are not set, the locale encoding is ISO-8859-1, whereas most programs
(including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
encoding and to encode command line arguments on this OS.
- Issue #9713, #10114: Parser functions (eg. PyParser_ASTFromFile) expects - Issue #9713, #10114: Parser functions (eg. PyParser_ASTFromFile) expects
filenames encoded to the filesystem encoding with surrogateescape error filenames encoded to the filesystem encoding with surrogateescape error
handler (to support undecodable bytes), instead of UTF-8 in strict mode. handler (to support undecodable bytes), instead of UTF-8 in strict mode.
......
...@@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv) ...@@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
} }
#else #else
#ifdef __APPLE__
/* UTF-8 decoder using the surrogateescape error handler.  main() uses it
   on this platform to decode argv instead of _Py_char2wchar(). */
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
#endif
int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
...@@ -41,7 +45,11 @@ main(int argc, char **argv) ...@@ -41,7 +45,11 @@ main(int argc, char **argv)
oldloc = strdup(setlocale(LC_ALL, NULL)); oldloc = strdup(setlocale(LC_ALL, NULL));
setlocale(LC_ALL, ""); setlocale(LC_ALL, "");
for (i = 0; i < argc; i++) { for (i = 0; i < argc; i++) {
#ifdef __APPLE__
argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
#else
argv_copy[i] = _Py_char2wchar(argv[i], NULL); argv_copy[i] = _Py_char2wchar(argv[i], NULL);
#endif
if (!argv_copy[i]) if (!argv_copy[i])
return 1; return 1;
argv_copy2[i] = argv_copy[i]; argv_copy2[i] = argv_copy[i];
......
...@@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
#undef ASCII_CHAR_MASK #undef ASCII_CHAR_MASK
#ifdef __APPLE__
/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decode the `size` bytes starting at `s` into a newly allocated,
   NUL-terminated wchar_t string obtained from PyMem_Malloc().

   Well-formed UTF-8 sequences are decoded to their code point; when
   SIZEOF_WCHAR_T is not 4, code points above U+FFFF are written as a
   surrogate pair.  Any byte that does not start a well-formed sequence
   (invalid lead byte, bad continuation byte, overlong form, encoded
   surrogate, value above U+10FFFF, or truncated sequence) is stored as
   the single code unit 0xDC00 + byte and decoding resumes at the next
   byte.

   Return NULL on failure.  NOTE(review): only the size check below sets
   an exception (PyErr_NoMemory); a failed PyMem_Malloc() returns NULL
   without setting one -- confirm which behaviour callers expect. */
wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Each input byte produces at most one output code unit, so
       size + 1 units (including the terminating NUL) are always enough.
       The comparison is done on size itself, in unsigned arithmetic, so
       that "size + 1" cannot overflow and a negative size is rejected
       instead of leading to an undersized allocation. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch holds the lead byte until a sequence has been validated;
           the surrogateescape path relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* Truncated sequence: compare lengths rather than pointers so
           that no pointer beyond the end of the buffer is formed. */
        if (e - s < n) {
            goto surrogateescape;
        }

        switch (n) {

        case 0:
        case 1:
            /* invalid lead byte */
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* The buffer holds wchar_t, so cast to wchar_t like every
               other branch (not to Py_UNICODE). */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            /* Reject overlong forms (F0 80..8F) and values above
               U+10FFFF (F4 90..BF). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch is still the offending lead byte here: escape that single
           byte and restart decoding at the following one. */
        *p++ = (wchar_t)(0xDC00 + ch);
        s++;
    }
    *p = L'\0';
    return unicode;
}
#endif /* __APPLE__ */
/* Allocation strategy: if the string is short, convert into a stack buffer /* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the and allocate exactly as much space needed at the end. Else allocate the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment