Commit 9454060e authored by Victor Stinner, committed by GitHub

bpo-29240, bpo-32030: Py_Main() re-reads config if encoding changes (#4899)

bpo-29240, bpo-32030: If the encoding changes (C locale coerced or
UTF-8 Mode changed), Py_Main() now re-reads the configuration with
the new encoding.

Changes:

* Add _Py_UnixMain() called by main().
* Rename pymain_free_pymain() to pymain_clear_pymain(); it can now be
  called multiple times.
* Rename pymain_parse_cmdline_envvars() to pymain_read_conf().
* Py_Main() now clears orig_argc and orig_argv at exit.
* Remove argv_copy2: Py_Main() doesn't modify argv anymore, so there is
  no longer any need to keep two copies of the wchar_t** argv.
* _PyCoreConfig: add coerce_c_locale and coerce_c_locale_warn.
* Py_UTF8Mode is now initialized to -1.
* Locale coercion (PEP 538) now respects -I and -E options.
parent e796b2fe
......@@ -779,9 +779,7 @@ conflict.
If set to the value ``0``, causes the main Python command line application
to skip coercing the legacy ASCII-based C locale to a more capable UTF-8
based alternative. Note that this setting is checked even when the
:option:`-E` or :option:`-I` options are used, as it is handled prior to
the processing of command line options.
based alternative.
If this variable is *not* set, or is set to a value other than ``0``, and
the current locale reported for the ``LC_CTYPE`` category is the default
......
......@@ -105,6 +105,9 @@ PyAPI_FUNC(int) Py_FdIsInteractive(FILE *, const char *);
/* Bootstrap __main__ (defined in Modules/main.c) */
PyAPI_FUNC(int) Py_Main(int argc, wchar_t **argv);
#ifdef Py_BUILD_CORE
PyAPI_FUNC(int) _Py_UnixMain(int argc, char **argv);
#endif
/* In getpath.c */
PyAPI_FUNC(wchar_t *) Py_GetProgramFullPath(void);
......@@ -194,7 +197,7 @@ PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size);
/* Legacy locale support */
#ifndef Py_LIMITED_API
PyAPI_FUNC(void) _Py_CoerceLegacyLocale(void);
PyAPI_FUNC(void) _Py_CoerceLegacyLocale(const _PyCoreConfig *config);
PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void);
PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category);
#endif
......
......@@ -38,7 +38,10 @@ typedef struct {
int show_alloc_count; /* -X showalloccount */
int dump_refs; /* PYTHONDUMPREFS */
int malloc_stats; /* PYTHONMALLOCSTATS */
int utf8_mode; /* -X utf8 or PYTHONUTF8 environment variable */
int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */
int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */
int utf8_mode; /* -X utf8 or PYTHONUTF8 environment variable,
-1 means unknown */
wchar_t *module_search_path_env; /* PYTHONPATH environment variable */
wchar_t *home; /* PYTHONHOME environment variable,
......@@ -46,7 +49,8 @@ typedef struct {
wchar_t *program_name; /* Program name, see also Py_GetProgramName() */
} _PyCoreConfig;
#define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1}
#define _PyCoreConfig_INIT \
(_PyCoreConfig){.use_hash_seed = -1, .coerce_c_locale = -1, .utf8_mode = -1}
/* Note: _PyCoreConfig_INIT sets other fields to 0/NULL */
/* Placeholders while working on the new configuration API
......
......@@ -65,7 +65,7 @@ def _set_locale_in_subprocess(locale_name):
# If there's no valid CODESET, we expect coercion to be skipped
cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
cmd = cmd_fmt.format(locale_name)
result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
result, py_cmd = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
return result.rc == 0
......@@ -131,7 +131,6 @@ class EncodingDetails(_EncodingDetails):
"""
result, py_cmd = run_python_until_end(
"-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
__isolated=True,
**env_vars
)
if not result.rc == 0:
......@@ -236,6 +235,7 @@ class LocaleConfigurationTests(_LocaleHandlingTestCase):
"LANG": "",
"LC_CTYPE": "",
"LC_ALL": "",
"PYTHONCOERCECLOCALE": "",
}
for env_var in ("LANG", "LC_CTYPE"):
for locale_to_set in AVAILABLE_TARGETS:
......@@ -294,6 +294,7 @@ class LocaleCoercionTests(_LocaleHandlingTestCase):
"LANG": "",
"LC_CTYPE": "",
"LC_ALL": "",
"PYTHONCOERCECLOCALE": "",
}
base_var_dict.update(extra_vars)
for env_var in ("LANG", "LC_CTYPE"):
......
......@@ -551,7 +551,7 @@ class CmdLineTest(unittest.TestCase):
self.assertEqual(out, "True")
# Warnings
code = ("import sys, warnings; "
code = ("import warnings; "
"print(' '.join('%s::%s' % (f[0], f[2].__name__) "
"for f in warnings.filters))")
if Py_DEBUG:
......
......@@ -7,6 +7,7 @@ import os
import sys
import textwrap
import unittest
from test import support
from test.support.script_helper import assert_python_ok, assert_python_failure
......@@ -14,9 +15,11 @@ MS_WINDOWS = (sys.platform == 'win32')
class UTF8ModeTests(unittest.TestCase):
# Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
# variables by default
DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}
DEFAULT_ENV = {
'PYTHONUTF8': '',
'PYTHONLEGACYWINDOWSFSENCODING': '',
'PYTHONCOERCECLOCALE': '0',
}
def posix_locale(self):
loc = locale.setlocale(locale.LC_CTYPE, None)
......@@ -53,7 +56,7 @@ class UTF8ModeTests(unittest.TestCase):
self.assertEqual(out, '0')
if MS_WINDOWS:
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
# and has the priority over -X utf8
out = self.get_output('-X', 'utf8', '-c', code,
PYTHONLEGACYWINDOWSFSENCODING='1')
......@@ -201,6 +204,25 @@ class UTF8ModeTests(unittest.TestCase):
out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
self.assertEqual(out, 'UTF-8 UTF-8')
@unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
def test_cmd_line(self):
arg = 'h\xe9\u20ac'.encode('utf-8')
arg_utf8 = arg.decode('utf-8')
arg_ascii = arg.decode('ascii', 'surrogateescape')
code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'
def check(utf8_opt, expected, **kw):
out = self.get_output('-X', utf8_opt, '-c', code, arg, **kw)
args = out.partition(':')[2].rstrip()
self.assertEqual(args, ascii(expected), out)
check('utf8', [arg_utf8])
if sys.platform == 'darwin' or support.is_android:
c_arg = arg_utf8
else:
c_arg = arg_ascii
check('utf8=0', [c_arg], LC_ALL='C')
if __name__ == "__main__":
unittest.main()
......@@ -112,7 +112,7 @@ extern "C" {
#define DECODE_LOCALE_ERR(NAME, LEN) \
((LEN) == (size_t)-2) \
? _Py_INIT_USER_ERR("cannot decode " #NAME) \
? _Py_INIT_USER_ERR("cannot decode " NAME) \
: _Py_INIT_NO_MEMORY()
typedef struct {
......
This diff is collapsed.
......@@ -17,98 +17,9 @@ wmain(int argc, wchar_t **argv)
#else
static void _Py_NO_RETURN
fatal_error(const char *msg)
{
fprintf(stderr, "Fatal Python error: %s\n", msg);
fflush(stderr);
exit(1);
}
int
main(int argc, char **argv)
{
wchar_t **argv_copy;
/* We need a second copy, as Python might modify the first one. */
wchar_t **argv_copy2;
int i, status;
char *oldloc;
_PyInitError err = _PyRuntime_Initialize();
if (_Py_INIT_FAILED(err)) {
fatal_error(err.msg);
}
/* Force default allocator, to be able to release memory above
with a known allocator. */
_PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, NULL);
argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
if (!argv_copy || !argv_copy2) {
fatal_error("out of memory");
return 1;
}
/* 754 requires that FP exceptions run in "no stop" mode by default,
* and until C vendors implement C99's ways to control FP exceptions,
* Python requires non-stop mode. Alas, some platforms enable FP
* exceptions by default. Here we disable them.
*/
#ifdef __FreeBSD__
fedisableexcept(FE_OVERFLOW);
#endif
oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL));
if (!oldloc) {
fatal_error("out of memory");
return 1;
}
/* Reconfigure the locale to the default for this process */
_Py_SetLocaleFromEnv(LC_ALL);
/* The legacy C locale assumes ASCII as the default text encoding, which
* causes problems not only for the CPython runtime, but also other
* components like GNU readline.
*
* Accordingly, when the CLI detects it, it attempts to coerce it to a
* more capable UTF-8 based alternative.
*
* See the documentation of the PYTHONCOERCECLOCALE setting for more
* details.
*/
if (_Py_LegacyLocaleDetected()) {
Py_UTF8Mode = 1;
_Py_CoerceLegacyLocale();
}
/* Convert from char to wchar_t based on the locale settings */
for (i = 0; i < argc; i++) {
argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
if (!argv_copy[i]) {
PyMem_RawFree(oldloc);
fatal_error("unable to decode the command line arguments");
}
argv_copy2[i] = argv_copy[i];
}
argv_copy2[argc] = argv_copy[argc] = NULL;
setlocale(LC_ALL, oldloc);
PyMem_RawFree(oldloc);
status = Py_Main(argc, argv_copy);
/* Py_Main() can change PyMem_RawMalloc() allocator, so restore the default
to release memory blocks allocated before Py_Main() */
_PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, NULL);
for (i = 0; i < argc; i++) {
PyMem_RawFree(argv_copy2[i]);
}
PyMem_RawFree(argv_copy);
PyMem_RawFree(argv_copy2);
return status;
return _Py_UnixMain(argc, argv);
}
#endif
......@@ -29,9 +29,10 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
int Py_HasFileSystemDefaultEncoding = 0;
#endif
const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
and stdout error handler to "surrogateescape". */
int Py_UTF8Mode = 0;
/* UTF-8 mode (PEP 540): if equal to 1, use the UTF-8 encoding, and change
stdin and stdout error handler to "surrogateescape". It is equal to
-1 by default: unknown, will be set by Py_Main() */
int Py_UTF8Mode = -1;
_Py_IDENTIFIER(__builtins__);
_Py_IDENTIFIER(__dict__);
......
......@@ -393,7 +393,7 @@ Py_DecodeLocale(const char* arg, size_t *size)
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
#else
if (Py_UTF8Mode) {
if (Py_UTF8Mode == 1) {
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
}
......@@ -539,7 +539,7 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeLocaleUTF8(text, error_pos);
#else /* __APPLE__ */
if (Py_UTF8Mode) {
if (Py_UTF8Mode == 1) {
return _Py_EncodeLocaleUTF8(text, error_pos);
}
......
......@@ -385,18 +385,10 @@ static const char *_C_LOCALE_WARNING =
"C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
"locales is recommended.\n";
static int
_legacy_locale_warnings_enabled(void)
{
const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
return (coerce_c_locale != NULL &&
strncmp(coerce_c_locale, "warn", 5) == 0);
}
static void
_emit_stderr_warning_for_legacy_locale(void)
_emit_stderr_warning_for_legacy_locale(const _PyCoreConfig *core_config)
{
if (_legacy_locale_warnings_enabled()) {
if (core_config->coerce_c_locale_warn) {
if (_Py_LegacyLocaleDetected()) {
fprintf(stderr, "%s", _C_LOCALE_WARNING);
}
......@@ -440,12 +432,12 @@ get_default_standard_stream_error_handler(void)
}
#ifdef PY_COERCE_C_LOCALE
static const char _C_LOCALE_COERCION_WARNING[] =
static const char C_LOCALE_COERCION_WARNING[] =
"Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale "
"or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
static void
_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
_coerce_default_locale_settings(const _PyCoreConfig *config, const _LocaleCoercionTarget *target)
{
const char *newloc = target->locale_name;
......@@ -458,8 +450,8 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target)
"Error setting LC_CTYPE, skipping C locale coercion\n");
return;
}
if (_legacy_locale_warnings_enabled()) {
fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
if (config->coerce_c_locale_warn) {
fprintf(stderr, C_LOCALE_COERCION_WARNING, newloc);
}
/* Reconfigure with the overridden environment variables */
......@@ -468,47 +460,31 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target)
#endif
void
_Py_CoerceLegacyLocale(void)
_Py_CoerceLegacyLocale(const _PyCoreConfig *config)
{
#ifdef PY_COERCE_C_LOCALE
/* We ignore the Python -E and -I flags here, as the CLI needs to sort out
* the locale settings *before* we try to do anything with the command
* line arguments. For cross-platform debugging purposes, we also need
* to give end users a way to force even scripts that are otherwise
* isolated from their environment to use the legacy ASCII-centric C
* locale.
*
* Ignoring -E and -I is safe from a security perspective, as we only use
* the setting to turn *off* the implicit locale coercion, and anyone with
* access to the process environment already has the ability to set
* `LC_ALL=C` to override the C level locale settings anyway.
*/
const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
/* PYTHONCOERCECLOCALE is not set, or is set to something other than "0" */
const char *locale_override = getenv("LC_ALL");
if (locale_override == NULL || *locale_override == '\0') {
/* LC_ALL is also not set (or is set to an empty string) */
const _LocaleCoercionTarget *target = NULL;
for (target = _TARGET_LOCALES; target->locale_name; target++) {
const char *new_locale = setlocale(LC_CTYPE,
target->locale_name);
if (new_locale != NULL) {
const char *locale_override = getenv("LC_ALL");
if (locale_override == NULL || *locale_override == '\0') {
/* LC_ALL is also not set (or is set to an empty string) */
const _LocaleCoercionTarget *target = NULL;
for (target = _TARGET_LOCALES; target->locale_name; target++) {
const char *new_locale = setlocale(LC_CTYPE,
target->locale_name);
if (new_locale != NULL) {
#if !defined(__APPLE__) && !defined(__ANDROID__) && \
defined(HAVE_LANGINFO_H) && defined(CODESET)
/* Also ensure that nl_langinfo works in this locale */
char *codeset = nl_langinfo(CODESET);
if (!codeset || *codeset == '\0') {
/* CODESET is not set or empty, so skip coercion */
new_locale = NULL;
_Py_SetLocaleFromEnv(LC_CTYPE);
continue;
}
#endif
/* Successfully configured locale, so make it the default */
_coerce_default_locale_settings(target);
return;
defined(HAVE_LANGINFO_H) && defined(CODESET)
/* Also ensure that nl_langinfo works in this locale */
char *codeset = nl_langinfo(CODESET);
if (!codeset || *codeset == '\0') {
/* CODESET is not set or empty, so skip coercion */
new_locale = NULL;
_Py_SetLocaleFromEnv(LC_CTYPE);
continue;
}
#endif
/* Successfully configured locale, so make it the default */
_coerce_default_locale_settings(config, target);
return;
}
}
}
......@@ -648,7 +624,7 @@ _Py_InitializeCore(const _PyCoreConfig *core_config)
the locale's charset without having to switch
locales. */
_Py_SetLocaleFromEnv(LC_CTYPE);
_emit_stderr_warning_for_legacy_locale();
_emit_stderr_warning_for_legacy_locale(core_config);
#endif
err = _Py_HashRandomization_Init(core_config);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment