Commit d500e530 authored by Victor Stinner, committed by GitHub

bpo-34403: On HP-UX, force ASCII for C locale (GH-8969)

On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
is not coerced).

nl_langinfo(CODESET) announces "roman8" whereas mbstowcs() uses the Latin1
encoding in practice.
parent 5cb25895
...@@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( ...@@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
#endif /* Py_LIMITED_API */ #endif /* Py_LIMITED_API */
#ifdef Py_BUILD_CORE
PyAPI_FUNC(int) _Py_GetForceASCII(void);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
is not coerced).
...@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config) ...@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
static void static void
config_init_locale(_PyCoreConfig *config) config_init_locale(_PyCoreConfig *config)
{ {
if (_Py_LegacyLocaleDetected()) { if (config->coerce_c_locale < 0) {
/* The C locale enables the C locale coercion (PEP 538) */ /* The C locale enables the C locale coercion (PEP 538) */
if (config->coerce_c_locale < 0) { if (_Py_LegacyLocaleDetected()) {
config->coerce_c_locale = 1; config->coerce_c_locale = 1;
} }
} }
#ifndef MS_WINDOWS #ifndef MS_WINDOWS
const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (config->utf8_mode < 0) {
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */ /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
if (config->utf8_mode < 0) { const char *ctype_loc = setlocale(LC_CTYPE, NULL);
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0
|| strcmp(ctype_loc, "POSIX") == 0))
{
config->utf8_mode = 1; config->utf8_mode = 1;
} }
} }
......
...@@ -72,8 +72,8 @@ _Py_device_encoding(int fd) ...@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
extern int _Py_normalize_encoding(const char *, char *, size_t); extern int _Py_normalize_encoding(const char *, char *, size_t);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
On these operating systems, nl_langinfo(CODESET) announces an alias of the and POSIX locale. nl_langinfo(CODESET) announces an alias of the
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
locale.getpreferredencoding() codec. For example, if command line arguments locale.getpreferredencoding() codec. For example, if command line arguments
...@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); ...@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
workaround is also enabled on error, for example if getting the locale workaround is also enabled on error, for example if getting the locale
failed. failed.
On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
ASCII encoding in this case.
Values of force_ascii: Values of force_ascii:
1: the workaround is used: Py_EncodeLocale() uses 1: the workaround is used: Py_EncodeLocale() uses
...@@ -100,13 +104,46 @@ static int force_ascii = -1; ...@@ -100,13 +104,46 @@ static int force_ascii = -1;
static int static int
check_force_ascii(void) check_force_ascii(void)
{ {
char *loc; char *loc = setlocale(LC_CTYPE, NULL);
if (loc == NULL) {
goto error;
}
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
/* the LC_CTYPE locale is different than C and POSIX */
return 0;
}
#if defined(HAVE_LANGINFO_H) && defined(CODESET) #if defined(HAVE_LANGINFO_H) && defined(CODESET)
char *codeset, **alias; const char *codeset = nl_langinfo(CODESET);
if (!codeset || codeset[0] == '\0') {
/* CODESET is not set or empty */
goto error;
}
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
int is_ascii; if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
unsigned int i; goto error;
char* ascii_aliases[] = { }
#ifdef __hpux
if (strcmp(encoding, "roman8") == 0) {
unsigned char ch;
wchar_t wch;
size_t res;
ch = (unsigned char)0xA7;
res = mbstowcs(&wch, (char*)&ch, 1);
if (res != (size_t)-1 && wch == L'\xA7') {
/* On HP-UX with the C locale or the POSIX locale,
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
Latin1 encoding in practice. Force ASCII in this case.
Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
return 1;
}
}
#else
const char* ascii_aliases[] = {
"ascii", "ascii",
/* Aliases from Lib/encodings/aliases.py */ /* Aliases from Lib/encodings/aliases.py */
"646", "646",
...@@ -123,27 +160,9 @@ check_force_ascii(void) ...@@ -123,27 +160,9 @@ check_force_ascii(void)
"us_ascii", "us_ascii",
NULL NULL
}; };
#endif
loc = setlocale(LC_CTYPE, NULL);
if (loc == NULL)
goto error;
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
/* the LC_CTYPE locale is different than C */
return 0;
}
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
codeset = nl_langinfo(CODESET);
if (!codeset || codeset[0] == '\0') {
/* CODESET is not set or empty */
goto error;
}
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
goto error;
is_ascii = 0; int is_ascii = 0;
for (alias=ascii_aliases; *alias != NULL; alias++) { for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
if (strcmp(encoding, *alias) == 0) { if (strcmp(encoding, *alias) == 0) {
is_ascii = 1; is_ascii = 1;
break; break;
...@@ -154,13 +173,14 @@ check_force_ascii(void) ...@@ -154,13 +173,14 @@ check_force_ascii(void)
return 0; return 0;
} }
for (i=0x80; i<0xff; i++) { for (unsigned int i=0x80; i<=0xff; i++) {
unsigned char ch; char ch[1];
wchar_t wch; wchar_t wch[1];
size_t res; size_t res;
ch = (unsigned char)i; unsigned uch = (unsigned char)i;
res = mbstowcs(&wch, (char*)&ch, 1); ch[0] = (char)uch;
res = mbstowcs(wch, ch, 1);
if (res != (size_t)-1) { if (res != (size_t)-1) {
/* decoding a non-ASCII character from the locale encoding succeed: /* decoding a non-ASCII character from the locale encoding succeed:
the locale encoding is not ASCII, force ASCII */ the locale encoding is not ASCII, force ASCII */
...@@ -169,17 +189,29 @@ check_force_ascii(void) ...@@ -169,17 +189,29 @@ check_force_ascii(void)
} }
/* None of the bytes in the range 0x80-0xff can be decoded from the locale /* None of the bytes in the range 0x80-0xff can be decoded from the locale
encoding: the locale encoding is really ASCII */ encoding: the locale encoding is really ASCII */
#endif /* !defined(__hpux) */
return 0; return 0;
#else #else
/* nl_langinfo(CODESET) is not available: always force ASCII */ /* nl_langinfo(CODESET) is not available: always force ASCII */
return 1; return 1;
#endif #endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
error: error:
/* if an error occurred, force the ASCII encoding */ /* if an error occurred, force the ASCII encoding */
return 1; return 1;
} }
/* Return whether the ASCII encoding must be forced for the locale encoding.

   The answer is computed by check_force_ascii() on the first call and then
   kept in the file-level force_ascii variable; while that variable still
   holds -1 the check has not been run yet. Later calls return the cached
   value without checking again. */
int
_Py_GetForceASCII(void)
{
    if (force_ascii != -1) {
        /* Already computed by a previous call */
        return force_ascii;
    }

    force_ascii = check_force_ascii();
    return force_ascii;
}
static int static int
encode_ascii(const wchar_t *text, char **str, encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason, size_t *error_pos, const char **reason,
...@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str, ...@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
*str = result; *str = result;
return 0; return 0;
} }
#else
/* Variant of _Py_GetForceASCII() for builds where the preprocessor condition
   named on the matching #endif does not hold: the force-ASCII workaround is
   never applied there, so the answer is always 0. */
int
_Py_GetForceASCII(void)
{
    const int never_forced = 0;
    return never_forced;
}
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */ #endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
......
...@@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp) ...@@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass"; Py_FileSystemDefaultEncodeErrors = "surrogatepass";
} }
#else #else
if (Py_FileSystemDefaultEncoding == NULL && if (Py_FileSystemDefaultEncoding == NULL) {
interp->core_config.utf8_mode) if (interp->core_config.utf8_mode) {
{ Py_FileSystemDefaultEncoding = "utf-8";
Py_FileSystemDefaultEncoding = "utf-8"; Py_HasFileSystemDefaultEncoding = 1;
Py_HasFileSystemDefaultEncoding = 1; }
} else if (_Py_GetForceASCII()) {
else if (Py_FileSystemDefaultEncoding == NULL) { Py_FileSystemDefaultEncoding = "ascii";
Py_FileSystemDefaultEncoding = get_locale_encoding(); Py_HasFileSystemDefaultEncoding = 1;
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
} }
else {
Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
}
Py_HasFileSystemDefaultEncoding = 0; Py_HasFileSystemDefaultEncoding = 0;
interp->fscodec_initialized = 1; interp->fscodec_initialized = 1;
return _Py_INIT_OK(); return _Py_INIT_OK();
}
} }
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment