Commit 678db84b authored by Serhiy Storchaka

Issue #10156: In the interpreter's initialization phase, unicode globals
are now initialized dynamically as needed.
parents 1c7181d7 05997253
...@@ -12,6 +12,9 @@ What's New in Python 3.3.1? ...@@ -12,6 +12,9 @@ What's New in Python 3.3.1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #10156: In the interpreter's initialization phase, unicode globals
are now initialized dynamically as needed.
- Issue #16980: Fix processing of escaped non-ascii bytes in the - Issue #16980: Fix processing of escaped non-ascii bytes in the
unicode-escape-decode decoder. unicode-escape-decode decoder.
......
...@@ -57,8 +57,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ...@@ -57,8 +57,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/* --- Globals ------------------------------------------------------------ /* --- Globals ------------------------------------------------------------
The globals are initialized by the _PyUnicode_Init() API and should NOTE: In the interpreter's initialization phase, some globals are currently
not be used before calling that API. initialized dynamically as needed. In the process Unicode objects may
be created before the Unicode type is ready.
*/ */
...@@ -179,17 +180,36 @@ extern "C" { ...@@ -179,17 +180,36 @@ extern "C" {
Another way to look at this is to say that the actual reference Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0) count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/ */
static PyObject *interned; static PyObject *interned = NULL;
/* The empty Unicode object is shared to improve performance. */ /* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty; static PyObject *unicode_empty = NULL;
/* Take a new reference to the shared empty string, creating it on first use.
 *
 * If unicode_empty is already set, its reference count is simply incremented.
 * Otherwise the object is created with PyUnicode_New(0, 0), stored in the
 * global, and then incremented as well, so the global keeps the reference
 * returned by PyUnicode_New() and the caller gets one of its own.
 *
 * Creation can fail: in that case unicode_empty is still NULL after the macro
 * and no reference has been taken, so callers must test unicode_empty before
 * using it.  The consistency check only runs on the freshly created object
 * and only when assertions are enabled.
 */
#define _Py_INCREF_UNICODE_EMPTY() \
do { \
if (unicode_empty != NULL) \
Py_INCREF(unicode_empty); \
else { \
unicode_empty = PyUnicode_New(0, 0); \
if (unicode_empty != NULL) { \
Py_INCREF(unicode_empty); \
assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
} \
} \
} while (0)
/* Return the shared empty string from the enclosing function.
 *
 * Expands to _Py_INCREF_UNICODE_EMPTY() followed by "return unicode_empty;",
 * so it is only usable in functions returning PyObject *.  If the empty
 * string could not be created, the enclosing function returns NULL.
 * Because the macro returns, any cleanup (releasing buffers, dropping
 * references) has to be done before invoking it.
 */
#define _Py_RETURN_UNICODE_EMPTY() \
do { \
_Py_INCREF_UNICODE_EMPTY(); \
return unicode_empty; \
} while (0)
/* List of static strings. */ /* List of static strings. */
static _Py_Identifier *static_strings; static _Py_Identifier *static_strings = NULL;
/* Single character Unicode strings in the Latin-1 range are being /* Single character Unicode strings in the Latin-1 range are being
shared as well. */ shared as well. */
static PyObject *unicode_latin1[256]; static PyObject *unicode_latin1[256] = {NULL};
/* Fast detection of the most frequent whitespace characters */ /* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = { const unsigned char _Py_ascii_whitespace[] = {
...@@ -416,9 +436,8 @@ unicode_result_wchar(PyObject *unicode) ...@@ -416,9 +436,8 @@ unicode_result_wchar(PyObject *unicode)
len = _PyUnicode_WSTR_LENGTH(unicode); len = _PyUnicode_WSTR_LENGTH(unicode);
if (len == 0) { if (len == 0) {
Py_INCREF(unicode_empty);
Py_DECREF(unicode); Py_DECREF(unicode);
return unicode_empty; _Py_RETURN_UNICODE_EMPTY();
} }
if (len == 1) { if (len == 1) {
...@@ -450,8 +469,8 @@ unicode_result_ready(PyObject *unicode) ...@@ -450,8 +469,8 @@ unicode_result_ready(PyObject *unicode)
length = PyUnicode_GET_LENGTH(unicode); length = PyUnicode_GET_LENGTH(unicode);
if (length == 0) { if (length == 0) {
if (unicode != unicode_empty) { if (unicode != unicode_empty) {
Py_INCREF(unicode_empty);
Py_DECREF(unicode); Py_DECREF(unicode);
_Py_RETURN_UNICODE_EMPTY();
} }
return unicode_empty; return unicode_empty;
} }
...@@ -528,7 +547,7 @@ static OSVERSIONINFOEX winver; ...@@ -528,7 +547,7 @@ static OSVERSIONINFOEX winver;
#define BLOOM_MASK unsigned long #define BLOOM_MASK unsigned long
static BLOOM_MASK bloom_linebreak; static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
...@@ -1582,9 +1601,11 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) ...@@ -1582,9 +1601,11 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
return 0; return 0;
if (length == 0) { if (length == 0) {
_Py_INCREF_UNICODE_EMPTY();
if (!unicode_empty)
return -1;
Py_DECREF(*p_unicode); Py_DECREF(*p_unicode);
*p_unicode = unicode_empty; *p_unicode = unicode_empty;
Py_INCREF(*p_unicode);
return 0; return 0;
} }
...@@ -1731,10 +1752,8 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) ...@@ -1731,10 +1752,8 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
some optimizations which share commonly used objects. */ some optimizations which share commonly used objects. */
/* Optimization for empty strings */ /* Optimization for empty strings */
if (size == 0 && unicode_empty != NULL) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
/* Single character Unicode objects in the Latin-1 range are /* Single character Unicode objects in the Latin-1 range are
shared when using this constructor */ shared when using this constructor */
...@@ -1893,10 +1912,8 @@ _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) ...@@ -1893,10 +1912,8 @@ _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
PyObject *res; PyObject *res;
unsigned char max_char; unsigned char max_char;
if (size == 0) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
assert(size > 0); assert(size > 0);
if (size == 1) if (size == 1)
return get_latin1_char(u[0]); return get_latin1_char(u[0]);
...@@ -1916,10 +1933,8 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) ...@@ -1916,10 +1933,8 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
PyObject *res; PyObject *res;
Py_UCS2 max_char; Py_UCS2 max_char;
if (size == 0) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
assert(size > 0); assert(size > 0);
if (size == 1) { if (size == 1) {
Py_UCS4 ch = u[0]; Py_UCS4 ch = u[0];
...@@ -1954,10 +1969,8 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) ...@@ -1954,10 +1969,8 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
PyObject *res; PyObject *res;
Py_UCS4 max_char; Py_UCS4 max_char;
if (size == 0) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
assert(size > 0); assert(size > 0);
if (size == 1) { if (size == 1) {
Py_UCS4 ch = u[0]; Py_UCS4 ch = u[0];
...@@ -2249,10 +2262,8 @@ PyObject * ...@@ -2249,10 +2262,8 @@ PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{ {
if (w == NULL) { if (w == NULL) {
if (size == 0) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
PyErr_BadInternalCall(); PyErr_BadInternalCall();
return NULL; return NULL;
} }
...@@ -3007,15 +3018,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj, ...@@ -3007,15 +3018,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
/* Decoding bytes objects is the most common case and should be fast */ /* Decoding bytes objects is the most common case and should be fast */
if (PyBytes_Check(obj)) { if (PyBytes_Check(obj)) {
if (PyBytes_GET_SIZE(obj) == 0) { if (PyBytes_GET_SIZE(obj) == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
v = unicode_empty;
}
else {
v = PyUnicode_Decode( v = PyUnicode_Decode(
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
encoding, errors); encoding, errors);
}
return v; return v;
} }
...@@ -3035,12 +3042,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj, ...@@ -3035,12 +3042,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
} }
if (buffer.len == 0) { if (buffer.len == 0) {
Py_INCREF(unicode_empty); PyBuffer_Release(&buffer);
v = unicode_empty; _Py_RETURN_UNICODE_EMPTY();
} }
else
v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
PyBuffer_Release(&buffer); PyBuffer_Release(&buffer);
return v; return v;
} }
...@@ -4720,8 +4726,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4720,8 +4726,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
if (size == 0) { if (size == 0) {
if (consumed) if (consumed)
*consumed = 0; *consumed = 0;
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
} }
/* ASCII is equivalent to the first 128 ordinals in Unicode. */ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
...@@ -5232,8 +5237,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5232,8 +5237,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
if (q == e) { if (q == e) {
if (consumed) if (consumed)
*consumed = size; *consumed = size;
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
} }
#ifdef BYTEORDER_IS_LITTLE_ENDIAN #ifdef BYTEORDER_IS_LITTLE_ENDIAN
...@@ -6558,10 +6562,8 @@ PyUnicode_DecodeASCII(const char *s, ...@@ -6558,10 +6562,8 @@ PyUnicode_DecodeASCII(const char *s,
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
if (size == 0) { if (size == 0)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
/* ASCII is equivalent to the first 128 ordinals in Unicode. */ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) if (size == 1 && (unsigned char)s[0] < 128)
...@@ -6940,8 +6942,7 @@ decode_code_page_stateful(int code_page, ...@@ -6940,8 +6942,7 @@ decode_code_page_stateful(int code_page,
if (chunk_size == 0 && done) { if (chunk_size == 0 && done) {
if (v != NULL) if (v != NULL)
break; break;
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
} }
...@@ -9503,9 +9504,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) ...@@ -9503,9 +9504,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
/* If empty sequence, return u"". */ /* If empty sequence, return u"". */
if (seqlen == 0) { if (seqlen == 0) {
Py_DECREF(fseq); Py_DECREF(fseq);
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
res = unicode_empty;
return res;
} }
/* If singleton sequence with an exact Unicode, return that. */ /* If singleton sequence with an exact Unicode, return that. */
...@@ -10205,7 +10204,9 @@ replace(PyObject *self, PyObject *str1, ...@@ -10205,7 +10204,9 @@ replace(PyObject *self, PyObject *str1,
} }
new_size = slen + n * (len2 - len1); new_size = slen + n * (len2 - len1);
if (new_size == 0) { if (new_size == 0) {
Py_INCREF(unicode_empty); _Py_INCREF_UNICODE_EMPTY();
if (!unicode_empty)
goto error;
u = unicode_empty; u = unicode_empty;
goto done; goto done;
} }
...@@ -11672,10 +11673,8 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) ...@@ -11672,10 +11673,8 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
PyErr_SetString(PyExc_IndexError, "string index out of range"); PyErr_SetString(PyExc_IndexError, "string index out of range");
return NULL; return NULL;
} }
if (start >= length || end < start) { if (start >= length || end < start)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
length = end - start; length = end - start;
if (PyUnicode_IS_ASCII(self)) { if (PyUnicode_IS_ASCII(self)) {
...@@ -11802,10 +11801,8 @@ unicode_repeat(PyObject *str, Py_ssize_t len) ...@@ -11802,10 +11801,8 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
PyObject *u; PyObject *u;
Py_ssize_t nchars, n; Py_ssize_t nchars, n;
if (len < 1) { if (len < 1)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
/* no repeat, return original string */ /* no repeat, return original string */
if (len == 1) if (len == 1)
...@@ -12924,8 +12921,7 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) ...@@ -12924,8 +12921,7 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{ {
if (writer->pos == 0) { if (writer->pos == 0) {
Py_XDECREF(writer->buffer); Py_XDECREF(writer->buffer);
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
} }
if (writer->readonly) { if (writer->readonly) {
assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
...@@ -13143,8 +13139,7 @@ unicode_subscript(PyObject* self, PyObject* item) ...@@ -13143,8 +13139,7 @@ unicode_subscript(PyObject* self, PyObject* item)
} }
if (slicelength <= 0) { if (slicelength <= 0) {
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
} else if (start == 0 && step == 1 && } else if (start == 0 && step == 1 &&
slicelength == PyUnicode_GET_LENGTH(self)) { slicelength == PyUnicode_GET_LENGTH(self)) {
return unicode_result_unchanged(self); return unicode_result_unchanged(self);
...@@ -13974,10 +13969,8 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) ...@@ -13974,10 +13969,8 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
kwlist, &x, &encoding, &errors)) kwlist, &x, &encoding, &errors))
return NULL; return NULL;
if (x == NULL) { if (x == NULL)
Py_INCREF(unicode_empty); _Py_RETURN_UNICODE_EMPTY();
return unicode_empty;
}
if (encoding == NULL && errors == NULL) if (encoding == NULL && errors == NULL)
return PyObject_Str(x); return PyObject_Str(x);
else else
...@@ -14146,8 +14139,6 @@ PyTypeObject PyUnicode_Type = { ...@@ -14146,8 +14139,6 @@ PyTypeObject PyUnicode_Type = {
int _PyUnicode_Init(void) int _PyUnicode_Init(void)
{ {
int i;
/* XXX - move this array to unicodectype.c ? */ /* XXX - move this array to unicodectype.c ? */
Py_UCS2 linebreak[] = { Py_UCS2 linebreak[] = {
0x000A, /* LINE FEED */ 0x000A, /* LINE FEED */
...@@ -14161,13 +14152,11 @@ int _PyUnicode_Init(void) ...@@ -14161,13 +14152,11 @@ int _PyUnicode_Init(void)
}; };
/* Init the implementation */ /* Init the implementation */
unicode_empty = PyUnicode_New(0, 0); _Py_INCREF_UNICODE_EMPTY();
if (!unicode_empty) if (!unicode_empty)
Py_FatalError("Can't create empty string"); Py_FatalError("Can't create empty string");
assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); Py_DECREF(unicode_empty);
for (i = 0; i < 256; i++)
unicode_latin1[i] = NULL;
if (PyType_Ready(&PyUnicode_Type) < 0) if (PyType_Ready(&PyUnicode_Type) < 0)
Py_FatalError("Can't initialize 'unicode'"); Py_FatalError("Can't initialize 'unicode'");
...@@ -14207,15 +14196,10 @@ _PyUnicode_Fini(void) ...@@ -14207,15 +14196,10 @@ _PyUnicode_Fini(void)
{ {
int i; int i;
Py_XDECREF(unicode_empty); Py_CLEAR(unicode_empty);
unicode_empty = NULL;
for (i = 0; i < 256; i++) { for (i = 0; i < 256; i++)
if (unicode_latin1[i]) { Py_CLEAR(unicode_latin1[i]);
Py_DECREF(unicode_latin1[i]);
unicode_latin1[i] = NULL;
}
}
_PyUnicode_ClearStaticStrings(); _PyUnicode_ClearStaticStrings();
(void)PyUnicode_ClearFreeList(); (void)PyUnicode_ClearFreeList();
} }
...@@ -14344,8 +14328,7 @@ _Py_ReleaseInternedUnicodeStrings(void) ...@@ -14344,8 +14328,7 @@ _Py_ReleaseInternedUnicodeStrings(void)
"mortal/immortal\n", mortal_size, immortal_size); "mortal/immortal\n", mortal_size, immortal_size);
Py_DECREF(keys); Py_DECREF(keys);
PyDict_Clear(interned); PyDict_Clear(interned);
Py_DECREF(interned); Py_CLEAR(interned);
interned = NULL;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment