Commit 7e233dee authored by Victor Stinner

PyUnicode_Ready() now sets ascii=1 if maxchar < 128

ascii=1 is no longer reserved for PyASCIIObject. Use
PyUnicode_IS_COMPACT_ASCII(obj) to check whether obj is a PyASCIIObject (as before).
parent 3333b864
...@@ -224,7 +224,7 @@ typedef struct { ...@@ -224,7 +224,7 @@ typedef struct {
PyUnicode_4BYTE_KIND PyUnicode_4BYTE_KIND
* compact = 1 * compact = 1
* ready = 1 * ready = 1
* (ascii = 0) * ascii = 0
- string created by the legacy API (not ready): - string created by the legacy API (not ready):
...@@ -236,7 +236,7 @@ typedef struct { ...@@ -236,7 +236,7 @@ typedef struct {
* data.any is NULL * data.any is NULL
* utf8 is NULL * utf8 is NULL
* interned = SSTATE_NOT_INTERNED * interned = SSTATE_NOT_INTERNED
* (ascii = 0) * ascii = 0
- string created by the legacy API, ready: - string created by the legacy API, ready:
...@@ -246,7 +246,6 @@ typedef struct { ...@@ -246,7 +246,6 @@ typedef struct {
* compact = 0 * compact = 0
* ready = 1 * ready = 1
* data.any is not NULL * data.any is not NULL
* (ascii = 0)
String created by the legacy API becomes ready when calling String created by the legacy API becomes ready when calling
PyUnicode_READY(). PyUnicode_READY().
...@@ -278,8 +277,9 @@ typedef struct { ...@@ -278,8 +277,9 @@ typedef struct {
one block for the PyUnicodeObject struct and another for its data one block for the PyUnicodeObject struct and another for its data
buffer. */ buffer. */
unsigned int compact:1; unsigned int compact:1;
/* Compact objects which are ASCII-only also have the state.compact /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
flag set, and use the PyASCIIObject struct. */ characters. If ascii is 1 and compact is 1, use the PyASCIIObject
structure. */
unsigned int ascii:1; unsigned int ascii:1;
/* The ready flag indicates whether the object layout is initialized /* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or completely. This means that this is either a compact object, or
...@@ -304,7 +304,7 @@ typedef struct { ...@@ -304,7 +304,7 @@ typedef struct {
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
PyUnicodeObject structure. The actual string data is initially in the wstr PyUnicodeObject structure. The actual string data is initially in the wstr
block, and copied into the data block using PyUnicode_Ready. */ block, and copied into the data block using _PyUnicode_Ready. */
typedef struct { typedef struct {
PyCompactUnicodeObject _base; PyCompactUnicodeObject _base;
union { union {
...@@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; ...@@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
#define PyUnicode_WSTR_LENGTH(op) \ #define PyUnicode_WSTR_LENGTH(op) \
(((PyASCIIObject*)op)->state.ascii ? \ (PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)op)->length : \ ((PyASCIIObject*)op)->length : \
((PyCompactUnicodeObject*)op)->wstr_length) ((PyCompactUnicodeObject*)op)->wstr_length)
...@@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; ...@@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
#define SSTATE_INTERNED_MORTAL 1 #define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2 #define SSTATE_INTERNED_IMMORTAL 2
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii) /* Return true if the string contains only ASCII characters, or 0 if not. The
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
or Ready calls are performed. */
#define PyUnicode_IS_ASCII(op) \
(((PyASCIIObject*)op)->state.ascii)
/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
(((PyASCIIObject*)(op))->state.compact)
/* Return true if the string is a compact ASCII string (use PyASCIIObject
structure), or 0 if not. No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
(PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
/* String contains only wstr byte characters. This is only possible /* String contains only wstr byte characters. This is only possible
when the string was created with a legacy API and PyUnicode_Ready() when the string was created with a legacy API and _PyUnicode_Ready()
has not been called yet. */ has not been called yet. */
#define PyUnicode_WCHAR_KIND 0 #define PyUnicode_WCHAR_KIND 0
...@@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; ...@@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
(((PyASCIIObject*)(op))->state.compact)
/* Return one of the PyUnicode_*_KIND values defined above. */ /* Return one of the PyUnicode_*_KIND values defined above. */
#define PyUnicode_KIND(op) \ #define PyUnicode_KIND(op) \
(assert(PyUnicode_Check(op)), \ (assert(PyUnicode_Check(op)), \
...@@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; ...@@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best /* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
case. If the canonical representation is not yet set, it will still call case. If the canonical representation is not yet set, it will still call
PyUnicode_Ready(). _PyUnicode_Ready().
Returns 0 on success and -1 on errors. */ Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \ #define PyUnicode_READY(op) \
(assert(PyUnicode_Check(op)), \ (assert(PyUnicode_Check(op)), \
......
...@@ -288,16 +288,14 @@ _PyUnicode_CheckConsistency(void *op) ...@@ -288,16 +288,14 @@ _PyUnicode_CheckConsistency(void *op)
ascii = (PyASCIIObject *)op; ascii = (PyASCIIObject *)op;
kind = ascii->state.kind; kind = ascii->state.kind;
if (ascii->state.ascii == 1) { if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
assert(kind == PyUnicode_1BYTE_KIND); assert(kind == PyUnicode_1BYTE_KIND);
assert(ascii->state.compact == 1);
assert(ascii->state.ready == 1); assert(ascii->state.ready == 1);
} }
else if (ascii->state.compact == 1) { else if (ascii->state.compact == 1) {
assert(kind == PyUnicode_1BYTE_KIND assert(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND); || kind == PyUnicode_4BYTE_KIND);
assert(ascii->state.compact == 1);
assert(ascii->state.ascii == 0); assert(ascii->state.ascii == 0);
assert(ascii->state.ready == 1); assert(ascii->state.ready == 1);
} else { } else {
...@@ -305,9 +303,9 @@ _PyUnicode_CheckConsistency(void *op) ...@@ -305,9 +303,9 @@ _PyUnicode_CheckConsistency(void *op)
PyUnicodeObject *unicode = (PyUnicodeObject *)op; PyUnicodeObject *unicode = (PyUnicodeObject *)op;
if (kind == PyUnicode_WCHAR_KIND) { if (kind == PyUnicode_WCHAR_KIND) {
assert(!ascii->state.compact == 1); assert(ascii->state.compact == 0);
assert(ascii->state.ascii == 0); assert(ascii->state.ascii == 0);
assert(!ascii->state.ready == 1); assert(ascii->state.ready == 0);
assert(ascii->wstr != NULL); assert(ascii->wstr != NULL);
assert(unicode->data.any == NULL); assert(unicode->data.any == NULL);
assert(compact->utf8 == NULL); assert(compact->utf8 == NULL);
...@@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op) ...@@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op)
assert(kind == PyUnicode_1BYTE_KIND assert(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND); || kind == PyUnicode_4BYTE_KIND);
assert(!ascii->state.compact == 1); assert(ascii->state.compact == 0);
assert(ascii->state.ready == 1); assert(ascii->state.ready == 1);
assert(unicode->data.any != NULL); assert(unicode->data.any != NULL);
assert(ascii->state.ascii == 0);
} }
} }
return 1; return 1;
...@@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode) ...@@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode)
switch(PyUnicode_KIND(unicode)) switch(PyUnicode_KIND(unicode))
{ {
case PyUnicode_1BYTE_KIND: case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_COMPACT_ASCII(unicode)) if (PyUnicode_IS_ASCII(unicode))
return "legacy ascii"; return "legacy ascii";
else else
return "legacy latin1"; return "legacy latin1";
...@@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode) ...@@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode)
switch(PyUnicode_KIND(unicode)) switch(PyUnicode_KIND(unicode))
{ {
case PyUnicode_1BYTE_KIND: case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_COMPACT_ASCII(unicode)) if (PyUnicode_IS_ASCII(unicode))
return "ascii"; return "ascii";
else else
return "compact latin1"; return "latin1";
case PyUnicode_2BYTE_KIND: case PyUnicode_2BYTE_KIND:
return "compact UCS2"; return "UCS2";
case PyUnicode_4BYTE_KIND: case PyUnicode_4BYTE_KIND:
return "compact UCS4"; return "UCS4";
default: default:
return "<invalid compact kind>"; return "<invalid compact kind>";
} }
...@@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op) ...@@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op)
if (ascii->wstr == data) if (ascii->wstr == data)
printf("shared "); printf("shared ");
printf("wstr=%p", ascii->wstr); printf("wstr=%p", ascii->wstr);
if (!ascii->state.ascii) { if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
printf(" (%zu), ", compact->wstr_length); printf(" (%zu), ", compact->wstr_length);
if (!ascii->state.compact && compact->utf8 == unicode->data.any) if (!ascii->state.compact && compact->utf8 == unicode->data.any)
printf("shared "); printf("shared ");
...@@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
/* check if max_char(from substring) <= max_char(to) */ /* check if max_char(from substring) <= max_char(to) */
if (from_kind > to_kind if (from_kind > to_kind
/* latin1 => ascii */ /* latin1 => ascii */
|| (PyUnicode_IS_COMPACT_ASCII(to) || (PyUnicode_IS_ASCII(to)
&& to_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_1BYTE_KIND
&& !PyUnicode_IS_COMPACT_ASCII(from))) && !PyUnicode_IS_ASCII(from)))
{ {
/* slow path to check for character overflow */ /* slow path to check for character overflow */
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
...@@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace) ...@@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace)
_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
_PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
if (maxchar < 128) { if (maxchar < 128) {
_PyUnicode_STATE(unicode).ascii = 1;
_PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
_PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
} }
else { else {
_PyUnicode_STATE(unicode).ascii = 0;
_PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0; _PyUnicode_UTF8_LENGTH(unicode) = 0;
} }
......
...@@ -1132,15 +1132,16 @@ class PyUnicodeObjectPtr(PyObjectPtr): ...@@ -1132,15 +1132,16 @@ class PyUnicodeObjectPtr(PyObjectPtr):
compact = self.field('_base') compact = self.field('_base')
ascii = compact['_base'] ascii = compact['_base']
state = ascii['state'] state = ascii['state']
is_compact_ascii = (int(state['ascii']) and int(state['compact']))
field_length = long(ascii['length']) field_length = long(ascii['length'])
if not int(state['ready']): if not int(state['ready']):
# string is not ready # string is not ready
may_have_surrogates = True may_have_surrogates = True
field_str = ascii['wstr'] field_str = ascii['wstr']
if not int(state['ascii']): if not is_compact_ascii:
field_length = compact('wstr_length') field_length = compact('wstr_length')
else: else:
if int(state['ascii']): if is_compact_ascii:
field_str = ascii.address + 1 field_str = ascii.address + 1
elif int(state['compact']): elif int(state['compact']):
field_str = compact.address + 1 field_str = compact.address + 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment