Commit ce9b5a55 authored by Martin v. Löwis

Encode surrogates in UTF-8 even for a wide Py_UNICODE.

Implement sys.maxunicode.
Explicitly wrap around upper/lower computations for wide Py_UNICODE.
When decoding large characters with UTF-8, represent expected test
results using the \U notation.
parent 236d8b79
...@@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize( ...@@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize(
PyObject *unicode /* Unicode object */ PyObject *unicode /* Unicode object */
); );
/* Get the maximum ordinal for a Unicode character.

   The value depends on how Py_UNICODE is stored: 0x10FFFF when
   USE_UCS4_STORAGE is defined, 0xFFFF otherwise. */
extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
/* Resize an already allocated Unicode object to the new size length. /* Resize an already allocated Unicode object to the new size length.
*unicode is modified to point to the new (resized) object and 0 *unicode is modified to point to the new (resized) object and 0
......
...@@ -386,9 +386,9 @@ verify(u'\ud84d\udc56'.encode('utf-8') == \ ...@@ -386,9 +386,9 @@ verify(u'\ud84d\udc56'.encode('utf-8') == \
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) ) ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
# UTF-8 specific decoding tests # UTF-8 specific decoding tests
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))), verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
'utf-8') == u'\ud84d\udc56' ) 'utf-8') == u'\U00023456' )
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))), verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
'utf-8') == u'\ud800\udc02' ) 'utf-8') == u'\U00010002' )
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))), verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
'utf-8') == u'\u20ac' ) 'utf-8') == u'\u20ac' )
......
...@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) ...@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
/* Returns the titlecase Unicode characters corresponding to ch or just /* Returns the titlecase Unicode characters corresponding to ch or just
ch if no titlecase mapping is known. */ ch if no titlecase mapping is known. */
Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch) Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->title) if (ctype->title)
return ch + ctype->title; ch += ctype->title;
else
return ch + ctype->upper; ch += ctype->upper;
#ifdef USE_UCS4_STORAGE
/* The database assumes that the values wrap around at 0x10000. */
if (ch > 0x10000)
ch -= 0x10000;
#endif
return ch;
} }
/* Returns 1 for Unicode characters having the category 'Lt', 0 /* Returns 1 for Unicode characters having the category 'Lt', 0
...@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch) ...@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
/* Returns the uppercase Unicode characters corresponding to ch or just /* Returns the uppercase Unicode characters corresponding to ch or just
ch if no uppercase mapping is known. */ ch if no uppercase mapping is known. */
Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch) Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return ch + ctype->upper; ch += ctype->upper;
#ifdef USE_UCS4_STORAGE
/* The database assumes that the values wrap around at 0x10000. */
if (ch > 0x10000)
ch -= 0x10000;
#endif
return ch;
} }
/* Returns the lowercase Unicode characters corresponding to ch or just /* Returns the lowercase Unicode characters corresponding to ch or just
ch if no lowercase mapping is known. */ ch if no lowercase mapping is known. */
Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch) Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return ch + ctype->lower; ch += ctype->lower;
#ifdef USE_UCS4_STORAGE
/* The database assumes that the values wrap around at 0x10000. */
if (ch > 0x10000)
ch -= 0x10000;
#endif
return ch;
} }
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
......
...@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256]; ...@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256];
*/ */
static char unicode_default_encoding[100]; static char unicode_default_encoding[100];
/* Return the largest ordinal a Py_UNICODE is meant to hold in this
   build: 0x10FFFF when USE_UCS4_STORAGE is defined, 0xFFFF otherwise.

   The parameter list is spelled (void) so that the definition is a
   real prototype and matches the extern declaration. */
Py_UNICODE
PyUnicode_GetMax(void)
{
#ifdef USE_UCS4_STORAGE
    return 0x10FFFF;
#else
    /* This is actually an illegal character, so it should
       not be passed to unichr. */
    return 0xFFFF;
#endif
}
/* --- Unicode Object ----------------------------------------------------- */ /* --- Unicode Object ----------------------------------------------------- */
static static
...@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, ...@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
cbWritten += 2; cbWritten += 2;
} }
else if (ch < 0x10000) { else if (ch < 0x10000) {
#if Py_UNICODE_SIZE == 4
*p++ = 0xe0 | (ch>>12);
*p++ = 0x80 | ((ch>>6) & 0x3f);
*p++ = 0x80 | (ch & 0x3f);
cbWritten += 3;
#else
/* Check for high surrogate */ /* Check for high surrogate */
if (0xD800 <= ch && ch <= 0xDBFF) { if (0xD800 <= ch && ch <= 0xDBFF) {
if (i != size) { if (i != size) {
...@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, ...@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
} }
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f));
#endif
} else { } else {
*p++ = 0xf0 | (ch>>18); *p++ = 0xf0 | (ch>>18);
*p++ = 0x80 | ((ch>>12) & 0x3f); *p++ = 0x80 | ((ch>>12) & 0x3f);
......
...@@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\ ...@@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\
Static objects:\n\ Static objects:\n\
\n\ \n\
maxint -- the largest supported integer (the smallest is -maxint-1)\n\ maxint -- the largest supported integer (the smallest is -maxint-1)\n\
maxunicode -- the largest supported character\n\
builtin_module_names -- tuple of module names built into this intepreter\n\ builtin_module_names -- tuple of module names built into this intepreter\n\
version -- the version of this interpreter as a string\n\ version -- the version of this interpreter as a string\n\
version_info -- version information as a tuple\n\ version_info -- version information as a tuple\n\
...@@ -643,6 +644,9 @@ _PySys_Init(void) ...@@ -643,6 +644,9 @@ _PySys_Init(void)
PyDict_SetItemString(sysdict, "maxint", PyDict_SetItemString(sysdict, "maxint",
v = PyInt_FromLong(PyInt_GetMax())); v = PyInt_FromLong(PyInt_GetMax()));
Py_XDECREF(v); Py_XDECREF(v);
PyDict_SetItemString(sysdict, "maxunicode",
v = PyInt_FromLong(PyUnicode_GetMax()));
Py_XDECREF(v);
PyDict_SetItemString(sysdict, "builtin_module_names", PyDict_SetItemString(sysdict, "builtin_module_names",
v = list_builtin_module_names()); v = list_builtin_module_names());
Py_XDECREF(v); Py_XDECREF(v);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment