Commit 5be978d0 authored by Fredrik Lundh

Changed \x to consume exactly two hex digits, also for Unicode strings.  Closes PEP-223.

Also added \U escape (eight hex digits).
parent b8695c16
...@@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
PyUnicodeObject *v; PyUnicodeObject *v;
Py_UNICODE *p = NULL, *buf = NULL; Py_UNICODE *p = NULL, *buf = NULL;
const char *end; const char *end;
Py_UCS4 chr;
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
...@@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \xXXXX escape with 1-n hex digits. for compatibility /* \xXX with two hex digits */
with 8-bit strings, this code ignores all but the last
two digits */
case 'x': case 'x':
x = 0; for (x = 0, i = 0; i < 2; i++) {
c = (unsigned char)*s; c = (unsigned char)s[i];
if (isxdigit(c)) { if (!isxdigit(c)) {
do { if (unicodeescape_decoding_error(&s, &x, errors,
x = (x<<4) & 0xF0; "truncated \\xXX"))
if ('0' <= c && c <= '9') goto onError;
x += c - '0'; i++;
else if ('a' <= c && c <= 'f') break;
x += 10 + c - 'a'; }
else x = (x<<4) & ~0xF;
x += 10 + c - 'A'; if (c >= '0' && c <= '9')
c = (unsigned char)*++s; x += c - '0';
} while (isxdigit(c)); else if (c >= 'a' && c <= 'f')
*p++ = (unsigned char) x; x += 10 + c - 'a';
} else { else
*p++ = '\\'; x += 10 + c - 'A';
*p++ = (unsigned char)s[-1];
} }
s += i;
*p++ = x;
break; break;
/* \uXXXX with 4 hex digits */ /* \uXXXX with 4 hex digits */
...@@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \UXXXXXXXX with 8 hex digits */
case 'U':
for (chr = 0, i = 0; i < 8; i++) {
c = (unsigned char)s[i];
if (!isxdigit(c)) {
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\uXXXX"))
goto onError;
i++;
break;
}
chr = (chr<<4) & ~0xF;
if (c >= '0' && c <= '9')
chr += c - '0';
else if (c >= 'a' && c <= 'f')
chr += 10 + c - 'a';
else
chr += 10 + c - 'A';
}
s += i;
goto store;
case 'N': case 'N':
/* Ok, we need to deal with Unicode Character Names now, /* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data... * make sure we've imported the hash table data...
*/ */
if (pucnHash == NULL) if (pucnHash == NULL) {
{
PyObject *mod = 0, *v = 0; PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash"); mod = PyImport_ImportModule("ucnhash");
if (mod == NULL) if (mod == NULL)
goto onError; goto onError;
v = PyObject_GetAttrString(mod,"ucnhashAPI"); v = PyObject_GetAttrString(mod,"ucnhashAPI");
Py_DECREF(mod); Py_DECREF(mod);
if (v == NULL) if (v == NULL)
{
goto onError; goto onError;
}
pucnHash = PyCObject_AsVoidPtr(v); pucnHash = PyCObject_AsVoidPtr(v);
Py_DECREF(v); Py_DECREF(v);
if (pucnHash == NULL) if (pucnHash == NULL)
{
goto onError; goto onError;
}
} }
if (*s == '{') if (*s == '{') {
{
const char *start = s + 1; const char *start = s + 1;
const char *endBrace = start; const char *endBrace = start;
Py_UCS4 value;
unsigned long j; unsigned long j;
/* look for either the closing brace, or we /* look for either the closing brace, or we
...@@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
{ {
endBrace++; endBrace++;
} }
if (endBrace != end && *endBrace == '}') if (endBrace != end && *endBrace == '}') {
{
j = pucnHash->hash(start, endBrace - start); j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys || if (j > pucnHash->cKeys ||
mystrnicmp( mystrnicmp(
...@@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
} }
goto ucnFallthrough; goto ucnFallthrough;
} }
value = ((_Py_UnicodeCharacterName *) chr = ((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->value; (pucnHash->getValue(j)))->value;
if (value < 1<<16)
{
/* In UCS-2 range, easy solution.. */
*p++ = value;
}
else
{
/* Oops, its in UCS-4 space, */
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
value -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
*p++ = 0xD800 + (value >> 10);
/* low surrogate = bottom 10 bits added to DC00 */
*p++ = 0xDC00 + (value & ~0xFC00);
}
s = endBrace + 1; s = endBrace + 1;
} goto store;
else } else {
{
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(
&s, &x, errors, &s, &x, errors,
"Unicode name missing closing brace")) "Unicode name missing closing brace"))
...@@ -1363,6 +1357,23 @@ ucnFallthrough: ...@@ -1363,6 +1357,23 @@ ucnFallthrough:
*p++ = '\\'; *p++ = '\\';
*p++ = (unsigned char)s[-1]; *p++ = (unsigned char)s[-1];
break; break;
store:
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
/* UCS-2 character */
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
/* UCS-4 character. store as two surrogate characters */
chr -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Illegal Unicode character")
)
goto onError;
}
} }
} }
if (_PyUnicode_Resize(v, (int)(p - buf))) if (_PyUnicode_Resize(v, (int)(p - buf)))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment