Commit ccc7473f authored by Fredrik Lundh

reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it less likely that bug #132817 ever appears again)
parent b95896b2
...@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
const char *errors) const char *errors)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
Py_UNICODE *p = NULL, *buf = NULL; Py_UNICODE *p, *buf;
const char *end; const char *end;
Py_UCS4 chr; char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
length after conversion to the true value. */ length after conversion to the true value. */
...@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
goto onError; goto onError;
if (size == 0) if (size == 0)
return (PyObject *)v; return (PyObject *)v;
p = buf = PyUnicode_AS_UNICODE(v); p = buf = PyUnicode_AS_UNICODE(v);
end = s + size; end = s + size;
while (s < end) { while (s < end) {
unsigned char c; unsigned char c;
Py_UNICODE x; Py_UNICODE x;
int i; int i, digits;
/* Non-escape characters are interpreted as Unicode ordinals */ /* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') { if (*s != '\\') {
*p++ = (unsigned char)*s++; *p++ = (unsigned char) *s++;
continue; continue;
} }
...@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x; *p++ = x;
break; break;
/* \xXX with two hex digits */ /* hex escapes */
/* \xXX */
case 'x': case 'x':
for (x = 0, i = 0; i < 2; i++) { digits = 2;
c = (unsigned char)s[i]; message = "truncated \\xXX escape";
if (!isxdigit(c)) { goto hexescape;
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\xXX"))
goto onError;
i++;
break;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += c - '0';
else if (c >= 'a' && c <= 'f')
x += 10 + c - 'a';
else
x += 10 + c - 'A';
}
s += i;
*p++ = x;
break;
/* \uXXXX with 4 hex digits */ /* \uXXXX */
case 'u': case 'u':
for (x = 0, i = 0; i < 4; i++) { digits = 4;
c = (unsigned char)s[i]; message = "truncated \\uXXXX escape";
if (!isxdigit(c)) { goto hexescape;
if (unicodeescape_decoding_error(&s, &x, errors,
"truncated \\uXXXX"))
goto onError;
i++;
break;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
x += c - '0';
else if (c >= 'a' && c <= 'f')
x += 10 + c - 'a';
else
x += 10 + c - 'A';
}
s += i;
*p++ = x;
break;
/* \UXXXXXXXX with 8 hex digits */ /* \UXXXXXXXX */
case 'U': case 'U':
for (chr = 0, i = 0; i < 8; i++) { digits = 8;
c = (unsigned char)s[i]; message = "truncated \\UXXXXXXXX escape";
hexescape:
chr = 0;
for (i = 0; i < digits; i++) {
c = (unsigned char) s[i];
if (!isxdigit(c)) { if (!isxdigit(c)) {
if (unicodeescape_decoding_error(&s, &x, errors, if (unicodeescape_decoding_error(&s, &x, errors, message))
"truncated \\uXXXX"))
goto onError; goto onError;
chr = x;
i++; i++;
break; break;
} }
...@@ -1230,19 +1204,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1230,19 +1204,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
chr += 10 + c - 'A'; chr += 10 + c - 'A';
} }
s += i; s += i;
goto store; store:
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
/* UCS-2 character */
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
/* UCS-4 character. store as two surrogate characters */
chr -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"illegal Unicode character")
)
goto onError;
*p++ = x; /* store replacement character */
}
break;
/* \N{name} */
case 'N': case 'N':
/* Ok, we need to deal with Unicode Character Names now, message = "malformed \\N character escape";
* make sure we've imported the hash table data...
*/
if (ucnhash_CAPI == NULL) { if (ucnhash_CAPI == NULL) {
PyObject *mod = 0, *v = 0; /* load the unicode data module */
mod = PyImport_ImportModule("unicodedata"); PyObject *m, *v;
if (mod == NULL) m = PyImport_ImportModule("unicodedata");
if (m == NULL)
goto ucnhashError; goto ucnhashError;
v = PyObject_GetAttrString(mod,"ucnhash_CAPI"); v = PyObject_GetAttrString(m, "ucnhash_CAPI");
Py_DECREF(mod); Py_DECREF(m);
if (v == NULL) if (v == NULL)
goto ucnhashError; goto ucnhashError;
ucnhash_CAPI = PyCObject_AsVoidPtr(v); ucnhash_CAPI = PyCObject_AsVoidPtr(v);
...@@ -1250,75 +1242,42 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1250,75 +1242,42 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
if (ucnhash_CAPI == NULL) if (ucnhash_CAPI == NULL)
goto ucnhashError; goto ucnhashError;
} }
if (*s == '{') { if (*s == '{') {
const char *start = s + 1; const char *start = s+1;
const char *endBrace = start;
/* look for the closing brace */ /* look for the closing brace */
while (*endBrace != '}' && endBrace < end) while (*s != '}' && s < end)
endBrace++; s++;
if (endBrace != end && *endBrace == '}') { if (s > start && s < end && *s == '}') {
if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) { /* found a name. look it up in the unicode database */
if (unicodeescape_decoding_error( message = "unknown Unicode character name";
&s, &x, errors, s++;
"Invalid Unicode Character Name") if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
) goto store;
goto onError;
goto ucnFallthrough;
}
s = endBrace + 1;
goto store;
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Unicode name missing closing brace"))
goto onError;
goto ucnFallthrough;
} }
break;
} }
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(&s, &x, errors, message))
&s, &x, errors,
"Missing opening brace for Unicode Character Name escape"))
goto onError; goto onError;
ucnFallthrough: *p++ = x;
/* fall through on purpose */ break;
default:
default:
*p++ = '\\'; *p++ = '\\';
*p++ = (unsigned char)s[-1]; *p++ = (unsigned char)s[-1];
break; break;
store:
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
/* UCS-2 character */
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
/* UCS-4 character. store as two surrogate characters */
chr -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
} else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Illegal Unicode character")
)
goto onError;
}
} }
} }
if (_PyUnicode_Resize(v, (int)(p - buf))) if (_PyUnicode_Resize(v, (int)(p - buf)))
goto onError; goto onError;
return (PyObject *)v; return (PyObject *)v;
ucnhashError: ucnhashError:
PyErr_SetString( PyErr_SetString(
PyExc_UnicodeError, PyExc_UnicodeError,
"\\N escapes not supported (can't load unicodedata module)" "\\N escapes not supported (can't load unicodedata module)"
); );
return NULL; return NULL;
onError: onError:
Py_XDECREF(v); Py_XDECREF(v);
return NULL; return NULL;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment