Commit e12896ec authored by Marc-André Lemburg

New surrogate support in the UTF-8 codec. By Bill Tutt.

parent d6d06ade
...@@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ...@@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
e = s + size; e = s + size;
while (s < e) { while (s < e) {
register Py_UNICODE ch = (unsigned char)*s; Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) { if (ch < 0x80) {
*p++ = ch; *p++ = (Py_UNICODE)ch;
s++; s++;
continue; continue;
} }
...@@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ...@@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if (ch < 0x80) if (ch < 0x80)
UTF8_ERROR("illegal encoding"); UTF8_ERROR("illegal encoding");
else else
*p++ = ch; *p++ = (Py_UNICODE)ch;
break; break;
case 3: case 3:
...@@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ...@@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
UTF8_ERROR("illegal encoding"); UTF8_ERROR("illegal encoding");
else else
*p++ = ch; *p++ = (Py_UNICODE)ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80)
UTF8_ERROR("invalid data");
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
UTF8_ERROR("illegal encoding");
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
/* low surrogate = bottom 10 bits added to DC00 */
*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
break; break;
default: default:
...@@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, ...@@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
PyObject *v; PyObject *v;
char *p; char *p;
char *q; char *q;
Py_UCS4 ch2;
unsigned int cbAllocated = 3 * size;
unsigned int cbWritten = 0;
int i = 0;
v = PyString_FromStringAndSize(NULL, 3 * size); v = PyString_FromStringAndSize(NULL, cbAllocated);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
if (size == 0) if (size == 0)
goto done; goto done;
p = q = PyString_AS_STRING(v); p = q = PyString_AS_STRING(v);
while (size-- > 0) { while (i < size) {
Py_UNICODE ch = *s++; Py_UCS4 ch = s[i++];
if (ch < 0x80) if (ch < 0x80) {
*p++ = (char) ch; *p++ = (char) ch;
cbWritten++;
}
else if (ch < 0x0800) { else if (ch < 0x0800) {
*p++ = 0xc0 | (ch >> 6); *p++ = 0xc0 | (ch >> 6);
*p++ = 0x80 | (ch & 0x3f); *p++ = 0x80 | (ch & 0x3f);
} else if (0xD800 <= ch && ch <= 0xDFFF) { cbWritten += 2;
/* These byte ranges are reserved for UTF-16 surrogate }
bytes which the Python implementation currently does else {
not support. */ /* Check for high surrogate */
if (utf8_encoding_error(&s, &p, errors, if (0xD800 <= ch && ch <= 0xDBFF) {
"unsupported code range")) if (i != size) {
ch2 = s[i];
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
if (cbWritten >= (cbAllocated - 4)) {
/* Provide enough room for some more
surrogates */
cbAllocated += 4*10;
if (_PyString_Resize(&v, cbAllocated))
goto onError; goto onError;
} else { }
*p++ = 0xe0 | (ch >> 12);
*p++ = 0x80 | ((ch >> 6) & 0x3f); /* combine the two values */
*p++ = 0x80 | (ch & 0x3f); ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
*p++ = (char)((ch >> 18) | 0xf0);
*p++ = (char)(0x80 | (ch >> 12) & 0x3f);
i++;
cbWritten += 4;
}
}
}
else {
*p++ = (char)(0xe0 | (ch >> 12));
cbWritten += 3;
}
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
} }
} }
*p = '\0'; *p = '\0';
...@@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
{ {
const char *start = s + 1; const char *start = s + 1;
const char *endBrace = start; const char *endBrace = start;
unsigned int uiValue; Py_UCS4 value;
unsigned long j; unsigned long j;
/* look for either the closing brace, or we /* look for either the closing brace, or we
...@@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
} }
goto ucnFallthrough; goto ucnFallthrough;
} }
uiValue = ((_Py_UnicodeCharacterName *) value = ((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->uiValue; (pucnHash->getValue(j)))->value;
if (uiValue < 1<<16) if (value < 1<<16)
{ {
/* In UCS-2 range, easy solution.. */ /* In UCS-2 range, easy solution.. */
*p++ = uiValue; *p++ = value;
} }
else else
{ {
/* Oops, its in UCS-4 space, */ /* Oops, its in UCS-4 space, */
/* compute and append the two surrogates: */ /* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */ /* translate from 10000..10FFFF to 0..FFFFF */
uiValue -= 0x10000; value -= 0x10000;
/* high surrogate = top 10 bits added to D800 */ /* high surrogate = top 10 bits added to D800 */
*p++ = 0xD800 + (uiValue >> 10); *p++ = 0xD800 + (value >> 10);
/* low surrogate = bottom 10 bits added to DC00 */ /* low surrogate = bottom 10 bits added to DC00 */
*p++ = 0xDC00 + (uiValue & ~0xFC00); *p++ = 0xDC00 + (value & ~0xFC00);
} }
s = endBrace + 1; s = endBrace + 1;
} }
...@@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args) ...@@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
/* gleaned from: */ /* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
static unsigned long utf16Fixup[32] = static short utf16Fixup[32] =
{ {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
}; };
static int static int
...@@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) ...@@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
len2 = str2->length; len2 = str2->length;
while (len1 > 0 && len2 > 0) { while (len1 > 0 && len2 > 0) {
unsigned long c1, c2; Py_UNICODE c1, c2;
long diff; long diff;
c1 = *s1++; c1 = *s1++;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment