Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
e12896ec
Commit
e12896ec
authored
Jul 07, 2000
by
Marc-André Lemburg
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
New surrogate support in the UTF-8 codec. By Bill Tutt.
parent
d6d06ade
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
80 additions
and
29 deletions
+80
-29
Objects/unicodeobject.c
Objects/unicodeobject.c
+80
-29
No files found.
Objects/unicodeobject.c
View file @
e12896ec
...
...
@@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
e
=
s
+
size
;
while
(
s
<
e
)
{
register
Py_UNICODE
ch
=
(
unsigned
char
)
*
s
;
Py_UCS4
ch
=
(
unsigned
char
)
*
s
;
if
(
ch
<
0x80
)
{
*
p
++
=
ch
;
*
p
++
=
(
Py_UNICODE
)
ch
;
s
++
;
continue
;
}
...
...
@@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if
(
ch
<
0x80
)
UTF8_ERROR
(
"illegal encoding"
);
else
*
p
++
=
ch
;
*
p
++
=
(
Py_UNICODE
)
ch
;
break
;
case
3
:
...
...
@@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if
(
ch
<
0x800
||
(
ch
>=
0xd800
&&
ch
<
0xe000
))
UTF8_ERROR
(
"illegal encoding"
);
else
*
p
++
=
ch
;
*
p
++
=
(
Py_UNICODE
)
ch
;
break
;
case
4
:
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
(
s
[
2
]
&
0xc0
)
!=
0x80
||
(
s
[
3
]
&
0xc0
)
!=
0x80
)
UTF8_ERROR
(
"invalid data"
);
ch
=
((
s
[
0
]
&
0x7
)
<<
18
)
+
((
s
[
1
]
&
0x3f
)
<<
12
)
+
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
/* validate and convert to UTF-16 */
if
((
ch
<
0x10000
)
||
/* minimum value allowed for 4 byte encoding */
(
ch
>
0x10ffff
))
/* maximum value allowed for UTF-16 */
UTF8_ERROR
(
"illegal encoding"
);
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
ch
-=
0x10000
;
/* high surrogate = top 10 bits added to D800 */
*
p
++
=
(
Py_UNICODE
)(
0xD800
+
(
ch
>>
10
));
/* low surrogate = bottom 10 bits added to DC00 */
*
p
++
=
(
Py_UNICODE
)(
0xDC00
+
(
ch
&
~
0xFC00
));
break
;
default:
...
...
@@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
PyObject
*
v
;
char
*
p
;
char
*
q
;
Py_UCS4
ch2
;
unsigned
int
cbAllocated
=
3
*
size
;
unsigned
int
cbWritten
=
0
;
int
i
=
0
;
v
=
PyString_FromStringAndSize
(
NULL
,
3
*
size
);
v
=
PyString_FromStringAndSize
(
NULL
,
cbAllocated
);
if
(
v
==
NULL
)
return
NULL
;
if
(
size
==
0
)
goto
done
;
p
=
q
=
PyString_AS_STRING
(
v
);
while
(
size
--
>
0
)
{
Py_U
NICODE
ch
=
*
s
++
;
if
(
ch
<
0x80
)
while
(
i
<
size
)
{
Py_U
CS4
ch
=
s
[
i
++
]
;
if
(
ch
<
0x80
)
{
*
p
++
=
(
char
)
ch
;
cbWritten
++
;
}
else
if
(
ch
<
0x0800
)
{
*
p
++
=
0xc0
|
(
ch
>>
6
);
*
p
++
=
0x80
|
(
ch
&
0x3f
);
}
else
if
(
0xD800
<=
ch
&&
ch
<=
0xDFFF
)
{
/* These byte ranges are reserved for UTF-16 surrogate
bytes which the Python implementation currently does
not support. */
if
(
utf8_encoding_error
(
&
s
,
&
p
,
errors
,
"unsupported code range"
))
cbWritten
+=
2
;
}
else
{
/* Check for high surrogate */
if
(
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
{
if
(
i
!=
size
)
{
ch2
=
s
[
i
];
if
(
0xDC00
<=
ch2
&&
ch2
<=
0xDFFF
)
{
if
(
cbWritten
>=
(
cbAllocated
-
4
))
{
/* Provide enough room for some more
surrogates */
cbAllocated
+=
4
*
10
;
if
(
_PyString_Resize
(
&
v
,
cbAllocated
))
goto
onError
;
}
else
{
*
p
++
=
0xe0
|
(
ch
>>
12
);
*
p
++
=
0x80
|
((
ch
>>
6
)
&
0x3f
);
*
p
++
=
0x80
|
(
ch
&
0x3f
);
}
/* combine the two values */
ch
=
((
ch
-
0xD800
)
<<
10
|
(
ch2
-
0xDC00
))
+
0x10000
;
*
p
++
=
(
char
)((
ch
>>
18
)
|
0xf0
);
*
p
++
=
(
char
)(
0x80
|
(
ch
>>
12
)
&
0x3f
);
i
++
;
cbWritten
+=
4
;
}
}
}
else
{
*
p
++
=
(
char
)(
0xe0
|
(
ch
>>
12
));
cbWritten
+=
3
;
}
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
6
)
&
0x3f
));
*
p
++
=
(
char
)(
0x80
|
(
ch
&
0x3f
));
}
}
*
p
=
'\0'
;
...
...
@@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
{
const
char
*
start
=
s
+
1
;
const
char
*
endBrace
=
start
;
unsigned
int
uiV
alue
;
Py_UCS4
v
alue
;
unsigned
long
j
;
/* look for either the closing brace, or we
...
...
@@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
}
goto
ucnFallthrough
;
}
uiV
alue
=
((
_Py_UnicodeCharacterName
*
)
(
pucnHash
->
getValue
(
j
)))
->
uiV
alue
;
if
(
uiV
alue
<
1
<<
16
)
v
alue
=
((
_Py_UnicodeCharacterName
*
)
(
pucnHash
->
getValue
(
j
)))
->
v
alue
;
if
(
v
alue
<
1
<<
16
)
{
/* In UCS-2 range, easy solution.. */
*
p
++
=
uiV
alue
;
*
p
++
=
v
alue
;
}
else
{
/* Oops, it's in UCS-4 space, */
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
uiV
alue
-=
0x10000
;
v
alue
-=
0x10000
;
/* high surrogate = top 10 bits added to D800 */
*
p
++
=
0xD800
+
(
uiV
alue
>>
10
);
*
p
++
=
0xD800
+
(
v
alue
>>
10
);
/* low surrogate = bottom 10 bits added to DC00 */
*
p
++
=
0xDC00
+
(
uiV
alue
&
~
0xFC00
);
*
p
++
=
0xDC00
+
(
v
alue
&
~
0xFC00
);
}
s
=
endBrace
+
1
;
}
...
...
@@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
static
unsigned
long
utf16Fixup
[
32
]
=
static
short
utf16Fixup
[
32
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0x2000
,
0xf800
,
0xf800
,
0xf800
,
0xf
800
0
,
0
,
0
,
0x2000
,
-
0x800
,
-
0x800
,
-
0x800
,
-
0x
800
};
static
int
...
...
@@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
len2
=
str2
->
length
;
while
(
len1
>
0
&&
len2
>
0
)
{
unsigned
long
c1
,
c2
;
Py_UNICODE
c1
,
c2
;
long
diff
;
c1
=
*
s1
++
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment