Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
31be90b0
Commit
31be90b0
authored
Apr 22, 2010
by
Victor Stinner
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handlers producing a
unicode string (e.g. backslashreplace)
parent
29619b2a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
93 additions
and
47 deletions
+93
-47
Lib/test/test_codecs.py
Lib/test/test_codecs.py
+10
-0
Misc/NEWS
Misc/NEWS
+3
-0
Objects/unicodeobject.c
Objects/unicodeobject.c
+80
-47
No files found.
Lib/test/test_codecs.py
View file @
31be90b0
...
...
@@ -571,6 +571,16 @@ class UTF8Test(ReadTest):
def
test_lone_surrogates
(
self
):
self
.
assertRaises
(
UnicodeEncodeError
,
"
\
ud800
"
.
encode
,
"utf-8"
)
self
.
assertRaises
(
UnicodeDecodeError
,
b"
\
xed
\
xa0
\
x80
"
.
decode
,
"utf-8"
)
self
.
assertEqual
(
"[
\
uDC80
]"
.
encode
(
"utf-8"
,
"backslashreplace"
),
b'[
\
\
udc80]'
)
self
.
assertEqual
(
"[
\
uDC80
]"
.
encode
(
"utf-8"
,
"xmlcharrefreplace"
),
b'[�]'
)
self
.
assertEqual
(
"[
\
uDC80
]"
.
encode
(
"utf-8"
,
"surrogateescape"
),
b'[
\
x80
]'
)
self
.
assertEqual
(
"[
\
uDC80
]"
.
encode
(
"utf-8"
,
"ignore"
),
b'[]'
)
self
.
assertEqual
(
"[
\
uDC80
]"
.
encode
(
"utf-8"
,
"replace"
),
b'[?]'
)
def
test_surrogatepass_handler
(
self
):
self
.
assertEquals
(
"abc
\
ud800
def"
.
encode
(
"utf-8"
,
"surrogatepass"
),
...
...
Misc/NEWS
View file @
31be90b0
...
...
@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins
-----------------
- Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handlers producing a
unicode string (e.g. backslashreplace)
- Issue #8485: PyUnicode_FSConverter() doesn't accept bytearray objects anymore;
you have to convert your bytearray filenames to bytes
...
...
Objects/unicodeobject.c
View file @
31be90b0
...
...
@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
const
Py_UNICODE
*
unicode
,
Py_ssize_t
size
,
PyObject
**
exceptionObject
,
Py_ssize_t
startpos
,
Py_ssize_t
endpos
,
Py_ssize_t
*
newpos
);
static
void
raise_encode_exception
(
PyObject
**
exceptionObject
,
const
char
*
encoding
,
const
Py_UNICODE
*
unicode
,
Py_ssize_t
size
,
Py_ssize_t
startpos
,
Py_ssize_t
endpos
,
const
char
*
reason
);
/* Same for linebreaks */
static
unsigned
char
ascii_linebreak
[]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
...
...
@@ -2542,61 +2548,88 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
/* Encode Latin-1 */
*
p
++
=
(
char
)(
0xc0
|
(
ch
>>
6
));
*
p
++
=
(
char
)(
0x80
|
(
ch
&
0x3f
));
}
else
{
/* Encode UCS2 Unicode ordinals */
if
(
ch
<
0x10000
)
{
}
else
if
(
0xD800
<=
ch
&&
ch
<=
0xDFFF
)
{
#ifndef Py_UNICODE_WIDE
/* Special case: check for high surrogate */
if
(
0xD800
<=
ch
&&
ch
<=
0xDBFF
&&
i
!=
size
)
{
Py_UCS4
ch2
=
s
[
i
];
/* Check for low surrogate and combine the two to
form a UCS4 value */
if
(
0xDC00
<=
ch2
&&
ch2
<=
0xDFFF
)
{
ch
=
((
ch
-
0xD800
)
<<
10
|
(
ch2
-
0xDC00
))
+
0x10000
;
i
++
;
goto
encodeUCS4
;
}
/* Fall through: handles isolated high surrogates */
}
/* Special case: check for high and low surrogate */
if
(
ch
<=
0xDBFF
&&
i
!=
size
&&
0xDC00
<=
s
[
i
]
&&
s
[
i
]
<=
0xDFFF
)
{
Py_UCS4
ch2
=
s
[
i
];
/* Combine the two surrogates to form a UCS4 value */
ch
=
((
ch
-
0xD800
)
<<
10
|
(
ch2
-
0xDC00
))
+
0x10000
;
i
++
;
/* Encode UCS4 Unicode ordinals */
*
p
++
=
(
char
)(
0xf0
|
(
ch
>>
18
));
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
12
)
&
0x3f
));
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
6
)
&
0x3f
));
*
p
++
=
(
char
)(
0x80
|
(
ch
&
0x3f
));
#endif
if
(
ch
>=
0xd800
&&
ch
<=
0xdfff
)
{
Py_ssize_t
newpos
;
PyObject
*
rep
;
char
*
prep
;
int
k
;
rep
=
unicode_encode_call_errorhandler
(
errors
,
&
errorHandler
,
"utf-8"
,
"surrogates not allowed"
,
s
,
size
,
&
exc
,
i
-
1
,
i
,
&
newpos
);
if
(
!
rep
)
goto
error
;
/* Implementation limitations: only support error handler that return
bytes, and only support up to four replacement bytes. */
if
(
!
PyBytes_Check
(
rep
))
{
PyErr_SetString
(
PyExc_TypeError
,
"error handler should have returned bytes"
);
Py_DECREF
(
rep
);
}
else
{
Py_ssize_t
newpos
;
PyObject
*
rep
;
Py_ssize_t
repsize
,
k
;
rep
=
unicode_encode_call_errorhandler
(
errors
,
&
errorHandler
,
"utf-8"
,
"surrogates not allowed"
,
s
,
size
,
&
exc
,
i
-
1
,
i
,
&
newpos
);
if
(
!
rep
)
goto
error
;
if
(
PyBytes_Check
(
rep
))
repsize
=
PyBytes_GET_SIZE
(
rep
);
else
repsize
=
PyUnicode_GET_SIZE
(
rep
);
if
(
repsize
>
4
)
{
Py_ssize_t
offset
;
if
(
result
==
NULL
)
offset
=
p
-
stackbuf
;
else
offset
=
p
-
PyBytes_AS_STRING
(
result
);
if
(
nallocated
>
PY_SSIZE_T_MAX
-
repsize
+
4
)
{
/* integer overflow */
PyErr_NoMemory
();
goto
error
;
}
if
(
PyBytes_Size
(
rep
)
>
4
)
{
PyErr_SetString
(
PyExc_TypeError
,
"error handler returned too many bytes"
);
Py_DECREF
(
rep
);
goto
error
;
nallocated
+=
repsize
-
4
;
if
(
result
!=
NULL
)
{
if
(
_PyBytes_Resize
(
&
result
,
nallocated
)
<
0
)
goto
error
;
}
else
{
result
=
PyBytes_FromStringAndSize
(
NULL
,
nallocated
);
if
(
result
==
NULL
)
goto
error
;
Py_MEMCPY
(
PyBytes_AS_STRING
(
result
),
stackbuf
,
offset
);
}
prep
=
PyBytes_AsString
(
rep
);
for
(
k
=
PyBytes_Size
(
rep
);
k
>
0
;
k
--
)
p
=
PyBytes_AS_STRING
(
result
)
+
offset
;
}
if
(
PyBytes_Check
(
rep
))
{
char
*
prep
=
PyBytes_AS_STRING
(
rep
);
for
(
k
=
repsize
;
k
>
0
;
k
--
)
*
p
++
=
*
prep
++
;
Py_DECREF
(
rep
);
continue
;
}
else
/* rep is unicode */
{
Py_UNICODE
*
prep
=
PyUnicode_AS_UNICODE
(
rep
);
Py_UNICODE
c
;
for
(
k
=
0
;
k
<
repsize
;
k
++
)
{
c
=
prep
[
k
];
if
(
0x80
<=
c
)
{
raise_encode_exception
(
&
exc
,
"utf-8"
,
s
,
size
,
i
-
1
,
i
,
"surrogates not allowed"
);
goto
error
;
}
*
p
++
=
(
char
)
prep
[
k
];
}
}
*
p
++
=
(
char
)(
0xe0
|
(
ch
>>
12
));
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
6
)
&
0x3f
));
*
p
++
=
(
char
)(
0x80
|
(
ch
&
0x3f
));
continue
;
Py_DECREF
(
rep
);
}
#ifndef Py_UNICODE_WIDE
encodeUCS4:
#endif
}
else
if
(
ch
<
0x10000
)
{
*
p
++
=
(
char
)(
0xe0
|
(
ch
>>
12
));
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
6
)
&
0x3f
));
*
p
++
=
(
char
)(
0x80
|
(
ch
&
0x3f
));
}
else
/* ch >= 0x10000 */
{
/* Encode UCS4 Unicode ordinals */
*
p
++
=
(
char
)(
0xf0
|
(
ch
>>
18
));
*
p
++
=
(
char
)(
0x80
|
((
ch
>>
12
)
&
0x3f
));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment