Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
e822b034
Commit
e822b034
authored
Aug 06, 2013
by
Serhiy Storchaka
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on a narrow build.
parent
5ad35148
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
135 additions
and
55 deletions
+135
-55
Lib/test/test_codeccallbacks.py
Lib/test/test_codeccallbacks.py
+22
-3
Lib/test/test_unicode.py
Lib/test/test_unicode.py
+12
-0
Misc/NEWS
Misc/NEWS
+3
-0
Modules/_testcapimodule.c
Modules/_testcapimodule.c
+1
-1
Objects/unicodeobject.c
Objects/unicodeobject.c
+61
-21
Python/codecs.c
Python/codecs.c
+36
-30
No files found.
Lib/test/test_codeccallbacks.py
View file @
e822b034
...
...
@@ -66,15 +66,34 @@ class CodecCallbackTest(unittest.TestCase):
# replace unencodable characters with numeric character entities.
# For ascii, latin-1 and charmaps this is completely implemented
# in C and should be reasonably fast.
s
=
u"
\
u30b9
\
u30d1
\
u30e2
\
xe4
nd egg
s
"
s
=
u"
\
u30b9
\
u30d1
\
u30e2
\
xe4
nd egg
\
u0161
"
self
.
assertEqual
(
s
.
encode
(
"ascii"
,
"xmlcharrefreplace"
),
"&#12473;&#12497;&#12514; &#228;nd egg
s
"
"&#12473;&#12497;&#12514; &#228;nd egg
&#353;
"
)
self
.
assertEqual
(
s
.
encode
(
"latin-1"
,
"xmlcharrefreplace"
),
"&#12473;&#12497;&#12514;
\
xe4
nd egg
s
"
"&#12473;&#12497;&#12514;
\
xe4
nd egg
&#353;
"
)
self
.
assertEqual
(
s
.
encode
(
"iso-8859-15"
,
"xmlcharrefreplace"
),
"&#12473;&#12497;&#12514;
\
xe4
nd egg
\
xa8
"
)
def
test_xmlcharrefreplace_with_surrogates
(
self
):
tests
=
[(
u'
\
U0001f49d
'
,
'&#128157;'
),
(
u'
\
ud83d
'
,
'&#55357;'
),
(
u'
\
udc9d
'
,
'&#56477;'
),
(
u'
\
ud83d
\
udc9d
'
,
'&#128157;'
if
len
(
u'
\
U0001f49d
'
)
>
1
else
'&#55357;&#56477;'
),
]
for
encoding
in
[
'ascii'
,
'latin1'
,
'iso-8859-15'
]:
for
s
,
exp
in
tests
:
self
.
assertEqual
(
s
.
encode
(
encoding
,
'xmlcharrefreplace'
),
exp
,
msg
=
'%r.encode(%r)'
%
(
s
,
encoding
))
self
.
assertEqual
((
s
+
'X'
).
encode
(
encoding
,
'xmlcharrefreplace'
),
exp
+
'X'
,
msg
=
'%r.encode(%r)'
%
(
s
+
'X'
,
encoding
))
def
test_xmlcharnamereplace
(
self
):
# This time use a named character entity for unencodable
...
...
Lib/test/test_unicode.py
View file @
e822b034
...
...
@@ -1658,6 +1658,18 @@ class UnicodeTest(
self
.
assertEqual
(
unicode_encodedecimal
(
u"123
\
u20ac
\
u0660
"
,
"replace"
),
b'123?0'
)
def
test_encode_decimal_with_surrogates
(
self
):
from
_testcapi
import
unicode_encodedecimal
tests
=
[(
u'
\
U0001f49d
'
,
'&#128157;'
),
(
u'
\
ud83d
'
,
'&#55357;'
),
(
u'
\
udc9d
'
,
'&#56477;'
),
(
u'
\
ud83d
\
udc9d
'
,
'&#128157;'
if
len
(
u'
\
U0001f49d
'
)
>
1
else
'&#55357;&#56477;'
),
]
for
s
,
exp
in
tests
:
self
.
assertEqual
(
unicode_encodedecimal
(
u"123"
+
s
,
"xmlcharrefreplace"
),
'123'
+
exp
)
def
test_main
():
test_support
.
run_unittest
(
__name__
)
...
...
Misc/NEWS
View file @
e822b034
...
...
@@ -9,6 +9,9 @@ What's New in Python 2.7.6?
Core and Builtins
-----------------
- Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on a narrow build.
- Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise
OverflowError when an argument of %c format is out of range.
...
...
Modules/_testcapimodule.c
View file @
e822b034
...
...
@@ -1118,7 +1118,7 @@ unicode_encodedecimal(PyObject *self, PyObject *args)
if
(
!
PyArg_ParseTuple
(
args
,
"u#|s"
,
&
unicode
,
&
length
,
&
errors
))
return
NULL
;
decimal_length
=
length
*
7
;
/* len('&#8364;') */
decimal_length
=
length
*
10
;
/* len('&#1114111;') */
decimal
=
PyBytes_FromStringAndSize
(
NULL
,
decimal_length
);
if
(
decimal
==
NULL
)
return
NULL
;
...
...
Objects/unicodeobject.c
View file @
e822b034
...
...
@@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u)
return
PyUnicode_FromStringAndSize
(
u
,
size
);
}
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
* by 'ptr', possibly combining surrogate pairs on narrow builds.
* 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
* that should be returned and 'end' pointing to the end of the buffer.
* ('end' is used on narrow builds to detect a lone surrogate at the
* end of the buffer that should be returned unchanged.)
* The ptr and end arguments should be side-effect free and ptr must be an lvalue.
* The type of the returned char is always Py_UCS4.
*
* Note: the macro advances ptr to next char, so it might have side-effects
* (especially if used with other macros).
*/
/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
#else
#define _Py_UNICODE_NEXT(ptr, end) \
(((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \
_Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \
((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
(Py_UCS4)*(ptr)++)
#endif
#ifdef HAVE_WCHAR_H
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
...
...
@@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
case
4
:
/* xmlcharrefreplace */
respos
=
str
-
PyString_AS_STRING
(
res
);
/* determine replacement size (temporarily (mis)uses p) */
for
(
p
=
collstart
,
repsize
=
0
;
p
<
collend
;
++
p
)
{
if
(
*
p
<
10
)
for
(
p
=
collstart
,
repsize
=
0
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
if
(
ch
<
10
)
repsize
+=
2
+
1
+
1
;
else
if
(
*
p
<
100
)
else
if
(
ch
<
100
)
repsize
+=
2
+
2
+
1
;
else
if
(
*
p
<
1000
)
else
if
(
ch
<
1000
)
repsize
+=
2
+
3
+
1
;
else
if
(
*
p
<
10000
)
else
if
(
ch
<
10000
)
repsize
+=
2
+
4
+
1
;
#ifndef Py_UNICODE_WIDE
else
else
if
(
ch
<
100000
)
repsize
+=
2
+
5
+
1
;
#else
else
if
(
*
p
<
100000
)
repsize
+=
2
+
5
+
1
;
else
if
(
*
p
<
1000000
)
else
if
(
ch
<
1000000
)
repsize
+=
2
+
6
+
1
;
else
repsize
+=
2
+
7
+
1
;
#endif
}
requiredsize
=
respos
+
repsize
+
(
endp
-
collend
);
if
(
requiredsize
>
ressize
)
{
...
...
@@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
ressize
=
requiredsize
;
}
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
{
str
+=
sprintf
(
str
,
"&#%d;"
,
(
int
)
*
p
);
for
(
p
=
collstart
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
str
+=
sprintf
(
str
,
"&#%d;"
,
(
int
)
ch
);
}
p
=
collend
;
break
;
...
...
@@ -4649,11 +4677,20 @@ int charmap_encoding_error(
*
inpos
=
collendpos
;
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement
(temporarily (mis)uses p)
*/
for
(
collpos
=
collstartpos
;
collpos
<
collendpos
;
++
collpos
)
{
/* generate replacement */
for
(
collpos
=
collstartpos
;
collpos
<
collendpos
;)
{
char
buffer
[
2
+
29
+
1
+
1
];
char
*
cp
;
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
p
[
collpos
]);
Py_UCS4
ch
=
p
[
collpos
++
];
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
collpos
<
collendpos
)
&&
(
0xDC00
<=
p
[
collpos
]
&&
p
[
collpos
]
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
p
[
collpos
++
]
&
0x03FF
))
+
0x10000
);
}
#endif
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
ch
);
for
(
cp
=
buffer
;
*
cp
;
++
cp
)
{
x
=
charmapencode_output
(
*
cp
,
mapping
,
res
,
respos
);
if
(
x
==
enc_EXCEPTION
)
...
...
@@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
{
for
(
p
=
collstart
;
p
<
collend
;)
{
char
buffer
[
2
+
29
+
1
+
1
];
char
*
cp
;
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
*
p
);
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
ch
);
if
(
charmaptranslate_makespace
(
&
res
,
&
str
,
(
str
-
PyUnicode_AS_UNICODE
(
res
))
+
strlen
(
buffer
)
+
(
endp
-
collend
)))
goto
onError
;
...
...
@@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
output
+=
sprintf
(
output
,
"&#%d;"
,
(
int
)
*
p
);
for
(
p
=
collstart
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
output
+=
sprintf
(
output
,
"&#%d;"
,
ch
);
}
p
=
collend
;
break
;
default:
...
...
Python/codecs.c
View file @
e822b034
...
...
@@ -556,6 +556,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
PyObject
*
res
;
Py_UNICODE
*
p
;
Py_UNICODE
*
startp
;
Py_UNICODE
*
e
;
Py_UNICODE
*
outp
;
int
ressize
;
if
(
PyUnicodeEncodeError_GetStart
(
exc
,
&
start
))
...
...
@@ -565,26 +566,31 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
if
(
!
(
object
=
PyUnicodeEncodeError_GetObject
(
exc
)))
return
NULL
;
startp
=
PyUnicode_AS_UNICODE
(
object
);
for
(
p
=
startp
+
start
,
ressize
=
0
;
p
<
startp
+
end
;
++
p
)
{
if
(
*
p
<
10
)
e
=
startp
+
end
;
for
(
p
=
startp
+
start
,
ressize
=
0
;
p
<
e
;)
{
Py_UCS4
ch
=
*
p
++
;
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
p
<
e
)
&&
(
0xDC00
<=
*
p
&&
*
p
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
*
p
++
&
0x03FF
))
+
0x10000
);
}
#endif
if
(
ch
<
10
)
ressize
+=
2
+
1
+
1
;
else
if
(
*
p
<
100
)
else
if
(
ch
<
100
)
ressize
+=
2
+
2
+
1
;
else
if
(
*
p
<
1000
)
else
if
(
ch
<
1000
)
ressize
+=
2
+
3
+
1
;
else
if
(
*
p
<
10000
)
else
if
(
ch
<
10000
)
ressize
+=
2
+
4
+
1
;
#ifndef Py_UNICODE_WIDE
else
ressize
+=
2
+
5
+
1
;
#else
else
if
(
*
p
<
100000
)
else
if
(
ch
<
100000
)
ressize
+=
2
+
5
+
1
;
else
if
(
*
p
<
1000000
)
else
if
(
ch
<
1000000
)
ressize
+=
2
+
6
+
1
;
else
ressize
+=
2
+
7
+
1
;
#endif
}
/* allocate replacement */
res
=
PyUnicode_FromUnicode
(
NULL
,
ressize
);
...
...
@@ -593,40 +599,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
return
NULL
;
}
/* generate replacement */
for
(
p
=
startp
+
start
,
outp
=
PyUnicode_AS_UNICODE
(
res
);
p
<
startp
+
end
;
++
p
)
{
Py_UNICODE
c
=
*
p
;
for
(
p
=
startp
+
start
,
outp
=
PyUnicode_AS_UNICODE
(
res
);
p
<
e
;)
{
int
digits
;
int
base
;
Py_UCS4
ch
=
*
p
++
;
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
p
<
startp
+
end
)
&&
(
0xDC00
<=
*
p
&&
*
p
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
*
p
++
&
0x03FF
))
+
0x10000
);
}
#endif
*
outp
++
=
'&'
;
*
outp
++
=
'#'
;
if
(
*
p
<
10
)
{
if
(
ch
<
10
)
{
digits
=
1
;
base
=
1
;
}
else
if
(
*
p
<
100
)
{
else
if
(
ch
<
100
)
{
digits
=
2
;
base
=
10
;
}
else
if
(
*
p
<
1000
)
{
else
if
(
ch
<
1000
)
{
digits
=
3
;
base
=
100
;
}
else
if
(
*
p
<
10000
)
{
else
if
(
ch
<
10000
)
{
digits
=
4
;
base
=
1000
;
}
#ifndef Py_UNICODE_WIDE
else
{
digits
=
5
;
base
=
10000
;
}
#else
else
if
(
*
p
<
100000
)
{
else
if
(
ch
<
100000
)
{
digits
=
5
;
base
=
10000
;
}
else
if
(
*
p
<
1000000
)
{
else
if
(
ch
<
1000000
)
{
digits
=
6
;
base
=
100000
;
}
...
...
@@ -634,10 +641,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
digits
=
7
;
base
=
1000000
;
}
#endif
while
(
digits
-->
0
)
{
*
outp
++
=
'0'
+
c
/
base
;
c
%=
base
;
*
outp
++
=
'0'
+
c
h
/
base
;
c
h
%=
base
;
base
/=
10
;
}
*
outp
++
=
';'
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment