Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Kirill Smelkov
cpython
Commits
e822b034
Commit
e822b034
authored
11 years ago
by
Serhiy Storchaka
Browse files
Options
Download
Email Patches
Plain Diff
Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on narrow builds.
parent
5ad35148
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
135 additions
and
55 deletions
+135
-55
Lib/test/test_codeccallbacks.py
Lib/test/test_codeccallbacks.py
+22
-3
Lib/test/test_unicode.py
Lib/test/test_unicode.py
+12
-0
Misc/NEWS
Misc/NEWS
+3
-0
Modules/_testcapimodule.c
Modules/_testcapimodule.c
+1
-1
Objects/unicodeobject.c
Objects/unicodeobject.c
+61
-21
Python/codecs.c
Python/codecs.c
+36
-30
No files found.
Lib/test/test_codeccallbacks.py
View file @
e822b034
...
...
@@ -66,15 +66,34 @@ class CodecCallbackTest(unittest.TestCase):
# replace unencodable characters which numeric character entities.
# For ascii, latin-1 and charmaps this is completely implemented
# in C and should be reasonably fast.
s
=
u
"
\u30b9\u30d1\u30e2
\xe4
nd egg
s
"
s
=
u
"
\u30b9\u30d1\u30e2
\xe4
nd egg
\u0161
"
self
.
assertEqual
(
s
.
encode
(
"ascii"
,
"xmlcharrefreplace"
),
"スパモ änd egg
s
"
"スパモ änd egg
š
"
)
self
.
assertEqual
(
s
.
encode
(
"latin-1"
,
"xmlcharrefreplace"
),
"スパモ
\xe4
nd egg
s
"
"スパモ
\xe4
nd egg
š
"
)
self
.
assertEqual
(
s
.
encode
(
"iso-8859-15"
,
"xmlcharrefreplace"
),
"スパモ
\xe4
nd egg
\xa8
"
)
def
test_xmlcharrefreplace_with_surrogates
(
self
):
tests
=
[(
u
'
\U0001f49d
'
,
'💝'
),
(
u
'
\ud83d
'
,
'�'
),
(
u
'
\udc9d
'
,
'�'
),
(
u
'
\ud83d\udc9d
'
,
'💝'
if
len
(
u
'
\U0001f49d
'
)
>
1
else
'��'
),
]
for
encoding
in
[
'ascii'
,
'latin1'
,
'iso-8859-15'
]:
for
s
,
exp
in
tests
:
self
.
assertEqual
(
s
.
encode
(
encoding
,
'xmlcharrefreplace'
),
exp
,
msg
=
'%r.encode(%r)'
%
(
s
,
encoding
))
self
.
assertEqual
((
s
+
'X'
).
encode
(
encoding
,
'xmlcharrefreplace'
),
exp
+
'X'
,
msg
=
'%r.encode(%r)'
%
(
s
+
'X'
,
encoding
))
def
test_xmlcharnamereplace
(
self
):
# This time use a named character entity for unencodable
...
...
This diff is collapsed.
Click to expand it.
Lib/test/test_unicode.py
View file @
e822b034
...
...
@@ -1658,6 +1658,18 @@ class UnicodeTest(
self
.
assertEqual
(
unicode_encodedecimal
(
u
"123
\u20ac\u0660
"
,
"replace"
),
b
'123?0'
)
def
test_encode_decimal_with_surrogates
(
self
):
from
_testcapi
import
unicode_encodedecimal
tests
=
[(
u
'
\U0001f49d
'
,
'💝'
),
(
u
'
\ud83d
'
,
'�'
),
(
u
'
\udc9d
'
,
'�'
),
(
u
'
\ud83d\udc9d
'
,
'💝'
if
len
(
u
'
\U0001f49d
'
)
>
1
else
'��'
),
]
for
s
,
exp
in
tests
:
self
.
assertEqual
(
unicode_encodedecimal
(
u
"123"
+
s
,
"xmlcharrefreplace"
),
'123'
+
exp
)
def
test_main
():
test_support
.
run_unittest
(
__name__
)
...
...
This diff is collapsed.
Click to expand it.
Misc/NEWS
View file @
e822b034
...
...
@@ -9,6 +9,9 @@ What's New in Python 2.7.6?
Core and Builtins
-----------------
- Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on narrow builds.
- Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise
OverflowError when an argument of %c format is out of range.
...
...
This diff is collapsed.
Click to expand it.
Modules/_testcapimodule.c
View file @
e822b034
...
...
@@ -1118,7 +1118,7 @@ unicode_encodedecimal(PyObject *self, PyObject *args)
if
(
!
PyArg_ParseTuple
(
args
,
"u#|s"
,
&
unicode
,
&
length
,
&
errors
))
return
NULL
;
decimal_length
=
length
*
7
;
/* len('&#
8364
;') */
decimal_length
=
length
*
10
;
/* len('&#
1114111
;') */
decimal
=
PyBytes_FromStringAndSize
(
NULL
,
decimal_length
);
if
(
decimal
==
NULL
)
return
NULL
;
...
...
This diff is collapsed.
Click to expand it.
Objects/unicodeobject.c
View file @
e822b034
...
...
@@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u)
return
PyUnicode_FromStringAndSize
(
u
,
size
);
}
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing one past the last code unit
 * that may be consumed.
 * ('end' is used on narrow builds to detect a lone high surrogate in the
 * last position before 'end'; it is returned unchanged and the code unit
 * stored at 'end' itself is never examined.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  The caller must guarantee ptr < end.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char (by one code unit, or by two
 * when a surrogate pair was joined), so it has side-effects and its
 * arguments are evaluated more than once (take care when combining it with
 * other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide build: every code unit already is a full code point. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: the bounds test comes first so that (ptr)[1] is only read
 * when it still lies inside [ptr, end). */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
#ifdef HAVE_WCHAR_H
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
...
...
@@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
case
4
:
/* xmlcharrefreplace */
respos
=
str
-
PyString_AS_STRING
(
res
);
/* determine replacement size (temporarily (mis)uses p) */
for
(
p
=
collstart
,
repsize
=
0
;
p
<
collend
;
++
p
)
{
if
(
*
p
<
10
)
for
(
p
=
collstart
,
repsize
=
0
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
if
(
ch
<
10
)
repsize
+=
2
+
1
+
1
;
else
if
(
*
p
<
100
)
else
if
(
ch
<
100
)
repsize
+=
2
+
2
+
1
;
else
if
(
*
p
<
1000
)
else
if
(
ch
<
1000
)
repsize
+=
2
+
3
+
1
;
else
if
(
*
p
<
10000
)
else
if
(
ch
<
10000
)
repsize
+=
2
+
4
+
1
;
#ifndef Py_UNICODE_WIDE
else
else
if
(
ch
<
100000
)
repsize
+=
2
+
5
+
1
;
#else
else
if
(
*
p
<
100000
)
repsize
+=
2
+
5
+
1
;
else
if
(
*
p
<
1000000
)
else
if
(
ch
<
1000000
)
repsize
+=
2
+
6
+
1
;
else
repsize
+=
2
+
7
+
1
;
#endif
}
requiredsize
=
respos
+
repsize
+
(
endp
-
collend
);
if
(
requiredsize
>
ressize
)
{
...
...
@@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
ressize
=
requiredsize
;
}
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
{
str
+=
sprintf
(
str
,
"&#%d;"
,
(
int
)
*
p
);
for
(
p
=
collstart
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
str
+=
sprintf
(
str
,
"&#%d;"
,
(
int
)
ch
);
}
p
=
collend
;
break
;
...
...
@@ -4649,11 +4677,20 @@ int charmap_encoding_error(
*
inpos
=
collendpos
;
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement
(temporarily (mis)uses p)
*/
for
(
collpos
=
collstartpos
;
collpos
<
collendpos
;
++
collpos
)
{
/* generate replacement */
for
(
collpos
=
collstartpos
;
collpos
<
collendpos
;)
{
char
buffer
[
2
+
29
+
1
+
1
];
char
*
cp
;
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
p
[
collpos
]);
Py_UCS4
ch
=
p
[
collpos
++
];
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
collpos
<
collendpos
)
&&
(
0xDC00
<=
p
[
collpos
]
&&
p
[
collpos
]
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
p
[
collpos
++
]
&
0x03FF
))
+
0x10000
);
}
#endif
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
ch
);
for
(
cp
=
buffer
;
*
cp
;
++
cp
)
{
x
=
charmapencode_output
(
*
cp
,
mapping
,
res
,
respos
);
if
(
x
==
enc_EXCEPTION
)
...
...
@@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
{
for
(
p
=
collstart
;
p
<
collend
;)
{
char
buffer
[
2
+
29
+
1
+
1
];
char
*
cp
;
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
*
p
);
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
sprintf
(
buffer
,
"&#%d;"
,
(
int
)
ch
);
if
(
charmaptranslate_makespace
(
&
res
,
&
str
,
(
str
-
PyUnicode_AS_UNICODE
(
res
))
+
strlen
(
buffer
)
+
(
endp
-
collend
)))
goto
onError
;
...
...
@@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
break
;
case
4
:
/* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for
(
p
=
collstart
;
p
<
collend
;
++
p
)
output
+=
sprintf
(
output
,
"&#%d;"
,
(
int
)
*
p
);
for
(
p
=
collstart
;
p
<
collend
;)
{
Py_UCS4
ch
=
_Py_UNICODE_NEXT
(
p
,
collend
);
output
+=
sprintf
(
output
,
"&#%d;"
,
ch
);
}
p
=
collend
;
break
;
default:
...
...
This diff is collapsed.
Click to expand it.
Python/codecs.c
View file @
e822b034
...
...
@@ -556,6 +556,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
PyObject
*
res
;
Py_UNICODE
*
p
;
Py_UNICODE
*
startp
;
Py_UNICODE
*
e
;
Py_UNICODE
*
outp
;
int
ressize
;
if
(
PyUnicodeEncodeError_GetStart
(
exc
,
&
start
))
...
...
@@ -565,26 +566,31 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
if
(
!
(
object
=
PyUnicodeEncodeError_GetObject
(
exc
)))
return
NULL
;
startp
=
PyUnicode_AS_UNICODE
(
object
);
for
(
p
=
startp
+
start
,
ressize
=
0
;
p
<
startp
+
end
;
++
p
)
{
if
(
*
p
<
10
)
e
=
startp
+
end
;
for
(
p
=
startp
+
start
,
ressize
=
0
;
p
<
e
;)
{
Py_UCS4
ch
=
*
p
++
;
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
p
<
e
)
&&
(
0xDC00
<=
*
p
&&
*
p
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
*
p
++
&
0x03FF
))
+
0x10000
);
}
#endif
if
(
ch
<
10
)
ressize
+=
2
+
1
+
1
;
else
if
(
*
p
<
100
)
else
if
(
ch
<
100
)
ressize
+=
2
+
2
+
1
;
else
if
(
*
p
<
1000
)
else
if
(
ch
<
1000
)
ressize
+=
2
+
3
+
1
;
else
if
(
*
p
<
10000
)
else
if
(
ch
<
10000
)
ressize
+=
2
+
4
+
1
;
#ifndef Py_UNICODE_WIDE
else
ressize
+=
2
+
5
+
1
;
#else
else
if
(
*
p
<
100000
)
else
if
(
ch
<
100000
)
ressize
+=
2
+
5
+
1
;
else
if
(
*
p
<
1000000
)
else
if
(
ch
<
1000000
)
ressize
+=
2
+
6
+
1
;
else
ressize
+=
2
+
7
+
1
;
#endif
}
/* allocate replacement */
res
=
PyUnicode_FromUnicode
(
NULL
,
ressize
);
...
...
@@ -593,40 +599,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
return
NULL
;
}
/* generate replacement */
for
(
p
=
startp
+
start
,
outp
=
PyUnicode_AS_UNICODE
(
res
);
p
<
startp
+
end
;
++
p
)
{
Py_UNICODE
c
=
*
p
;
for
(
p
=
startp
+
start
,
outp
=
PyUnicode_AS_UNICODE
(
res
);
p
<
e
;)
{
int
digits
;
int
base
;
Py_UCS4
ch
=
*
p
++
;
#ifndef Py_UNICODE_WIDE
if
((
0xD800
<=
ch
&&
ch
<=
0xDBFF
)
&&
(
p
<
startp
+
end
)
&&
(
0xDC00
<=
*
p
&&
*
p
<=
0xDFFF
))
{
ch
=
((((
ch
&
0x03FF
)
<<
10
)
|
((
Py_UCS4
)
*
p
++
&
0x03FF
))
+
0x10000
);
}
#endif
*
outp
++
=
'&'
;
*
outp
++
=
'#'
;
if
(
*
p
<
10
)
{
if
(
ch
<
10
)
{
digits
=
1
;
base
=
1
;
}
else
if
(
*
p
<
100
)
{
else
if
(
ch
<
100
)
{
digits
=
2
;
base
=
10
;
}
else
if
(
*
p
<
1000
)
{
else
if
(
ch
<
1000
)
{
digits
=
3
;
base
=
100
;
}
else
if
(
*
p
<
10000
)
{
else
if
(
ch
<
10000
)
{
digits
=
4
;
base
=
1000
;
}
#ifndef Py_UNICODE_WIDE
else
{
digits
=
5
;
base
=
10000
;
}
#else
else
if
(
*
p
<
100000
)
{
else
if
(
ch
<
100000
)
{
digits
=
5
;
base
=
10000
;
}
else
if
(
*
p
<
1000000
)
{
else
if
(
ch
<
1000000
)
{
digits
=
6
;
base
=
100000
;
}
...
...
@@ -634,10 +641,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
digits
=
7
;
base
=
1000000
;
}
#endif
while
(
digits
-->
0
)
{
*
outp
++
=
'0'
+
c
/
base
;
c
%=
base
;
*
outp
++
=
'0'
+
c
h
/
base
;
c
h
%=
base
;
base
/=
10
;
}
*
outp
++
=
';'
;
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment