Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
4a219a99
Commit
4a219a99
authored
Jan 04, 2014
by
Serhiy Storchaka
Browse files
Options
Browse Files
Download
Plain Diff
Merge heads
parents
a04d6b89
645e59d1
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
62 additions
and
133 deletions
+62
-133
Doc/whatsnew/3.4.rst
Doc/whatsnew/3.4.rst
+1
-3
Misc/NEWS
Misc/NEWS
+0
-2
Objects/stringlib/codecs.h
Objects/stringlib/codecs.h
+0
-87
Objects/unicodeobject.c
Objects/unicodeobject.c
+61
-41
No files found.
Doc/whatsnew/3.4.rst
View file @
4a219a99
...
...
@@ -1213,9 +1213,7 @@ Other Improvements
Significant Optimizations
=========================
* The UTF-32 decoder is now 3x to 4x faster. The UTF-32 encoder is now 1.6x
to 3.5x faster. (Contributed by Serhiy Storchaka in :issue:`14625` and
:issue:`15027`.)
* The UTF-32 decoder is now 3x to 4x faster.
* The cost of hash collisions for sets is now reduced. Each hash table
probe now checks a series of consecutive, adjacent key/hash pairs before
...
...
Misc/NEWS
View file @
4a219a99
...
...
@@ -10,8 +10,6 @@ Release date: 2014-01-05
Core and Builtins
-----------------
- Issue #15027: Rewrite the UTF-32 encoder. It is now 1.6x to 3.5x faster.
- Issue #17432: Drop UCS2 from names of Unicode functions in python3.def.
- Issue #19526: Exclude all new API from the stable ABI. Exceptions can be
...
...
Objects/stringlib/codecs.h
View file @
4a219a99
...
...
@@ -718,93 +718,6 @@ STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
return
len
-
(
end
-
in
+
1
);
#endif
}
/* SWAB4(CH, tmp) evaluates to the code point CH with its four bytes in
   reversed order.  `tmp` names an unsigned 32-bit scratch lvalue used by the
   two- and four-byte variants; the one-byte variant does not touch it. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* High bytes are zero, so a single shift is enough.  CH is converted to an
   unsigned 32-bit type first: a one-byte operand would otherwise be promoted
   to (signed) int, and shifting a value >= 0x80 left by 24 bits overflows
   int, which is undefined behavior. */
# define SWAB4(CH, tmp)  ((PY_UINT32_T)(CH) << 24)
#elif STRINGLIB_SIZEOF_CHAR == 2
/* High bytes are zero: only the two low bytes have to be moved. */
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
#else
/* Swap the bytes inside each 16-bit half, then swap the two halves. */
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
/* Encode the `len` code points starting at `in` as 32-bit units stored
   through `*outptr`.  With a non-zero `native_ordering` each code point is
   stored unchanged; otherwise it is stored byte-swapped via SWAB4.

   On return `*outptr` points just past the last unit written.  The return
   value is `len` when every code point was encoded.  For character widths
   above one byte, encoding stops at the first surrogate (U+D800-U+DFFF) and
   the number of code points encoded before it is returned. */
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        PY_UINT32_T **outptr,
                        int native_ordering)
{
    PY_UINT32_T *dst = *outptr;
    const STRINGLIB_CHAR *const stop = in + len;
    /* End of the part of the input that can be consumed four at a time. */
    const STRINGLIB_CHAR *const quad_stop = in + _Py_SIZE_ROUND_DOWN(len, 4);

    if (native_ordering) {
        /* Fast pass: copy four code points per iteration. */
        for (; in < quad_stop; in += 4, dst += 4) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* The combined test is zero whenever one of the four code
               points is a surrogate (it can also be zero for some
               surrogate-free groups).  Either way the rest of the input is
               handled one code point at a time below. */
            if ((((in[0] ^ 0xd800) &
                  (in[1] ^ 0xd800) &
                  (in[2] ^ 0xd800) &
                  (in[3] ^ 0xd800) & 0xf800) == 0))
                break;
#endif
            dst[0] = in[0];
            dst[1] = in[1];
            dst[2] = in[2];
            dst[3] = in[3];
        }
        /* Slow pass: whatever is left, with an exact surrogate check. */
        while (in < stop) {
            Py_UCS4 ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *dst++ = ch;
        }
    }
    else {
        /* Fast pass: byte-swap four code points per iteration. */
        for (; in < quad_stop; in += 4, dst += 4) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* Scratch variables required by SWAB4 for wide characters. */
            Py_UCS4 t0, t1, t2, t3;
            /* Same conservative surrogate test as in the native branch. */
            if ((((in[0] ^ 0xd800) &
                  (in[1] ^ 0xd800) &
                  (in[2] ^ 0xd800) &
                  (in[3] ^ 0xd800) & 0xf800) == 0))
                break;
#endif
            dst[0] = SWAB4(in[0], t0);
            dst[1] = SWAB4(in[1], t1);
            dst[2] = SWAB4(in[2], t2);
            dst[3] = SWAB4(in[3], t3);
        }
        /* Slow pass: whatever is left, with an exact surrogate check. */
        while (in < stop) {
            Py_UCS4 ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *dst++ = SWAB4(ch, ch);
        }
    }

    *outptr = dst;
    return len;

#if STRINGLIB_SIZEOF_CHAR > 1
  fail:
    /* `in` is already one past the offending surrogate, so the count of
       unprocessed code points is (stop - in) plus the surrogate itself. */
    *outptr = dst;
    return len - (stop - in + 1);
#endif
}
#undef SWAB4
#endif
#endif
/* STRINGLIB_IS_UNICODE */
Objects/unicodeobject.c
View file @
4a219a99
...
...
@@ -5085,22 +5085,32 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const
char
*
errors
,
int
byteorder
)
{
enum
PyUnicode_Kind
kind
;
const
void
*
data
;
int
kind
;
void
*
data
;
Py_ssize_t
len
;
PyObject
*
v
;
PY_UINT32_T
*
out
;
unsigned
char
*
p
;
Py_ssize_t
nsize
,
i
;
/* Offsets from p for storing the four bytes of each code unit in the right order. */
#if PY_LITTLE_ENDIAN
int
native_ordering
=
byteorder
<=
0
;
int
iorder
[]
=
{
0
,
1
,
2
,
3
}
;
#else
int
native_ordering
=
byteorder
>=
0
;
int
iorder
[]
=
{
3
,
2
,
1
,
0
}
;
#endif
const
char
*
encoding
;
Py_ssize_t
nsize
,
pos
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
rep
=
NULL
;
#define STORECHAR(CH) \
do { \
p[iorder[3]] = ((CH) >> 24) & 0xff; \
p[iorder[2]] = ((CH) >> 16) & 0xff; \
p[iorder[1]] = ((CH) >> 8) & 0xff; \
p[iorder[0]] = (CH) & 0xff; \
p += 4; \
} while(0)
if
(
!
PyUnicode_Check
(
str
))
{
PyErr_BadArgument
();
return
NULL
;
...
...
@@ -5111,53 +5121,59 @@ _PyUnicode_EncodeUTF32(PyObject *str,
data
=
PyUnicode_DATA
(
str
);
len
=
PyUnicode_GET_LENGTH
(
str
);
if
(
len
>
PY_SSIZE_T_MAX
/
4
-
(
byteorder
==
0
))
return
PyErr_NoMemory
();
nsize
=
len
+
(
byteorder
==
0
);
if
(
nsize
>
PY_SSIZE_T_MAX
/
4
)
return
PyErr_NoMemory
();
v
=
PyBytes_FromStringAndSize
(
NULL
,
nsize
*
4
);
if
(
v
==
NULL
)
return
NULL
;
/* output buffer is 4-bytes aligned */
assert
(
_Py_IS_ALIGNED
(
PyBytes_AS_STRING
(
v
),
4
));
out
=
(
PY_UINT32_T
*
)
PyBytes_AS_STRING
(
v
);
p
=
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
);
if
(
byteorder
==
0
)
*
out
++
=
0xFEFF
;
STORECHAR
(
0xFEFF
)
;
if
(
len
==
0
)
goto
done
;
return
v
;
if
(
byteorder
==
-
1
)
if
(
byteorder
==
-
1
)
{
/* force LE */
iorder
[
0
]
=
0
;
iorder
[
1
]
=
1
;
iorder
[
2
]
=
2
;
iorder
[
3
]
=
3
;
encoding
=
"utf-32-le"
;
else
if
(
byteorder
==
1
)
}
else
if
(
byteorder
==
1
)
{
/* force BE */
iorder
[
0
]
=
3
;
iorder
[
1
]
=
2
;
iorder
[
2
]
=
1
;
iorder
[
3
]
=
0
;
encoding
=
"utf-32-be"
;
}
else
encoding
=
"utf-32"
;
if
(
kind
==
PyUnicode_1BYTE_KIND
)
{
ucs1lib_utf32_encode
((
const
Py_UCS1
*
)
data
,
len
,
&
out
,
native_ordering
);
goto
done
;
for
(
i
=
0
;
i
<
len
;
i
++
)
STORECHAR
(
PyUnicode_READ
(
kind
,
data
,
i
));
return
v
;
}
pos
=
0
;
while
(
pos
<
len
)
{
for
(
i
=
0
;
i
<
len
;)
{
Py_ssize_t
repsize
,
moreunits
;
if
(
kind
==
PyUnicode_2BYTE_KIND
)
{
pos
+=
ucs2lib_utf32_encode
((
const
Py_UCS2
*
)
data
+
pos
,
len
-
pos
,
&
out
,
native_ordering
);
}
else
{
assert
(
kind
==
PyUnicode_4BYTE_KIND
);
pos
+=
ucs4lib_utf32_encode
((
const
Py_UCS4
*
)
data
+
pos
,
len
-
pos
,
&
out
,
native_ordering
);
Py_UCS4
ch
=
PyUnicode_READ
(
kind
,
data
,
i
);
i
++
;
assert
(
ch
<=
MAX_UNICODE
);
if
(
!
Py_UNICODE_IS_SURROGATE
(
ch
))
{
STORECHAR
(
ch
);
continue
;
}
if
(
pos
==
len
)
break
;
rep
=
unicode_encode_call_errorhandler
(
errors
,
&
errorHandler
,
encoding
,
"surrogates not allowed"
,
str
,
&
exc
,
pos
,
pos
+
1
,
&
pos
);
str
,
&
exc
,
i
-
1
,
i
,
&
i
);
if
(
!
rep
)
goto
error
;
...
...
@@ -5165,7 +5181,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
repsize
=
PyBytes_GET_SIZE
(
rep
);
if
(
repsize
&
3
)
{
raise_encode_exception
(
&
exc
,
encoding
,
str
,
pos
-
1
,
pos
,
str
,
i
-
1
,
i
,
"surrogates not allowed"
);
goto
error
;
}
...
...
@@ -5178,7 +5194,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
moreunits
=
repsize
=
PyUnicode_GET_LENGTH
(
rep
);
if
(
!
PyUnicode_IS_ASCII
(
rep
))
{
raise_encode_exception
(
&
exc
,
encoding
,
str
,
pos
-
1
,
pos
,
str
,
i
-
1
,
i
,
"surrogates not allowed"
);
goto
error
;
}
...
...
@@ -5186,7 +5202,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* four bytes are reserved for each surrogate */
if
(
moreunits
>
1
)
{
Py_ssize_t
outpos
=
out
-
(
PY_UINT32_T
*
)
PyBytes_AS_STRING
(
v
);
Py_ssize_t
outpos
=
p
-
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
);
Py_ssize_t
morebytes
=
4
*
(
moreunits
-
1
);
if
(
PyBytes_GET_SIZE
(
v
)
>
PY_SSIZE_T_MAX
-
morebytes
)
{
/* integer overflow */
...
...
@@ -5195,16 +5211,20 @@ _PyUnicode_EncodeUTF32(PyObject *str,
}
if
(
_PyBytes_Resize
(
&
v
,
PyBytes_GET_SIZE
(
v
)
+
morebytes
)
<
0
)
goto
error
;
out
=
(
PY_UINT32_T
*
)
PyBytes_AS_STRING
(
v
)
+
outpos
;
p
=
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
)
+
outpos
;
}
if
(
PyBytes_Check
(
rep
))
{
Py_MEMCPY
(
out
,
PyBytes_AS_STRING
(
rep
),
repsize
);
out
+=
moreunits
;
Py_MEMCPY
(
p
,
PyBytes_AS_STRING
(
rep
),
repsize
);
p
+=
repsize
;
}
else
/* rep is unicode */
{
const
Py_UCS1
*
repdata
;
assert
(
PyUnicode_KIND
(
rep
)
==
PyUnicode_1BYTE_KIND
);
ucs1lib_utf32_encode
(
PyUnicode_1BYTE_DATA
(
rep
),
repsize
,
&
out
,
native_ordering
);
repdata
=
PyUnicode_1BYTE_DATA
(
rep
);
while
(
repsize
--
)
{
Py_UCS4
ch
=
*
repdata
++
;
STORECHAR
(
ch
);
}
}
Py_CLEAR
(
rep
);
...
...
@@ -5213,12 +5233,11 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* Cut back to size actually needed. This is necessary, for example, when
encoding a string containing isolated surrogates with the 'ignore'
handler. */
nsize
=
(
unsigned
char
*
)
out
-
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
);
nsize
=
p
-
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
);
if
(
nsize
!=
PyBytes_GET_SIZE
(
v
))
_PyBytes_Resize
(
&
v
,
nsize
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
done:
return
v
;
error:
Py_XDECREF
(
rep
);
...
...
@@ -5226,6 +5245,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
Py_XDECREF
(
exc
);
Py_XDECREF
(
v
);
return
NULL
;
#undef STORECHAR
}
PyObject
*
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment