Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
b3316ece
Commit
b3316ece
authored
Oct 30, 2012
by
Victor Stinner
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster.
Patch written by Serhiy Storchaka.
parent
17dfab1b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
73 additions
and
74 deletions
+73
-74
Doc/whatsnew/3.4.rst
Doc/whatsnew/3.4.rst
+1
-1
Misc/NEWS
Misc/NEWS
+3
-0
Objects/unicodeobject.c
Objects/unicodeobject.c
+69
-73
No files found.
Doc/whatsnew/3.4.rst
View file @
b3316ece
...
...
@@ -157,7 +157,7 @@ Optimizations
Major performance enhancements have been added:
*
None yet
.
*
The UTF-32 decoder is now 3x to 4x faster
.
Build and C API Changes
...
...
Misc/NEWS
View file @
b3316ece
...
...
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
Core and Builtins
-----------------
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
written by Serhiy Storchaka.
- Issue #16197: Update winreg docstrings and documentation to match code.
Patch by Zachary Ware.
...
...
Objects/unicodeobject.c
View file @
b3316ece
...
...
@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t
outpos
;
PyObject
*
unicode
;
const
unsigned
char
*
q
,
*
e
;
int
bo
=
0
;
/* assume native ordering by default */
int
le
,
bo
=
0
;
/* assume native ordering by default */
const
char
*
errmsg
=
""
;
/* Offsets from q for retrieving bytes in the right order. */
#if PY_LITTLE_ENDIAN
int
iorder
[]
=
{
0
,
1
,
2
,
3
};
#else
int
iorder
[]
=
{
3
,
2
,
1
,
0
};
#endif
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
...
...
@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM
mark is skipped; in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if
(
bo
==
0
)
{
if
(
size
>=
4
)
{
const
Py_UCS4
bom
=
(
q
[
iorder
[
3
]]
<<
24
)
|
(
q
[
iorder
[
2
]]
<<
16
)
|
(
q
[
iorder
[
1
]]
<<
8
)
|
q
[
iorder
[
0
]];
#if PY_LITTLE_ENDIAN
if
(
bo
==
0
&&
size
>=
4
)
{
Py_UCS4
bom
=
(
q
[
3
]
<<
24
)
|
(
q
[
2
]
<<
16
)
|
(
q
[
1
]
<<
8
)
|
q
[
0
];
if
(
bom
==
0x0000FEFF
)
{
q
+=
4
;
bo
=
-
1
;
}
else
if
(
bom
==
0xFFFE0000
)
{
q
+=
4
;
bo
=
1
;
}
#else
if
(
bom
==
0x0000FEFF
)
{
q
+=
4
;
bo
=
1
;
}
else
if
(
bom
==
0xFFFE0000
)
{
bo
=
1
;
q
+=
4
;
bo
=
-
1
;
}
#endif
}
if
(
byteorder
)
*
byteorder
=
bo
;
}
if
(
bo
==
-
1
)
{
/* force LE */
iorder
[
0
]
=
0
;
iorder
[
1
]
=
1
;
iorder
[
2
]
=
2
;
iorder
[
3
]
=
3
;
}
else
if
(
bo
==
1
)
{
/* force BE */
iorder
[
0
]
=
3
;
iorder
[
1
]
=
2
;
iorder
[
2
]
=
1
;
iorder
[
3
]
=
0
;
if
(
q
==
e
)
{
if
(
consumed
)
*
consumed
=
size
;
Py_INCREF
(
unicode_empty
);
return
unicode_empty
;
}
/* This might be one too many, because of a BOM */
unicode
=
PyUnicode_New
((
size
+
3
)
/
4
,
127
);
#ifdef WORDS_BIGENDIAN
le
=
bo
<
0
;
#else
le
=
bo
<=
0
;
#endif
unicode
=
PyUnicode_New
((
e
-
q
+
3
)
/
4
,
127
);
if
(
!
unicode
)
return
NULL
;
if
(
size
==
0
)
return
unicode
;
outpos
=
0
;
while
(
1
)
{
Py_UCS4
ch
=
0
;
Py_UCS4
maxch
=
PyUnicode_MAX_CHAR_VALUE
(
unicode
);
while
(
q
<
e
)
{
Py_UCS4
ch
;
/* remaining bytes at the end? (size should be divisible by 4) */
if
(
e
-
q
<
4
)
{
if
(
consumed
)
if
(
e
-
q
>=
4
)
{
enum
PyUnicode_Kind
kind
=
PyUnicode_KIND
(
unicode
);
void
*
data
=
PyUnicode_DATA
(
unicode
);
const
unsigned
char
*
last
=
e
-
4
;
if
(
le
)
{
do
{
ch
=
(
q
[
3
]
<<
24
)
|
(
q
[
2
]
<<
16
)
|
(
q
[
1
]
<<
8
)
|
q
[
0
];
if
(
ch
>
maxch
)
break
;
errmsg
=
"truncated data"
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
endinpos
=
((
const
char
*
)
e
)
-
starts
;
goto
utf32Error
;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
ch
);
q
+=
4
;
}
while
(
q
<=
last
);
}
else
{
do
{
ch
=
(
q
[
0
]
<<
24
)
|
(
q
[
1
]
<<
16
)
|
(
q
[
2
]
<<
8
)
|
q
[
3
];
if
(
ch
>
maxch
)
break
;
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
ch
);
q
+=
4
;
}
while
(
q
<=
last
);
}
}
ch
=
(
q
[
iorder
[
3
]]
<<
24
)
|
(
q
[
iorder
[
2
]]
<<
16
)
|
(
q
[
iorder
[
1
]]
<<
8
)
|
q
[
iorder
[
0
]];
if
(
ch
>=
0x110000
)
{
errmsg
=
"codepoint not in range(0x110000)"
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
endinpos
=
startinpos
+
4
;
goto
utf32Error
;
if
(
ch
<=
maxch
)
{
if
(
q
==
e
||
consumed
)
break
;
/* remaining bytes at the end? (size should be divisible by 4) */
errmsg
=
"truncated data"
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
endinpos
=
((
const
char
*
)
e
)
-
starts
;
}
else
{
if
(
ch
<
0x110000
)
{
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
q
+=
4
;
continue
;
utf32Error:
}
errmsg
=
"codepoint not in range(0x110000)"
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
endinpos
=
startinpos
+
4
;
}
/* The remaining input chars are ignored if the callback
chooses to skip the input */
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
"utf32"
,
errmsg
,
...
...
@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
goto
onError
;
}
if
(
byteorder
)
*
byteorder
=
bo
;
if
(
consumed
)
*
consumed
=
(
const
char
*
)
q
-
starts
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment