Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
27f6a3b0
Commit
27f6a3b0
authored
Jun 15, 2012
by
Antoine Pitrou
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #15026: utf-16 encoding is now significantly faster (up to 10x).
Patch by Serhiy Storchaka.
parent
3049f124
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
102 additions
and
49 deletions
+102
-49
Include/unicodeobject.h
Include/unicodeobject.h
+2
-2
Misc/NEWS
Misc/NEWS
+3
-0
Objects/stringlib/codecs.h
Objects/stringlib/codecs.h
+64
-0
Objects/unicodeobject.c
Objects/unicodeobject.c
+33
-47
No files found.
Include/unicodeobject.h
View file @
27f6a3b0
...
...
@@ -188,9 +188,9 @@ typedef unsigned char Py_UCS1;
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800
| (((ch) - 0x10000
) >> 10))
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800
- (0x10000 >> 10) + ((ch
) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00
| (((ch) - 0x10000
) & 0x3FF))
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00
+ ((ch
) & 0x3FF))
/* Check if substring matches at given offset. The offset must be
valid, and the substring must not be empty. */
...
...
Misc/NEWS
View file @
27f6a3b0
...
...
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Beta 1?
Core and Builtins
-----------------
- Issue #15026: utf-16 encoding is now significantly faster (up to 10x).
Patch by Serhiy Storchaka.
- Issue #11022: open() and io.TextIOWrapper are now calling
locale.getpreferredencoding(False) instead of locale.getpreferredencoding()
in text mode if the encoding is not specified. Don't temporarily change the
...
...
Objects/stringlib/codecs.h
View file @
27f6a3b0
...
...
@@ -562,4 +562,68 @@ IllegalSurrogate:
#undef STRIPPED_MASK
#undef SWAB
#undef LONG_PTR_MASK
/* Encode `len` characters starting at `in` as UTF-16 code units written
   through `out`.

   If `native_ordering` is non-zero, each 16-bit unit is stored unchanged;
   otherwise the two bytes of every unit are exchanged before it is stored.
   When the character type can hold values >= 0x10000, such characters are
   emitted as a high/low surrogate pair (two output units).

   No bounds checking is performed on `out`.
   NOTE(review): presumably the caller sizes `out` for `len` units plus one
   extra unit per surrogate pair -- confirm against the call sites. */
Py_LOCAL_INLINE(void)
STRINGLIB(utf16_encode)(unsigned short *out,
                        const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        int native_ordering)
{
    const STRINGLIB_CHAR *in_end = in + len;

    /* SWAB2 exchanges the two bytes of a 16-bit unit.  For 1-byte characters
       the high byte is always zero, so a plain shift is enough. */
#if STRINGLIB_SIZEOF_CHAR == 1
# define SWAB2(CH)  ((CH) << 8)
#else
# define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
#endif

#if STRINGLIB_MAX_CHAR < 0x10000
    /* Every character fits in a single UTF-16 unit: no surrogates needed. */
    if (!native_ordering) {
        /* Swap four units per iteration, then finish the 0-3 leftovers. */
        const STRINGLIB_CHAR *block_end = in + (len & ~(Py_ssize_t)3);

        for (; in < block_end; in += 4, out += 4) {
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
        }
        for (; in < in_end; ++in) {
            *out++ = SWAB2(*in);
        }
    }
    else {
# if STRINGLIB_SIZEOF_CHAR == 2
        /* Input units already have the output width: copy them verbatim. */
        Py_MEMCPY(out, in, 2 * len);
# else
        /* Copy each input character into a 16-bit output unit. */
        _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, in_end, out);
# endif
    }
#else
    /* Characters may be >= 0x10000 and then need a surrogate pair.  The
       byte-order test is kept outside the loops so each loop body stays
       branch-light. */
    if (!native_ordering) {
        for (; in < in_end; ++in) {
            Py_UCS4 cp = *in;

            if (cp >= 0x10000) {
                Py_UCS2 hi = Py_UNICODE_HIGH_SURROGATE(cp);
                Py_UCS2 lo = Py_UNICODE_LOW_SURROGATE(cp);

                out[0] = SWAB2(hi);
                out[1] = SWAB2(lo);
                out += 2;
            }
            else {
                *out++ = SWAB2((Py_UCS2)cp);
            }
        }
    }
    else {
        for (; in < in_end; ++in) {
            Py_UCS4 cp = *in;

            if (cp >= 0x10000) {
                out[0] = Py_UNICODE_HIGH_SURROGATE(cp);
                out[1] = Py_UNICODE_LOW_SURROGATE(cp);
                out += 2;
            }
            else {
                *out++ = cp;
            }
        }
    }
#endif
#undef SWAB2
}
#endif
/* STRINGLIB_IS_UNICODE */
Objects/unicodeobject.c
View file @
27f6a3b0
...
...
@@ -5359,27 +5359,19 @@ _PyUnicode_EncodeUTF16(PyObject *str,
const
char
*
errors
,
int
byteorder
)
{
int
kind
;
void
*
data
;
enum
PyUnicode_Kind
kind
;
const
void
*
data
;
Py_ssize_t
len
;
PyObject
*
v
;
unsigned
char
*
p
;
Py_ssize_t
nsize
,
bytesize
;
Py_ssize_t
i
,
pairs
;
/* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int
ihi
=
1
,
ilo
=
0
;
unsigned
short
*
out
;
Py_ssize_t
bytesize
;
Py_ssize_t
pairs
;
#ifdef WORDS_BIGENDIAN
int
native_ordering
=
byteorder
>=
0
;
#else
int
ihi
=
0
,
ilo
=
1
;
int
native_ordering
=
byteorder
<=
0
;
#endif
#define STORECHAR(CH) \
do { \
p[ihi] = ((CH) >> 8) & 0xff; \
p[ilo] = (CH) & 0xff; \
p += 2; \
} while(0)
if
(
!
PyUnicode_Check
(
str
))
{
PyErr_BadArgument
();
return
NULL
;
...
...
@@ -5391,53 +5383,47 @@ _PyUnicode_EncodeUTF16(PyObject *str,
len
=
PyUnicode_GET_LENGTH
(
str
);
pairs
=
0
;
if
(
kind
==
PyUnicode_4BYTE_KIND
)
for
(
i
=
0
;
i
<
len
;
i
++
)
if
(
PyUnicode_READ
(
kind
,
data
,
i
)
>=
0x10000
)
if
(
kind
==
PyUnicode_4BYTE_KIND
)
{
const
Py_UCS4
*
in
=
(
const
Py_UCS4
*
)
data
;
const
Py_UCS4
*
end
=
in
+
len
;
while
(
in
<
end
)
if
(
*
in
++
>=
0x10000
)
pairs
++
;
/* 2 * (len + pairs + (byteorder == 0)) */
if
(
len
>
PY_SSIZE_T_MAX
-
pairs
-
(
byteorder
==
0
))
return
PyErr_NoMemory
();
nsize
=
len
+
pairs
+
(
byteorder
==
0
);
bytesize
=
nsize
*
2
;
if
(
bytesize
/
2
!=
nsize
)
}
if
(
len
>
PY_SSIZE_T_MAX
/
2
-
pairs
-
(
byteorder
==
0
))
return
PyErr_NoMemory
();
bytesize
=
(
len
+
pairs
+
(
byteorder
==
0
))
*
2
;
v
=
PyBytes_FromStringAndSize
(
NULL
,
bytesize
);
if
(
v
==
NULL
)
return
NULL
;
p
=
(
unsigned
char
*
)
PyBytes_AS_STRING
(
v
);
/* output buffer is 2-bytes aligned */
assert
(((
Py_uintptr_t
)
PyBytes_AS_STRING
(
v
)
&
1
)
==
0
);
out
=
(
unsigned
short
*
)
PyBytes_AS_STRING
(
v
);
if
(
byteorder
==
0
)
STORECHAR
(
0xFEFF
)
;
*
out
++
=
0xFEFF
;
if
(
len
==
0
)
goto
done
;
if
(
byteorder
==
-
1
)
{
/* force LE */
ihi
=
1
;
ilo
=
0
;
switch
(
kind
)
{
case
PyUnicode_1BYTE_KIND
:
{
ucs1lib_utf16_encode
(
out
,
(
const
Py_UCS1
*
)
data
,
len
,
native_ordering
)
;
break
;
}
else
if
(
byteorder
==
1
)
{
/* force BE */
ihi
=
0
;
ilo
=
1
;
case
PyUnicode_2BYTE_KIND
:
{
ucs2lib_utf16_encode
(
out
,
(
const
Py_UCS2
*
)
data
,
len
,
native_ordering
);
break
;
}
for
(
i
=
0
;
i
<
len
;
i
++
)
{
Py_UCS4
ch
=
PyUnicode_READ
(
kind
,
data
,
i
);
Py_UCS4
ch2
=
0
;
if
(
ch
>=
0x10000
)
{
ch2
=
Py_UNICODE_LOW_SURROGATE
(
ch
);
ch
=
Py_UNICODE_HIGH_SURROGATE
(
ch
);
}
STORECHAR
(
ch
);
if
(
ch2
)
STORECHAR
(
ch2
);
case
PyUnicode_4BYTE_KIND
:
{
ucs4lib_utf16_encode
(
out
,
(
const
Py_UCS4
*
)
data
,
len
,
native_ordering
);
break
;
}
default:
assert
(
0
);
}
done:
return
v
;
#undef STORECHAR
}
PyObject
*
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment