Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
ca5f91b8
Commit
ca5f91b8
authored
May 10, 2012
by
Antoine Pitrou
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy Storchaka.
parent
fda08b08
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
316 additions
and
552 deletions
+316
-552
Misc/NEWS
Misc/NEWS
+3
-0
Objects/stringlib/asciilib.h
Objects/stringlib/asciilib.h
+1
-0
Objects/stringlib/codecs.h
Objects/stringlib/codecs.h
+143
-78
Objects/stringlib/ucs1lib.h
Objects/stringlib/ucs1lib.h
+1
-0
Objects/stringlib/ucs2lib.h
Objects/stringlib/ucs2lib.h
+1
-0
Objects/stringlib/ucs4lib.h
Objects/stringlib/ucs4lib.h
+1
-0
Objects/stringlib/undef.h
Objects/stringlib/undef.h
+1
-0
Objects/unicodeobject.c
Objects/unicodeobject.c
+165
-474
No files found.
Misc/NEWS
View file @
ca5f91b8
...
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
...
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins
Core and Builtins
-----------------
-----------------
- Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy
Storchaka.
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
in old-style string formatting.
in old-style string formatting.
...
...
Objects/stringlib/asciilib.h
View file @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0x7Fu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/codecs.h
View file @
ca5f91b8
...
@@ -15,19 +15,18 @@
...
@@ -15,19 +15,18 @@
# error C 'long' size should be either 4 or 8!
# error C 'long' size should be either 4 or 8!
#endif
#endif
Py_LOCAL_INLINE
(
int
)
Py_LOCAL_INLINE
(
Py_UCS4
)
STRINGLIB
(
utf8_
try_decode
)(
const
char
*
start
,
const
char
*
end
,
STRINGLIB
(
utf8_
decode
)(
const
char
**
inptr
,
const
char
*
end
,
STRINGLIB_CHAR
*
dest
,
STRINGLIB_CHAR
*
dest
,
const
char
**
src_pos
,
Py_ssize_t
*
dest_index
)
Py_ssize_t
*
outpos
)
{
{
int
ret
;
Py_UCS4
ch
;
Py_ssize_t
n
;
const
char
*
s
=
*
inptr
;
const
char
*
s
=
start
;
const
char
*
aligned_end
=
(
const
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
const
char
*
aligned_end
=
(
const
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
STRINGLIB_CHAR
*
p
=
dest
;
STRINGLIB_CHAR
*
p
=
dest
+
*
outpos
;
while
(
s
<
end
)
{
while
(
s
<
end
)
{
Py_UCS4
ch
=
(
unsigned
char
)
*
s
;
ch
=
(
unsigned
char
)
*
s
;
if
(
ch
<
0x80
)
{
if
(
ch
<
0x80
)
{
/* Fast path for runs of ASCII characters. Given that common UTF-8
/* Fast path for runs of ASCII characters. Given that common UTF-8
...
@@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
...
@@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
if
(
value
&
ASCII_CHAR_MASK
)
if
(
value
&
ASCII_CHAR_MASK
)
break
;
break
;
_p
[
0
]
=
_s
[
0
];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
_p
[
1
]
=
_s
[
1
];
_p
[
0
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
_p
[
2
]
=
_s
[
2
];
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
3
]
=
_s
[
3
];
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
#if (SIZEOF_LONG == 8)
_p
[
3
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
4
]
=
_s
[
4
];
# if SIZEOF_LONG == 8
_p
[
5
]
=
_s
[
5
];
_p
[
4
]
=
(
STRINGLIB_CHAR
)((
value
>>
32
)
&
0xFFu
);
_p
[
6
]
=
_s
[
6
];
_p
[
5
]
=
(
STRINGLIB_CHAR
)((
value
>>
40
)
&
0xFFu
);
_p
[
7
]
=
_s
[
7
];
_p
[
6
]
=
(
STRINGLIB_CHAR
)((
value
>>
48
)
&
0xFFu
);
_p
[
7
]
=
(
STRINGLIB_CHAR
)((
value
>>
56
)
&
0xFFu
);
# endif
#else
# if SIZEOF_LONG == 8
_p
[
0
]
=
(
STRINGLIB_CHAR
)((
value
>>
56
)
&
0xFFu
);
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
48
)
&
0xFFu
);
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
40
)
&
0xFFu
);
_p
[
3
]
=
(
STRINGLIB_CHAR
)((
value
>>
32
)
&
0xFFu
);
_p
[
4
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
5
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
_p
[
6
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
7
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
# else
_p
[
0
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
3
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
# endif
#endif
#endif
_s
+=
SIZEOF_LONG
;
_s
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
;
...
@@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
...
@@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
break
;
break
;
ch
=
(
unsigned
char
)
*
s
;
ch
=
(
unsigned
char
)
*
s
;
}
}
}
if
(
ch
<
0x80
)
{
if
(
ch
<
0x80
)
{
s
++
;
s
++
;
*
p
++
=
ch
;
*
p
++
=
ch
;
continue
;
continue
;
}
}
}
n
=
utf8_code_length
[
ch
];
if
(
ch
<
0xC2
)
{
/* invalid sequence
\x80-\xBF -- continuation byte
\xC0-\xC1 -- fake 0000-007F */
goto
InvalidStart
;
}
if
(
s
+
n
>
end
)
{
if
(
ch
<
0xE0
)
{
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
Py_UCS4
ch2
;
if
(
end
-
s
<
2
)
{
/* unexpected end of data: the caller will decide whether
/* unexpected end of data: the caller will decide whether
it's an error or not */
it's an error or not */
goto
_error
;
break
;
}
}
ch2
=
(
unsigned
char
)
s
[
1
];
switch
(
n
)
{
if
((
ch2
&
0xC0
)
!=
0x80
)
case
0
:
/* invalid start byte */
goto
_error
;
case
1
:
/* internal error */
goto
_error
;
case
2
:
if
((
s
[
1
]
&
0xc0
)
!=
0x80
)
/* invalid continuation byte */
/* invalid continuation byte */
goto
_error
;
goto
InvalidContinuation
;
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
ch
=
(
ch
<<
6
)
+
ch2
-
((
0xC0
<<
6
)
+
0x80
);
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
s
+=
2
;
s
+=
2
;
if
(
STRINGLIB_MAX_CHAR
<=
0x007F
||
(
STRINGLIB_MAX_CHAR
<
0x07FF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
case
3
:
if
(
ch
<
0xF0
)
{
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
will result in surrogates in range d800-dfff. Surrogates are
Py_UCS4
ch2
,
ch3
;
if
(
end
-
s
<
3
)
{
/* unexpected end of data: the caller will decide whether
it's an error or not */
break
;
}
ch2
=
(
unsigned
char
)
s
[
1
];
ch3
=
(
unsigned
char
)
s
[
2
];
if
((
ch2
&
0xC0
)
!=
0x80
||
(
ch3
&
0xC0
)
!=
0x80
)
{
/* invalid continuation byte */
goto
InvalidContinuation
;
}
if
(
ch
==
0xE0
)
{
if
(
ch2
<
0xA0
)
/* invalid sequence
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
goto
InvalidContinuation
;
}
else
if
(
ch
==
0xED
&&
ch2
>
0x9F
)
{
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
will result in surrogates in range D800-DFFF. Surrogates are
not valid UTF-8 so they are rejected.
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
goto
InvalidContinuation
;
(
s
[
2
]
&
0xc0
)
!=
0x80
||
((
unsigned
char
)
s
[
0
]
==
0xE0
&&
(
unsigned
char
)
s
[
1
]
<
0xA0
)
||
((
unsigned
char
)
s
[
0
]
==
0xED
&&
(
unsigned
char
)
s
[
1
]
>
0x9F
))
{
/* invalid continuation byte */
goto
_error
;
}
}
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
ch
=
(
ch
<<
12
)
+
(
ch2
<<
6
)
+
ch3
-
((
0xE0
<<
12
)
+
(
0x80
<<
6
)
+
0x80
);
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
s
+=
3
;
s
+=
3
;
if
(
STRINGLIB_MAX_CHAR
<=
0x07FF
||
(
STRINGLIB_MAX_CHAR
<
0xFFFF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
case
4
:
if
(
ch
<
0xF5
)
{
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
(
s
[
2
]
&
0xc0
)
!=
0x80
||
Py_UCS4
ch2
,
ch3
,
ch4
;
(
s
[
3
]
&
0xc0
)
!=
0x80
||
if
(
end
-
s
<
4
)
{
((
unsigned
char
)
s
[
0
]
==
0xF0
&&
/* unexpected end of data: the caller will decide whether
(
unsigned
char
)
s
[
1
]
<
0x90
)
||
it's an error or not */
((
unsigned
char
)
s
[
0
]
==
0xF4
&&
break
;
(
unsigned
char
)
s
[
1
]
>
0x8F
))
{
}
ch2
=
(
unsigned
char
)
s
[
1
];
ch3
=
(
unsigned
char
)
s
[
2
];
ch4
=
(
unsigned
char
)
s
[
3
];
if
((
ch2
&
0xC0
)
!=
0x80
||
(
ch3
&
0xC0
)
!=
0x80
||
(
ch4
&
0xC0
)
!=
0x80
)
{
/* invalid continuation byte */
/* invalid continuation byte */
goto
_error
;
goto
InvalidContinuation
;
}
if
(
ch
==
0xF0
)
{
if
(
ch2
<
0x90
)
/* invalid sequence
\xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
goto
InvalidContinuation
;
}
}
ch
=
((
s
[
0
]
&
0x7
)
<<
18
)
+
((
s
[
1
]
&
0x3f
)
<<
12
)
+
else
if
(
ch
==
0xF4
&&
ch2
>
0x8F
)
{
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
/* invalid sequence
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10ffff
));
\xF4\x90\x80\80- -- 110000- overflow */
goto
InvalidContinuation
;
}
ch
=
(
ch
<<
18
)
+
(
ch2
<<
12
)
+
(
ch3
<<
6
)
+
ch4
-
((
0xF0
<<
18
)
+
(
0x80
<<
12
)
+
(
0x80
<<
6
)
+
0x80
);
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10FFFF
));
s
+=
4
;
s
+=
4
;
if
(
STRINGLIB_MAX_CHAR
<=
0xFFFF
||
(
STRINGLIB_MAX_CHAR
<
0x10FFFF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
}
goto
InvalidStart
;
}
}
ret
=
0
;
ch
=
0
;
goto
_ok
;
Overflow:
_error:
Return:
ret
=
-
1
;
*
inptr
=
s
;
_ok:
*
outpos
=
p
-
dest
;
*
src_pos
=
s
;
return
ch
;
*
dest_index
=
p
-
dest
;
InvalidStart:
return
ret
;
ch
=
1
;
goto
Return
;
InvalidContinuation:
ch
=
2
;
goto
Return
;
}
}
#undef LONG_PTR_MASK
#undef LONG_PTR_MASK
...
...
Objects/stringlib/ucs1lib.h
View file @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0xFFu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/ucs2lib.h
View file @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_MAX_CHAR 0xFFFFu
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/ucs4lib.h
View file @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_MAX_CHAR 0x10FFFFu
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/undef.h
View file @
ca5f91b8
#undef FASTSEARCH
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_MAX_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_STR
#undef STRINGLIB_STR
#undef STRINGLIB_LEN
#undef STRINGLIB_LEN
...
...
Objects/unicodeobject.c
View file @
ca5f91b8
...
@@ -4615,28 +4615,6 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
...
@@ -4615,28 +4615,6 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
/* --- UTF-8 Codec -------------------------------------------------------- */
/* --- UTF-8 Codec -------------------------------------------------------- */
static
char
utf8_code_length
[
256
]
=
{
/* Map UTF-8 encoded prefix byte to sequence length. Zero means
illegal prefix. See RFC 3629 for details */
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
/* 00-0F */
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
/* 70-7F */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
/* 80-8F */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
/* B0-BF */
0
,
0
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
/* C0-C1 + C2-CF */
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
/* D0-DF */
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
/* E0-EF */
4
,
4
,
4
,
4
,
4
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
/* F0-F4 + F5-FF */
};
PyObject
*
PyObject
*
PyUnicode_DecodeUTF8
(
const
char
*
s
,
PyUnicode_DecodeUTF8
(
const
char
*
s
,
Py_ssize_t
size
,
Py_ssize_t
size
,
...
@@ -4645,6 +4623,10 @@ PyUnicode_DecodeUTF8(const char *s,
...
@@ -4645,6 +4623,10 @@ PyUnicode_DecodeUTF8(const char *s,
return
PyUnicode_DecodeUTF8Stateful
(
s
,
size
,
errors
,
NULL
);
return
PyUnicode_DecodeUTF8Stateful
(
s
,
size
,
errors
,
NULL
);
}
}
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/ucs1lib.h"
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/undef.h"
...
@@ -4670,378 +4652,171 @@ PyUnicode_DecodeUTF8(const char *s,
...
@@ -4670,378 +4652,171 @@ PyUnicode_DecodeUTF8(const char *s,
# error C 'long' size should be either 4 or 8!
# error C 'long' size should be either 4 or 8!
#endif
#endif
/* Scans a UTF-8 string and returns the maximum character to be expected
static
Py_ssize_t
and the size of the decoded unicode string.
ascii_decode
(
const
char
*
start
,
const
char
*
end
,
Py_UCS1
*
dest
)
{
This function doesn't check for errors, these checks are performed in
const
char
*
p
=
start
;
PyUnicode_DecodeUTF8Stateful.
const
char
*
aligned_end
=
(
const
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
*/
static
Py_UCS4
utf8_scanner
(
const
unsigned
char
*
p
,
Py_ssize_t
string_size
,
Py_ssize_t
*
unicode_size
)
{
Py_ssize_t
char_count
=
0
;
const
unsigned
char
*
end
=
p
+
string_size
;
const
unsigned
char
*
aligned_end
=
(
const
unsigned
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
assert
(
unicode_size
!=
NULL
);
/* By having a cascade of independent loops which fallback onto each
other, we minimize the amount of work done in the average loop
iteration, and we also maximize the CPU's ability to predict
branches correctly (because a given condition will have always the
same boolean outcome except perhaps in the last iteration of the
corresponding loop).
In the general case this brings us rather close to decoding
performance pre-PEP 393, despite the two-pass decoding.
Note that the pure ASCII loop is not duplicated once a non-ASCII
character has been encountered. It is actually a pessimization (by
a significant factor) to use this loop on text with many non-ASCII
characters, and it is important to avoid bad performance on valid
utf-8 data (invalid utf-8 being a different can of worms).
*/
/* ASCII */
#if SIZEOF_LONG <= SIZEOF_VOID_P
for
(;
p
<
end
;
++
p
)
{
assert
(
!
((
size_t
)
dest
&
LONG_PTR_MASK
));
/* Only check value if it's not a ASCII char... */
if
(
*
p
<
0x80
)
{
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if
(
!
((
size_t
)
p
&
LONG_PTR_MASK
))
{
if
(
!
((
size_t
)
p
&
LONG_PTR_MASK
))
{
/* Fast path, see in STRINGLIB(utf8_decode) for
an explanation. */
/* Help register allocation */
/* Help register allocation */
register
const
unsigned
char
*
_p
=
p
;
register
const
char
*
_p
=
p
;
register
Py_UCS1
*
q
=
dest
;
while
(
_p
<
aligned_end
)
{
while
(
_p
<
aligned_end
)
{
unsigned
long
value
=
*
(
unsigned
long
*
)
_p
;
unsigned
long
value
=
*
(
const
unsigned
long
*
)
_p
;
if
(
value
&
ASCII_CHAR_MASK
)
if
(
value
&
ASCII_CHAR_MASK
)
break
;
break
;
*
((
unsigned
long
*
)
q
)
=
value
;
_p
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
;
char_count
+=
SIZEOF_LONG
;
q
+=
SIZEOF_LONG
;
}
}
p
=
_p
;
p
=
_p
;
if
(
p
==
end
)
while
(
p
<
end
)
{
if
((
unsigned
char
)
*
p
&
0x80
)
break
;
break
;
*
q
++
=
*
p
++
;
}
}
return
p
-
start
;
}
}
if
(
*
p
<
0x80
)
#endif
++
char_count
;
while
(
p
<
end
)
{
else
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
goto
_ucs1loop
;
for an explanation. */
}
if
(
!
((
size_t
)
p
&
LONG_PTR_MASK
))
{
*
unicode_size
=
char_count
;
/* Help register allocation */
return
127
;
register
const
char
*
_p
=
p
;
while
(
_p
<
aligned_end
)
{
_ucs1loop:
unsigned
long
value
=
*
(
unsigned
long
*
)
_p
;
for
(;
p
<
end
;
++
p
)
{
if
(
value
&
ASCII_CHAR_MASK
)
if
(
*
p
<
0xc4
)
break
;
char_count
+=
((
*
p
&
0xc0
)
!=
0x80
);
_p
+=
SIZEOF_LONG
;
else
goto
_ucs2loop
;
}
}
*
unicode_size
=
char_count
;
p
=
_p
;
return
255
;
if
(
_p
==
end
)
break
;
_ucs2loop:
for
(;
p
<
end
;
++
p
)
{
if
(
*
p
<
0xf0
)
char_count
+=
((
*
p
&
0xc0
)
!=
0x80
);
else
goto
_ucs4loop
;
}
}
*
unicode_size
=
char_count
;
if
((
unsigned
char
)
*
p
&
0x80
)
return
65535
;
break
;
++
p
;
_ucs4loop:
for
(;
p
<
end
;
++
p
)
{
char_count
+=
((
*
p
&
0xc0
)
!=
0x80
);
}
}
*
unicode_size
=
char_count
;
memcpy
(
dest
,
start
,
p
-
start
)
;
return
65537
;
return
p
-
start
;
}
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
PyObject
*
in case of errors. Implicit parameters: unicode, kind, data, onError.
PyUnicode_DecodeUTF8Stateful
(
const
char
*
s
,
Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value) \
do { \
Py_ssize_t pos = index; \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
unicode_resize(&unicode, pos + pos/8) < 0) \
goto onError; \
if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} while (0)
static
PyObject
*
decode_utf8_errors
(
const
char
*
starts
,
Py_ssize_t
size
,
Py_ssize_t
size
,
const
char
*
errors
,
const
char
*
errors
,
Py_ssize_t
*
consumed
,
Py_ssize_t
*
consumed
)
const
char
*
s
,
PyObject
*
unicode
,
Py_ssize_t
i
)
{
{
int
n
;
PyObject
*
unicode
;
int
k
;
const
char
*
starts
=
s
;
const
char
*
end
=
s
+
size
;
Py_ssize_t
outpos
;
Py_ssize_t
startinpos
;
Py_ssize_t
startinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
const
char
*
e
=
starts
+
size
;
const
char
*
aligned_end
;
const
char
*
errmsg
=
""
;
const
char
*
errmsg
=
""
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
aligned_end
=
(
const
char
*
)
((
size_t
)
e
&
~
LONG_PTR_MASK
);
if
(
size
==
0
)
{
if
(
consumed
)
while
(
s
<
e
)
{
*
consumed
=
0
;
Py_UCS4
ch
=
(
unsigned
char
)
*
s
;
Py_INCREF
(
unicode_empty
);
return
unicode_empty
;
if
(
ch
<
0x80
)
{
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if
(
!
((
size_t
)
s
&
LONG_PTR_MASK
))
{
/* Help register allocation */
register
const
char
*
_s
=
s
;
register
Py_ssize_t
_i
=
i
;
while
(
_s
<
aligned_end
)
{
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
if
(
value
&
ASCII_CHAR_MASK
)
break
;
WRITE_MAYBE_FAIL
(
_i
+
0
,
_s
[
0
]);
WRITE_MAYBE_FAIL
(
_i
+
1
,
_s
[
1
]);
WRITE_MAYBE_FAIL
(
_i
+
2
,
_s
[
2
]);
WRITE_MAYBE_FAIL
(
_i
+
3
,
_s
[
3
]);
#if (SIZEOF_LONG == 8)
WRITE_MAYBE_FAIL
(
_i
+
4
,
_s
[
4
]);
WRITE_MAYBE_FAIL
(
_i
+
5
,
_s
[
5
]);
WRITE_MAYBE_FAIL
(
_i
+
6
,
_s
[
6
]);
WRITE_MAYBE_FAIL
(
_i
+
7
,
_s
[
7
]);
#endif
_s
+=
SIZEOF_LONG
;
_i
+=
SIZEOF_LONG
;
}
s
=
_s
;
i
=
_i
;
if
(
s
==
e
)
break
;
ch
=
(
unsigned
char
)
*
s
;
}
}
}
if
(
ch
<
0x80
)
{
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
WRITE_MAYBE_FAIL
(
i
++
,
ch
);
if
(
size
==
1
&&
(
unsigned
char
)
s
[
0
]
<
128
)
{
s
++
;
if
(
consumed
)
continue
;
*
consumed
=
1
;
return
get_latin1_char
((
unsigned
char
)
s
[
0
]);
}
}
n
=
utf8_code_length
[
ch
];
unicode
=
PyUnicode_New
(
size
,
127
);
if
(
!
unicode
)
return
NULL
;
if
(
s
+
n
>
e
)
{
outpos
=
ascii_decode
(
s
,
end
,
PyUnicode_1BYTE_DATA
(
unicode
));
if
(
consumed
)
s
+=
outpos
;
break
;
while
(
s
<
end
)
{
else
{
Py_UCS4
ch
;
errmsg
=
"unexpected end of data"
;
int
kind
=
PyUnicode_KIND
(
unicode
);
startinpos
=
s
-
starts
;
if
(
kind
==
PyUnicode_1BYTE_KIND
)
{
endinpos
=
startinpos
+
1
;
if
(
PyUnicode_IS_ASCII
(
unicode
))
for
(
k
=
1
;
(
k
<
size
-
startinpos
)
&&
((
s
[
k
]
&
0xC0
)
==
0x80
);
k
++
)
ch
=
asciilib_utf8_decode
(
&
s
,
end
,
endinpos
++
;
PyUnicode_1BYTE_DATA
(
unicode
),
&
outpos
);
goto
utf8Error
;
else
}
ch
=
ucs1lib_utf8_decode
(
&
s
,
end
,
PyUnicode_1BYTE_DATA
(
unicode
),
&
outpos
);
}
else
if
(
kind
==
PyUnicode_2BYTE_KIND
)
{
ch
=
ucs2lib_utf8_decode
(
&
s
,
end
,
PyUnicode_2BYTE_DATA
(
unicode
),
&
outpos
);
}
else
{
assert
(
kind
==
PyUnicode_4BYTE_KIND
);
ch
=
ucs4lib_utf8_decode
(
&
s
,
end
,
PyUnicode_4BYTE_DATA
(
unicode
),
&
outpos
);
}
}
switch
(
n
)
{
switch
(
ch
)
{
case
0
:
case
0
:
errmsg
=
"invalid start byte"
;
if
(
s
==
end
||
consumed
)
startinpos
=
s
-
starts
;
goto
End
;
endinpos
=
startinpos
+
1
;
errmsg
=
"unexpected end of data"
;
goto
utf8Error
;
startinpos
=
s
-
starts
;
case
1
:
errmsg
=
"internal error"
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
goto
utf8Error
;
case
2
:
if
((
s
[
1
]
&
0xc0
)
!=
0x80
)
{
errmsg
=
"invalid continuation byte"
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
goto
utf8Error
;
}
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
WRITE_MAYBE_FAIL
(
i
++
,
ch
);
break
;
case
3
:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
(
s
[
2
]
&
0xc0
)
!=
0x80
||
((
unsigned
char
)
s
[
0
]
==
0xE0
&&
(
unsigned
char
)
s
[
1
]
<
0xA0
)
||
((
unsigned
char
)
s
[
0
]
==
0xED
&&
(
unsigned
char
)
s
[
1
]
>
0x9F
))
{
errmsg
=
"invalid continuation byte"
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
endinpos
=
startinpos
+
1
;
while
(
endinpos
<
size
&&
(
starts
[
endinpos
]
&
0xC0
)
==
0x80
)
/* if s[1] first two bits are 1 and 0, then the invalid
continuation byte is s[2], so increment endinpos by 1,
if not, s[1] is invalid and endinpos doesn't need to
be incremented. */
if
((
s
[
1
]
&
0xC0
)
==
0x80
)
endinpos
++
;
endinpos
++
;
goto
utf8Error
;
}
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
WRITE_MAYBE_FAIL
(
i
++
,
ch
);
break
;
break
;
case
1
:
case
4
:
errmsg
=
"invalid start byte"
;
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
startinpos
=
s
-
starts
;
(
s
[
2
]
&
0xc0
)
!=
0x80
||
endinpos
=
startinpos
+
1
;
(
s
[
3
]
&
0xc0
)
!=
0x80
||
break
;
((
unsigned
char
)
s
[
0
]
==
0xF0
&&
case
2
:
(
unsigned
char
)
s
[
1
]
<
0x90
)
||
((
unsigned
char
)
s
[
0
]
==
0xF4
&&
(
unsigned
char
)
s
[
1
]
>
0x8F
))
{
errmsg
=
"invalid continuation byte"
;
errmsg
=
"invalid continuation byte"
;
startinpos
=
s
-
starts
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
endinpos
=
startinpos
+
1
;
if
((
s
[
1
]
&
0xC0
)
==
0x80
)
{
while
(
endinpos
<
size
&&
(
starts
[
endinpos
]
&
0xC0
)
==
0x80
)
endinpos
++
;
if
((
s
[
2
]
&
0xC0
)
==
0x80
)
endinpos
++
;
endinpos
++
;
}
goto
utf8Error
;
}
ch
=
((
s
[
0
]
&
0x7
)
<<
18
)
+
((
s
[
1
]
&
0x3f
)
<<
12
)
+
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
MAX_UNICODE
));
WRITE_MAYBE_FAIL
(
i
++
,
ch
);
break
;
break
;
}
default:
s
+=
n
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
continue
;
continue
;
}
utf8Error:
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"utf-8"
,
errmsg
,
"utf-8"
,
errmsg
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
nd
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
unicode
,
&
i
))
&
unicode
,
&
outpos
))
goto
onError
;
goto
onError
;
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
aligned_end
=
(
const
char
*
)
((
size_t
)
e
&
~
LONG_PTR_MASK
);
}
}
if
(
consumed
)
*
consumed
=
s
-
starts
;
/* Adjust length and ready string when it contained errors and
End:
is of the old resizable kind. */
if
(
unicode_resize
(
&
unicode
,
outpos
)
<
0
)
if
(
unicode_resize
(
&
unicode
,
i
)
<
0
)
goto
onError
;
unicode_adjust_maxchar
(
&
unicode
);
if
(
unicode
==
NULL
)
goto
onError
;
goto
onError
;
if
(
consumed
)
*
consumed
=
s
-
starts
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
return
unicode
;
return
unicode
;
onError:
onError:
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
unicode
);
Py_XDECREF
(
unicode
);
return
NULL
;
return
NULL
;
}
}
#undef WRITE_MAYBE_FAIL
PyObject
*
PyUnicode_DecodeUTF8Stateful
(
const
char
*
s
,
Py_ssize_t
size
,
const
char
*
errors
,
Py_ssize_t
*
consumed
)
{
Py_UCS4
maxchar
=
0
;
Py_ssize_t
unicode_size
;
int
has_errors
=
0
;
PyObject
*
unicode
;
int
kind
;
void
*
data
;
const
char
*
starts
=
s
;
const
char
*
e
;
Py_ssize_t
i
;
if
(
size
==
0
)
{
if
(
consumed
)
*
consumed
=
0
;
Py_INCREF
(
unicode_empty
);
return
unicode_empty
;
}
maxchar
=
utf8_scanner
((
const
unsigned
char
*
)
s
,
size
,
&
unicode_size
);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if
(
maxchar
<
128
&&
size
==
unicode_size
)
{
if
(
consumed
)
*
consumed
=
size
;
return
unicode_fromascii
((
const
unsigned
char
*
)
s
,
size
);
}
unicode
=
PyUnicode_New
(
unicode_size
,
maxchar
);
if
(
!
unicode
)
return
NULL
;
kind
=
PyUnicode_KIND
(
unicode
);
data
=
PyUnicode_DATA
(
unicode
);
/* Unpack UTF-8 encoded data */
i
=
0
;
e
=
starts
+
size
;
switch
(
kind
)
{
case
PyUnicode_1BYTE_KIND
:
has_errors
=
ucs1lib_utf8_try_decode
(
s
,
e
,
(
Py_UCS1
*
)
data
,
&
s
,
&
i
);
break
;
case
PyUnicode_2BYTE_KIND
:
has_errors
=
ucs2lib_utf8_try_decode
(
s
,
e
,
(
Py_UCS2
*
)
data
,
&
s
,
&
i
);
break
;
case
PyUnicode_4BYTE_KIND
:
has_errors
=
ucs4lib_utf8_try_decode
(
s
,
e
,
(
Py_UCS4
*
)
data
,
&
s
,
&
i
);
break
;
}
if
(
!
has_errors
)
{
/* Ensure the unicode size calculation was correct */
assert
(
i
==
unicode_size
);
assert
(
s
==
e
);
if
(
consumed
)
*
consumed
=
size
;
return
unicode
;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
return
decode_utf8_errors
(
starts
,
size
,
errors
,
consumed
,
s
,
unicode
,
i
);
}
#ifdef __APPLE__
#ifdef __APPLE__
...
@@ -5051,9 +4826,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -5051,9 +4826,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
wchar_t
*
wchar_t
*
_Py_DecodeUTF8_surrogateescape
(
const
char
*
s
,
Py_ssize_t
size
)
_Py_DecodeUTF8_surrogateescape
(
const
char
*
s
,
Py_ssize_t
size
)
{
{
int
n
;
const
char
*
e
;
const
char
*
e
;
wchar_t
*
unicode
,
*
p
;
wchar_t
*
unicode
;
Py_ssize_t
outpos
;
/* Note: size will always be longer than the resulting Unicode
/* Note: size will always be longer than the resulting Unicode
character count */
character count */
...
@@ -5066,86 +4841,33 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
...
@@ -5066,86 +4841,33 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
return
NULL
;
return
NULL
;
/* Unpack UTF-8 encoded data */
/* Unpack UTF-8 encoded data */
p
=
unicode
;
e
=
s
+
size
;
e
=
s
+
size
;
outpos
=
0
;
while
(
s
<
e
)
{
while
(
s
<
e
)
{
Py_UCS4
ch
=
(
unsigned
char
)
*
s
;
Py_UCS4
ch
;
#if SIZEOF_WCHAR_T == 4
if
(
ch
<
0x80
)
{
ch
=
ucs4lib_utf8_decode
(
&
s
,
e
,
(
Py_UCS4
*
)
unicode
,
&
outpos
);
*
p
++
=
(
wchar_t
)
ch
;
#else
s
++
;
ch
=
ucs2lib_utf8_decode
(
&
s
,
e
,
(
Py_UCS2
*
)
unicode
,
&
outpos
);
continue
;
#endif
}
if
(
ch
>
0xFF
)
{
n
=
utf8_code_length
[
ch
];
if
(
s
+
n
>
e
)
{
goto
surrogateescape
;
}
switch
(
n
)
{
case
0
:
case
1
:
goto
surrogateescape
;
case
2
:
if
((
s
[
1
]
&
0xc0
)
!=
0x80
)
goto
surrogateescape
;
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
*
p
++
=
(
wchar_t
)
ch
;
break
;
case
3
:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
(
s
[
2
]
&
0xc0
)
!=
0x80
||
((
unsigned
char
)
s
[
0
]
==
0xE0
&&
(
unsigned
char
)
s
[
1
]
<
0xA0
)
||
((
unsigned
char
)
s
[
0
]
==
0xED
&&
(
unsigned
char
)
s
[
1
]
>
0x9F
))
{
goto
surrogateescape
;
}
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
*
p
++
=
(
wchar_t
)
ch
;
break
;
case
4
:
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
(
s
[
2
]
&
0xc0
)
!=
0x80
||
(
s
[
3
]
&
0xc0
)
!=
0x80
||
((
unsigned
char
)
s
[
0
]
==
0xF0
&&
(
unsigned
char
)
s
[
1
]
<
0x90
)
||
((
unsigned
char
)
s
[
0
]
==
0xF4
&&
(
unsigned
char
)
s
[
1
]
>
0x8F
))
{
goto
surrogateescape
;
}
ch
=
((
s
[
0
]
&
0x7
)
<<
18
)
+
((
s
[
1
]
&
0x3f
)
<<
12
)
+
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
MAX_UNICODE
));
#if SIZEOF_WCHAR_T == 4
#if SIZEOF_WCHAR_T == 4
*
p
++
=
(
wchar_t
)
ch
;
assert
(
0
)
;
#else
#else
assert
(
Py_UNICODE_IS_SURROGATE
(
ch
));
/* compute and append the two surrogates: */
/* compute and append the two surrogates: */
*
p
++
=
(
wchar_t
)
Py_UNICODE_HIGH_SURROGATE
(
ch
);
unicode
[
outpos
++
]
=
(
wchar_t
)
Py_UNICODE_HIGH_SURROGATE
(
ch
);
*
p
++
=
(
wchar_t
)
Py_UNICODE_LOW_SURROGATE
(
ch
);
unicode
[
outpos
++
]
=
(
wchar_t
)
Py_UNICODE_LOW_SURROGATE
(
ch
);
#endif
#endif
}
else
{
if
(
!
ch
&&
s
==
e
)
break
;
break
;
/* surrogateescape */
unicode
[
outpos
++
]
=
0xDC00
+
(
unsigned
char
)
*
s
++
;
}
}
s
+=
n
;
continue
;
surrogateescape:
*
p
++
=
0xDC00
+
ch
;
s
++
;
}
}
*
p
=
L'\0'
;
unicode
[
outpos
]
=
L'\0'
;
return
unicode
;
return
unicode
;
}
}
...
@@ -6970,17 +6692,13 @@ PyUnicode_DecodeASCII(const char *s,
...
@@ -6970,17 +6692,13 @@ PyUnicode_DecodeASCII(const char *s,
const
char
*
errors
)
const
char
*
errors
)
{
{
const
char
*
starts
=
s
;
const
char
*
starts
=
s
;
PyObject
*
v
;
PyObject
*
unicode
;
int
kind
;
int
kind
;
void
*
data
;
void
*
data
;
Py_ssize_t
startinpos
;
Py_ssize_t
startinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
const
char
*
e
;
const
char
*
e
;
int
has_error
;
const
unsigned
char
*
p
=
(
const
unsigned
char
*
)
s
;
const
unsigned
char
*
end
=
p
+
size
;
const
unsigned
char
*
aligned_end
=
(
const
unsigned
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
...
@@ -6993,45 +6711,18 @@ PyUnicode_DecodeASCII(const char *s,
...
@@ -6993,45 +6711,18 @@ PyUnicode_DecodeASCII(const char *s,
if
(
size
==
1
&&
(
unsigned
char
)
s
[
0
]
<
128
)
if
(
size
==
1
&&
(
unsigned
char
)
s
[
0
]
<
128
)
return
get_latin1_char
((
unsigned
char
)
s
[
0
]);
return
get_latin1_char
((
unsigned
char
)
s
[
0
]);
has_error
=
0
;
unicode
=
PyUnicode_New
(
size
,
127
);
while
(
p
<
end
&&
!
has_error
)
{
if
(
unicode
==
NULL
)
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if
(
!
((
size_t
)
p
&
LONG_PTR_MASK
))
{
/* Help register allocation */
register
const
unsigned
char
*
_p
=
p
;
while
(
_p
<
aligned_end
)
{
unsigned
long
value
=
*
(
unsigned
long
*
)
_p
;
if
(
value
&
ASCII_CHAR_MASK
)
{
has_error
=
1
;
break
;
}
_p
+=
SIZEOF_LONG
;
}
if
(
_p
==
end
)
break
;
if
(
has_error
)
break
;
p
=
_p
;
}
if
(
*
p
&
0x80
)
{
has_error
=
1
;
break
;
}
else
{
++
p
;
}
}
if
(
!
has_error
)
return
unicode_fromascii
((
const
unsigned
char
*
)
s
,
size
);
v
=
PyUnicode_New
(
size
,
127
);
if
(
v
==
NULL
)
goto
onError
;
goto
onError
;
kind
=
PyUnicode_KIND
(
v
);
data
=
PyUnicode_DATA
(
v
);
outpos
=
0
;
e
=
s
+
size
;
e
=
s
+
size
;
data
=
PyUnicode_1BYTE_DATA
(
unicode
);
outpos
=
ascii_decode
(
s
,
e
,
(
Py_UCS1
*
)
data
);
if
(
outpos
==
size
)
return
unicode
;
s
+=
outpos
;
kind
=
PyUnicode_1BYTE_KIND
;
while
(
s
<
e
)
{
while
(
s
<
e
)
{
register
unsigned
char
c
=
(
unsigned
char
)
*
s
;
register
unsigned
char
c
=
(
unsigned
char
)
*
s
;
if
(
c
<
128
)
{
if
(
c
<
128
)
{
...
@@ -7045,21 +6736,21 @@ PyUnicode_DecodeASCII(const char *s,
...
@@ -7045,21 +6736,21 @@ PyUnicode_DecodeASCII(const char *s,
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"ascii"
,
"ordinal not in range(128)"
,
"ascii"
,
"ordinal not in range(128)"
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
))
&
unicode
,
&
outpos
))
goto
onError
;
goto
onError
;
kind
=
PyUnicode_KIND
(
v
);
kind
=
PyUnicode_KIND
(
unicode
);
data
=
PyUnicode_DATA
(
v
);
data
=
PyUnicode_DATA
(
unicode
);
}
}
}
}
if
(
unicode_resize
(
&
v
,
outpos
)
<
0
)
if
(
unicode_resize
(
&
unicode
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
return
v
;
return
unicode
;
onError:
onError:
Py_XDECREF
(
v
);
Py_XDECREF
(
unicode
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
return
NULL
;
return
NULL
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment