Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
e9b11c1c
Commit
e9b11c1c
authored
Nov 08, 2011
by
Martin v. Löwis
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Change decoders to use Unicode API instead of Py_UNICODE.
parent
d03a4915
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
292 additions
and
420 deletions
+292
-420
Objects/unicodeobject.c
Objects/unicodeobject.c
+292
-420
No files found.
Objects/unicodeobject.c
View file @
e9b11c1c
...
@@ -1512,6 +1512,13 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
...
@@ -1512,6 +1512,13 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
if
(
old_length
==
length
)
if
(
old_length
==
length
)
return
0
;
return
0
;
if
(
length
==
0
)
{
Py_DECREF
(
*
p_unicode
);
*
p_unicode
=
unicode_empty
;
Py_INCREF
(
*
p_unicode
);
return
0
;
}
if
(
!
unicode_resizable
(
unicode
))
{
if
(
!
unicode_resizable
(
unicode
))
{
PyObject
*
copy
=
resize_copy
(
unicode
,
length
);
PyObject
*
copy
=
resize_copy
(
unicode
,
length
);
if
(
copy
==
NULL
)
if
(
copy
==
NULL
)
...
@@ -1540,8 +1547,7 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
...
@@ -1540,8 +1547,7 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
return
-
1
;
return
-
1
;
}
}
unicode
=
*
p_unicode
;
unicode
=
*
p_unicode
;
if
(
unicode
==
NULL
||
!
PyUnicode_Check
(
unicode
)
||
length
<
0
if
(
unicode
==
NULL
||
!
PyUnicode_Check
(
unicode
)
||
length
<
0
)
||
_PyUnicode_KIND
(
unicode
)
!=
PyUnicode_WCHAR_KIND
)
{
{
PyErr_BadInternalCall
();
PyErr_BadInternalCall
();
return
-
1
;
return
-
1
;
...
@@ -1549,6 +1555,36 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
...
@@ -1549,6 +1555,36 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
return
unicode_resize
(
p_unicode
,
length
);
return
unicode_resize
(
p_unicode
,
length
);
}
}
static
int
unicode_widen
(
PyObject
**
p_unicode
,
int
maxchar
)
{
PyObject
*
result
;
assert
(
PyUnicode_IS_READY
(
*
p_unicode
));
if
(
maxchar
<=
PyUnicode_MAX_CHAR_VALUE
(
*
p_unicode
))
return
0
;
result
=
PyUnicode_New
(
PyUnicode_GET_LENGTH
(
*
p_unicode
),
maxchar
);
if
(
result
==
NULL
)
return
-
1
;
PyUnicode_CopyCharacters
(
result
,
0
,
*
p_unicode
,
0
,
PyUnicode_GET_LENGTH
(
*
p_unicode
));
Py_DECREF
(
*
p_unicode
);
*
p_unicode
=
result
;
return
0
;
}
static
int
unicode_putchar
(
PyObject
**
p_unicode
,
Py_ssize_t
*
pos
,
Py_UCS4
ch
)
{
if
(
unicode_widen
(
p_unicode
,
ch
)
<
0
)
return
-
1
;
PyUnicode_WRITE
(
PyUnicode_KIND
(
*
p_unicode
),
PyUnicode_DATA
(
*
p_unicode
),
(
*
pos
)
++
,
ch
);
return
0
;
}
static
PyObject
*
static
PyObject
*
get_latin1_char
(
unsigned
char
ch
)
get_latin1_char
(
unsigned
char
ch
)
{
{
...
@@ -3581,19 +3617,18 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
...
@@ -3581,19 +3617,18 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
const
char
*
encoding
,
const
char
*
reason
,
const
char
*
encoding
,
const
char
*
reason
,
const
char
**
input
,
const
char
**
inend
,
Py_ssize_t
*
startinpos
,
const
char
**
input
,
const
char
**
inend
,
Py_ssize_t
*
startinpos
,
Py_ssize_t
*
endinpos
,
PyObject
**
exceptionObject
,
const
char
**
inptr
,
Py_ssize_t
*
endinpos
,
PyObject
**
exceptionObject
,
const
char
**
inptr
,
PyObject
**
output
,
Py_ssize_t
*
outpos
,
Py_UNICODE
**
outptr
)
PyObject
**
output
,
Py_ssize_t
*
outpos
)
{
{
static
char
*
argparse
=
"O!n;decoding error handler must return (str, int) tuple"
;
static
char
*
argparse
=
"O!n;decoding error handler must return (str, int) tuple"
;
PyObject
*
restuple
=
NULL
;
PyObject
*
restuple
=
NULL
;
PyObject
*
repunicode
=
NULL
;
PyObject
*
repunicode
=
NULL
;
Py_ssize_t
outsize
=
PyUnicode_GET_
SIZE
(
*
output
);
Py_ssize_t
outsize
=
PyUnicode_GET_
LENGTH
(
*
output
);
Py_ssize_t
insize
;
Py_ssize_t
insize
;
Py_ssize_t
requiredsize
;
Py_ssize_t
requiredsize
;
Py_ssize_t
newpos
;
Py_ssize_t
newpos
;
const
Py_UNICODE
*
repptr
;
PyObject
*
inputobj
=
NULL
;
PyObject
*
inputobj
=
NULL
;
Py_ssize_t
rep
size
;
Py_ssize_t
rep
len
;
int
res
=
-
1
;
int
res
=
-
1
;
if
(
*
errorHandler
==
NULL
)
{
if
(
*
errorHandler
==
NULL
)
{
...
@@ -3619,6 +3654,8 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
...
@@ -3619,6 +3654,8 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
}
}
if
(
!
PyArg_ParseTuple
(
restuple
,
argparse
,
&
PyUnicode_Type
,
&
repunicode
,
&
newpos
))
if
(
!
PyArg_ParseTuple
(
restuple
,
argparse
,
&
PyUnicode_Type
,
&
repunicode
,
&
newpos
))
goto
onError
;
goto
onError
;
if
(
PyUnicode_READY
(
repunicode
)
<
0
)
goto
onError
;
/* Copy back the bytes variables, which might have been modified by the
/* Copy back the bytes variables, which might have been modified by the
callback */
callback */
...
@@ -3646,21 +3683,20 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
...
@@ -3646,21 +3683,20 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
have+the replacement+the rest of the string (starting
have+the replacement+the rest of the string (starting
at the new input position), so we won't have to check space
at the new input position), so we won't have to check space
when there are no errors in the rest of the string) */
when there are no errors in the rest of the string) */
repptr
=
PyUnicode_AS_UNICODE
(
repunicode
);
replen
=
PyUnicode_GET_LENGTH
(
repunicode
);
repsize
=
PyUnicode_GET_SIZE
(
repunicode
);
requiredsize
=
*
outpos
+
replen
+
insize
-
newpos
;
requiredsize
=
*
outpos
+
repsize
+
insize
-
newpos
;
if
(
requiredsize
>
outsize
)
{
if
(
requiredsize
>
outsize
)
{
if
(
requiredsize
<
2
*
outsize
)
if
(
requiredsize
<
2
*
outsize
)
requiredsize
=
2
*
outsize
;
requiredsize
=
2
*
outsize
;
if
(
PyUnicode_R
esize
(
output
,
requiredsize
)
<
0
)
if
(
unicode_r
esize
(
output
,
requiredsize
)
<
0
)
goto
onError
;
goto
onError
;
*
outptr
=
PyUnicode_AS_UNICODE
(
*
output
)
+
*
outpos
;
}
}
if
(
unicode_widen
(
output
,
PyUnicode_MAX_CHAR_VALUE
(
repunicode
))
<
0
)
goto
onError
;
*
endinpos
=
newpos
;
*
endinpos
=
newpos
;
*
inptr
=
*
input
+
newpos
;
*
inptr
=
*
input
+
newpos
;
Py_UNICODE_COPY
(
*
outptr
,
repptr
,
repsize
);
PyUnicode_CopyCharacters
(
*
output
,
*
outpos
,
repunicode
,
0
,
replen
);
*
outptr
+=
repsize
;
*
outpos
+=
replen
;
*
outpos
+=
repsize
;
/* we made it! */
/* we made it! */
res
=
0
;
res
=
0
;
...
@@ -3778,17 +3814,17 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3778,17 +3814,17 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
const
char
*
e
;
const
char
*
e
;
PyObject
*
unicode
;
PyObject
*
unicode
;
Py_UNICODE
*
p
;
const
char
*
errmsg
=
""
;
const
char
*
errmsg
=
""
;
int
inShift
=
0
;
int
inShift
=
0
;
Py_
UNICODE
*
shiftOutStart
;
Py_
ssize_t
shiftOutStart
;
unsigned
int
base64bits
=
0
;
unsigned
int
base64bits
=
0
;
unsigned
long
base64buffer
=
0
;
unsigned
long
base64buffer
=
0
;
Py_UNICODE
surrogate
=
0
;
Py_UNICODE
surrogate
=
0
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
unicode
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
/* Start off assuming it's all ASCII. Widen later as necessary. */
unicode
=
PyUnicode_New
(
size
,
127
);
if
(
!
unicode
)
if
(
!
unicode
)
return
NULL
;
return
NULL
;
if
(
size
==
0
)
{
if
(
size
==
0
)
{
...
@@ -3797,12 +3833,11 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3797,12 +3833,11 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
return
unicode
;
return
unicode
;
}
}
p
=
PyUnicode_AS_UNICODE
(
unicode
);
shiftOutStart
=
outpos
=
0
;
shiftOutStart
=
p
;
e
=
s
+
size
;
e
=
s
+
size
;
while
(
s
<
e
)
{
while
(
s
<
e
)
{
Py_U
NICODE
ch
;
Py_U
CS4
ch
;
restart:
restart:
ch
=
(
unsigned
char
)
*
s
;
ch
=
(
unsigned
char
)
*
s
;
...
@@ -3820,13 +3855,10 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3820,13 +3855,10 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
if
(
surrogate
)
{
if
(
surrogate
)
{
/* expecting a second surrogate */
/* expecting a second surrogate */
if
(
outCh
>=
0xDC00
&&
outCh
<=
0xDFFF
)
{
if
(
outCh
>=
0xDC00
&&
outCh
<=
0xDFFF
)
{
#ifdef Py_UNICODE_WIDE
Py_UCS4
ch2
=
(((
surrogate
&
0x3FF
)
<<
10
)
*
p
++
=
(((
surrogate
&
0x3FF
)
<<
10
)
|
(
outCh
&
0x3FF
))
+
0x10000
;
|
(
outCh
&
0x3FF
))
+
0x10000
;
#else
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch2
)
<
0
)
*
p
++
=
surrogate
;
goto
onError
;
*
p
++
=
outCh
;
#endif
surrogate
=
0
;
surrogate
=
0
;
}
}
else
{
else
{
...
@@ -3844,7 +3876,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3844,7 +3876,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
goto
utf7Error
;
goto
utf7Error
;
}
}
else
{
else
{
*
p
++
=
outCh
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
outCh
)
<
0
)
goto
onError
;
}
}
}
}
}
}
...
@@ -3872,7 +3905,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3872,7 +3905,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
if
(
ch
!=
'-'
)
{
if
(
ch
!=
'-'
)
{
/* '-' is absorbed; other terminating
/* '-' is absorbed; other terminating
characters are preserved */
characters are preserved */
*
p
++
=
ch
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
}
}
}
}
}
}
...
@@ -3881,16 +3915,18 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3881,16 +3915,18 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
s
++
;
/* consume '+' */
s
++
;
/* consume '+' */
if
(
s
<
e
&&
*
s
==
'-'
)
{
/* '+-' encodes '+' */
if
(
s
<
e
&&
*
s
==
'-'
)
{
/* '+-' encodes '+' */
s
++
;
s
++
;
*
p
++
=
'+'
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
'+'
)
<
0
)
goto
onError
;
}
}
else
{
/* begin base64-encoded section */
else
{
/* begin base64-encoded section */
inShift
=
1
;
inShift
=
1
;
shiftOutStart
=
p
;
shiftOutStart
=
outpos
;
base64bits
=
0
;
base64bits
=
0
;
}
}
}
}
else
if
(
DECODE_DIRECT
(
ch
))
{
/* character decodes as itself */
else
if
(
DECODE_DIRECT
(
ch
))
{
/* character decodes as itself */
*
p
++
=
ch
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
s
++
;
s
++
;
}
}
else
{
else
{
...
@@ -3901,13 +3937,12 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
...
@@ -3901,13 +3937,12 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
}
}
continue
;
continue
;
utf7Error:
utf7Error:
outpos
=
p
-
PyUnicode_AS_UNICODE
(
unicode
);
endinpos
=
s
-
starts
;
endinpos
=
s
-
starts
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"utf7"
,
errmsg
,
"utf7"
,
errmsg
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
unicode
,
&
outpos
,
&
p
))
&
unicode
,
&
outpos
))
goto
onError
;
goto
onError
;
}
}
...
@@ -3918,13 +3953,12 @@ utf7Error:
...
@@ -3918,13 +3953,12 @@ utf7Error:
if
(
surrogate
||
if
(
surrogate
||
(
base64bits
>=
6
)
||
(
base64bits
>=
6
)
||
(
base64bits
>
0
&&
base64buffer
!=
0
))
{
(
base64bits
>
0
&&
base64buffer
!=
0
))
{
outpos
=
p
-
PyUnicode_AS_UNICODE
(
unicode
);
endinpos
=
size
;
endinpos
=
size
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"utf7"
,
"unterminated shift sequence"
,
"utf7"
,
"unterminated shift sequence"
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
unicode
,
&
outpos
,
&
p
))
&
unicode
,
&
outpos
))
goto
onError
;
goto
onError
;
if
(
s
<
e
)
if
(
s
<
e
)
goto
restart
;
goto
restart
;
...
@@ -3934,7 +3968,7 @@ utf7Error:
...
@@ -3934,7 +3968,7 @@ utf7Error:
/* return state */
/* return state */
if
(
consumed
)
{
if
(
consumed
)
{
if
(
inShift
)
{
if
(
inShift
)
{
p
=
shiftOutStart
;
/* back off output */
outpos
=
shiftOutStart
;
/* back off output */
*
consumed
=
startinpos
;
*
consumed
=
startinpos
;
}
}
else
{
else
{
...
@@ -3942,7 +3976,7 @@ utf7Error:
...
@@ -3942,7 +3976,7 @@ utf7Error:
}
}
}
}
if
(
PyUnicode_Resize
(
&
unicode
,
p
-
PyUnicode_AS_UNICODE
(
unicode
)
)
<
0
)
if
(
unicode_resize
(
&
unicode
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
...
@@ -4208,7 +4242,7 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
...
@@ -4208,7 +4242,7 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
err
=
1
;
err
=
1
;
break
;
break
;
}
}
for
(
cont
=
p
+
1
;
cont
<
(
p
+
n
);
++
cont
)
{
for
(
cont
=
p
+
1
;
cont
<
=
(
p
+
n
);
++
cont
)
{
if
((
*
cont
&
0xc0
)
!=
0x80
)
{
if
((
*
cont
&
0xc0
)
!=
0x80
)
{
err
=
1
;
err
=
1
;
break
;
break
;
...
@@ -4229,19 +4263,23 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
...
@@ -4229,19 +4263,23 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
return
max_char
;
return
max_char
;
}
}
/* Similar to PyUnicode_WRITE but can also write into wstr field
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
of the legacy unicode representation */
in case of errors. Implicit parameters: unicode, kind, data, has_errors,
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
onError. Potential resizing overallocates, so the result needs to shrink
at the end.
*/
#define WRITE_MAYBE_FAIL(index, value) \
do { \
do { \
const int k_ = (kind); \
if (has_errors) { \
if (k_ == PyUnicode_WCHAR_KIND) \
Py_ssize_t pos = index; \
((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
else if (k_ == PyUnicode_1BYTE_KIND) \
unicode_resize(&unicode, pos + pos/8) < 0) \
((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
goto onError; \
else if (k_ == PyUnicode_2BYTE_KIND) \
if (unicode_putchar(&unicode, &pos, value) < 0) \
((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
goto onError; \
} \
else \
else \
((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);
\
PyUnicode_WRITE(kind, data, index, value);
\
} while (0)
} while (0)
PyObject
*
PyObject
*
...
@@ -4266,10 +4304,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4266,10 +4304,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
int
kind
;
int
kind
;
void
*
data
;
void
*
data
;
int
has_errors
;
int
has_errors
;
Py_UNICODE
*
error_outptr
;
#if SIZEOF_WCHAR_T == 2
Py_ssize_t
wchar_offset
=
0
;
#endif
if
(
size
==
0
)
{
if
(
size
==
0
)
{
if
(
consumed
)
if
(
consumed
)
...
@@ -4278,28 +4312,23 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4278,28 +4312,23 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
}
}
maxchar
=
utf8_max_char_size_and_has_errors
(
s
,
size
,
&
unicode_size
,
maxchar
=
utf8_max_char_size_and_has_errors
(
s
,
size
,
&
unicode_size
,
consumed
,
&
has_errors
);
consumed
,
&
has_errors
);
if
(
has_errors
)
{
if
(
has_errors
)
unicode
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
/* maxchar and size computation might be incorrect;
if
(
!
unicode
)
code below widens and resizes as necessary. */
return
NULL
;
unicode
=
PyUnicode_New
(
size
,
127
);
kind
=
PyUnicode_WCHAR_KIND
;
else
data
=
PyUnicode_AS_UNICODE
(
unicode
);
assert
(
data
!=
NULL
);
}
else
{
unicode
=
PyUnicode_New
(
unicode_size
,
maxchar
);
unicode
=
PyUnicode_New
(
unicode_size
,
maxchar
);
if
(
!
unicode
)
if
(
!
unicode
)
return
NULL
;
return
NULL
;
/* When the string is ASCII only, just use memcpy and return.
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
sequence at the end of the ASCII block. */
if
(
maxchar
<
128
&&
size
==
unicode_size
)
{
if
(
!
has_errors
&&
maxchar
<
128
&&
size
==
unicode_size
)
{
Py_MEMCPY
(
PyUnicode_1BYTE_DATA
(
unicode
),
s
,
unicode_size
);
Py_MEMCPY
(
PyUnicode_1BYTE_DATA
(
unicode
),
s
,
unicode_size
);
return
unicode
;
return
unicode
;
}
}
kind
=
PyUnicode_KIND
(
unicode
);
kind
=
PyUnicode_KIND
(
unicode
);
data
=
PyUnicode_DATA
(
unicode
);
data
=
PyUnicode_DATA
(
unicode
);
}
/* Unpack UTF-8 encoded data */
/* Unpack UTF-8 encoded data */
i
=
0
;
i
=
0
;
e
=
s
+
size
;
e
=
s
+
size
;
...
@@ -4327,15 +4356,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4327,15 +4356,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
if
(
value
&
ASCII_CHAR_MASK
)
if
(
value
&
ASCII_CHAR_MASK
)
break
;
break
;
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
0
,
_s
[
0
]);
WRITE_
MAYBE_FAIL
(
_i
+
0
,
_s
[
0
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
1
,
_s
[
1
]);
WRITE_
MAYBE_FAIL
(
_i
+
1
,
_s
[
1
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
2
,
_s
[
2
]);
WRITE_
MAYBE_FAIL
(
_i
+
2
,
_s
[
2
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
3
,
_s
[
3
]);
WRITE_
MAYBE_FAIL
(
_i
+
3
,
_s
[
3
]);
#if (SIZEOF_LONG == 8)
#if (SIZEOF_LONG == 8)
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
4
,
_s
[
4
]);
WRITE_
MAYBE_FAIL
(
_i
+
4
,
_s
[
4
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
5
,
_s
[
5
]);
WRITE_
MAYBE_FAIL
(
_i
+
5
,
_s
[
5
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
6
,
_s
[
6
]);
WRITE_
MAYBE_FAIL
(
_i
+
6
,
_s
[
6
]);
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
_i
+
7
,
_s
[
7
]);
WRITE_
MAYBE_FAIL
(
_i
+
7
,
_s
[
7
]);
#endif
#endif
_s
+=
SIZEOF_LONG
;
_s
+=
SIZEOF_LONG
;
_i
+=
SIZEOF_LONG
;
_i
+=
SIZEOF_LONG
;
...
@@ -4349,7 +4378,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4349,7 +4378,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
}
}
if
(
ch
<
0x80
)
{
if
(
ch
<
0x80
)
{
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
ch
);
WRITE_
MAYBE_FAIL
(
i
++
,
ch
);
s
++
;
s
++
;
continue
;
continue
;
}
}
...
@@ -4392,7 +4421,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4392,7 +4421,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
}
}
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
ch
);
WRITE_
MAYBE_FAIL
(
i
++
,
ch
);
break
;
break
;
case
3
:
case
3
:
...
@@ -4421,7 +4450,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4421,7 +4450,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
}
}
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
WRITE_
FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
ch
);
WRITE_
MAYBE_FAIL
(
i
++
,
ch
);
break
;
break
;
case
4
:
case
4
:
...
@@ -4446,86 +4475,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4446,86 +4475,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10ffff
));
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10ffff
));
/* If the string is flexible or we have native UCS-4, write
WRITE_MAYBE_FAIL
(
i
++
,
ch
);
directly.. */
if
(
sizeof
(
Py_UNICODE
)
>
2
||
kind
!=
PyUnicode_WCHAR_KIND
)
WRITE_FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
ch
);
else
{
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */
ch
-=
0x10000
;
/* high surrogate = top 10 bits added to D800 */
WRITE_FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
(
Py_UNICODE
)(
0xD800
+
(
ch
>>
10
)));
/* low surrogate = bottom 10 bits added to DC00 */
WRITE_FLEXIBLE_OR_WSTR
(
kind
,
data
,
i
++
,
(
Py_UNICODE
)(
0xDC00
+
(
ch
&
0x03FF
)));
}
#if SIZEOF_WCHAR_T == 2
wchar_offset
++
;
#endif
break
;
break
;
}
}
s
+=
n
;
s
+=
n
;
continue
;
continue
;
utf8Error:
utf8Error:
/* If this is not yet a resizable string, make it one.. */
if
(
!
has_errors
)
{
if
(
kind
!=
PyUnicode_WCHAR_KIND
)
{
PyObject
*
tmp
;
const
Py_UNICODE
*
u
;
Py_ssize_t
k
;
PyObject
*
new_unicode
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
/* We encountered some error that wasn't detected in the original scan,
if
(
!
new_unicode
)
e.g. an encoded surrogate character. The original maxchar computation may
goto
onError
;
have been incorrect, so redo it now. */
u
=
PyUnicode_AsUnicode
(
unicode
);
for
(
k
=
0
,
maxchar
=
0
;
k
<
i
;
k
++
)
if
(
!
u
)
maxchar
=
Py_MAX
(
maxchar
,
PyUnicode_READ
(
kind
,
data
,
k
));
goto
onError
;
tmp
=
PyUnicode_New
(
PyUnicode_GET_LENGTH
(
unicode
),
maxchar
);
#if SIZEOF_WCHAR_T == 2
if
(
tmp
==
NULL
)
i
+=
wchar_offset
;
goto
onError
;
#endif
PyUnicode_CopyCharacters
(
tmp
,
0
,
unicode
,
0
,
i
);
Py_UNICODE_COPY
(
PyUnicode_AS_UNICODE
(
new_unicode
),
u
,
i
);
Py_DECREF
(
unicode
);
Py_DECREF
(
unicode
);
unicode
=
new_unicode
;
unicode
=
tmp
;
kind
=
0
;
has_errors
=
1
;
data
=
PyUnicode_AS_UNICODE
(
new_unicode
);
assert
(
data
!=
NULL
);
}
}
error_outptr
=
PyUnicode_AS_UNICODE
(
unicode
)
+
i
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"utf8"
,
errmsg
,
"utf8"
,
errmsg
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
unicode
,
&
i
,
&
error_outptr
))
&
unicode
,
&
i
))
goto
onError
;
goto
onError
;
/* Update data because unicode_decode_call_errorhandler might have
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
re-created or resized the unicode object. */
data
=
PyUnicode_AS_UNICODE
(
unicode
);
data
=
PyUnicode_DATA
(
unicode
);
kind
=
PyUnicode_KIND
(
unicode
);
aligned_end
=
(
const
char
*
)
((
size_t
)
e
&
~
LONG_PTR_MASK
);
aligned_end
=
(
const
char
*
)
((
size_t
)
e
&
~
LONG_PTR_MASK
);
}
}
/* Ensure the unicode_size calculation above was correct: */
/* Ensure the unicode_size calculation above was correct: */
assert
(
kind
==
PyUnicode_WCHAR_KIND
||
i
==
unicode_size
);
assert
(
has_errors
||
i
==
unicode_size
);
if
(
consumed
)
if
(
consumed
)
*
consumed
=
s
-
starts
;
*
consumed
=
s
-
starts
;
/* Adjust length and ready string when it contained errors and
/* Adjust length and ready string when it contained errors and
is of the old resizable kind. */
is of the old resizable kind. */
if
(
kind
==
PyUnicode_WCHAR_KIND
)
{
if
(
has_errors
)
{
if
(
PyUnicode_Resize
(
&
unicode
,
i
)
<
0
)
if
(
PyUnicode_Resize
(
&
unicode
,
i
)
<
0
)
goto
onError
;
goto
onError
;
}
}
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
unicode
))
{
Py_DECREF
(
unicode
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
return
unicode
;
return
unicode
;
...
@@ -4536,7 +4535,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
...
@@ -4536,7 +4535,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
return
NULL
;
return
NULL
;
}
}
#undef WRITE_
FLEXIBLE_OR_WSTR
#undef WRITE_
MAYBE_FAIL
#ifdef __APPLE__
#ifdef __APPLE__
...
@@ -4871,13 +4870,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4871,13 +4870,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
PyObject
*
unicode
;
PyObject
*
unicode
;
Py_UNICODE
*
p
;
#ifndef Py_UNICODE_WIDE
int
pairs
=
0
;
const
unsigned
char
*
qq
;
#else
const
int
pairs
=
0
;
#endif
const
unsigned
char
*
q
,
*
e
;
const
unsigned
char
*
q
,
*
e
;
int
bo
=
0
;
/* assume native ordering by default */
int
bo
=
0
;
/* assume native ordering by default */
const
char
*
errmsg
=
""
;
const
char
*
errmsg
=
""
;
...
@@ -4941,23 +4933,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4941,23 +4933,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
iorder
[
3
]
=
0
;
iorder
[
3
]
=
0
;
}
}
/* On narrow builds we split characters outside the BMP into two
codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
for
(
qq
=
q
;
qq
<
e
;
qq
+=
4
)
if
(
qq
[
iorder
[
2
]]
!=
0
||
qq
[
iorder
[
3
]]
!=
0
)
pairs
++
;
#endif
/* This might be one to much, because of a BOM */
/* This might be one to much, because of a BOM */
unicode
=
(
PyObject
*
)
_PyUnicode_New
((
size
+
3
)
/
4
+
pairs
);
unicode
=
PyUnicode_New
((
size
+
3
)
/
4
,
127
);
if
(
!
unicode
)
if
(
!
unicode
)
return
NULL
;
return
NULL
;
if
(
size
==
0
)
if
(
size
==
0
)
return
unicode
;
return
unicode
;
outpos
=
0
;
/* Unpack UTF-32 encoded data */
p
=
PyUnicode_AS_UNICODE
(
unicode
);
while
(
q
<
e
)
{
while
(
q
<
e
)
{
Py_UCS4
ch
;
Py_UCS4
ch
;
...
@@ -4982,24 +4964,16 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4982,24 +4964,16 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
endinpos
=
startinpos
+
4
;
endinpos
=
startinpos
+
4
;
goto
utf32Error
;
goto
utf32Error
;
}
}
#ifndef Py_UNICODE_WIDE
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
if
(
ch
>=
0x10000
)
goto
onError
;
{
*
p
++
=
0xD800
|
((
ch
-
0x10000
)
>>
10
);
*
p
++
=
0xDC00
|
((
ch
-
0x10000
)
&
0x3FF
);
}
else
#endif
*
p
++
=
ch
;
q
+=
4
;
q
+=
4
;
continue
;
continue
;
utf32Error:
utf32Error:
outpos
=
p
-
PyUnicode_AS_UNICODE
(
unicode
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"utf32"
,
errmsg
,
"utf32"
,
errmsg
,
&
starts
,
(
const
char
**
)
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
(
const
char
**
)
&
q
,
&
starts
,
(
const
char
**
)
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
(
const
char
**
)
&
q
,
&
unicode
,
&
outpos
,
&
p
))
&
unicode
,
&
outpos
))
goto
onError
;
goto
onError
;
}
}
...
@@ -5010,7 +4984,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -5010,7 +4984,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
*
consumed
=
(
const
char
*
)
q
-
starts
;
*
consumed
=
(
const
char
*
)
q
-
starts
;
/* Adjust length */
/* Adjust length */
if
(
PyUnicode_Resize
(
&
unicode
,
p
-
PyUnicode_AS_UNICODE
(
unicode
)
)
<
0
)
if
(
PyUnicode_Resize
(
&
unicode
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
...
@@ -5171,7 +5145,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5171,7 +5145,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
PyObject
*
unicode
;
PyObject
*
unicode
;
Py_UNICODE
*
p
;
const
unsigned
char
*
q
,
*
e
,
*
aligned_end
;
const
unsigned
char
*
q
,
*
e
,
*
aligned_end
;
int
bo
=
0
;
/* assume native ordering by default */
int
bo
=
0
;
/* assume native ordering by default */
int
native_ordering
=
0
;
int
native_ordering
=
0
;
...
@@ -5187,14 +5160,13 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5187,14 +5160,13 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Note: size will always be longer than the resulting Unicode
/* Note: size will always be longer than the resulting Unicode
character count */
character count */
unicode
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
unicode
=
PyUnicode_New
(
size
,
127
);
if
(
!
unicode
)
if
(
!
unicode
)
return
NULL
;
return
NULL
;
if
(
size
==
0
)
if
(
size
==
0
)
return
unicode
;
return
unicode
;
outpos
=
0
;
/* Unpack UTF-16 encoded data */
p
=
PyUnicode_AS_UNICODE
(
unicode
);
q
=
(
unsigned
char
*
)
s
;
q
=
(
unsigned
char
*
)
s
;
e
=
q
+
size
-
1
;
e
=
q
+
size
-
1
;
...
@@ -5254,68 +5226,51 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5254,68 +5226,51 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
if
(
!
((
size_t
)
q
&
LONG_PTR_MASK
))
{
if
(
!
((
size_t
)
q
&
LONG_PTR_MASK
))
{
/* Fast path for runs of non-surrogate chars. */
/* Fast path for runs of non-surrogate chars. */
register
const
unsigned
char
*
_q
=
q
;
register
const
unsigned
char
*
_q
=
q
;
Py_UNICODE
*
_p
=
p
;
int
kind
=
PyUnicode_KIND
(
unicode
);
if
(
native_ordering
)
{
void
*
data
=
PyUnicode_DATA
(
unicode
);
/* Native ordering is simple: as long as the input cannot
possibly contain a surrogate char, do an unrolled copy
of several 16-bit code points to the target object.
The non-surrogate check is done on several input bytes
at a time (as many as a C 'long' can contain). */
while
(
_q
<
aligned_end
)
{
while
(
_q
<
aligned_end
)
{
unsigned
long
data
=
*
(
unsigned
long
*
)
_q
;
unsigned
long
block
=
*
(
unsigned
long
*
)
_q
;
if
(
data
&
FAST_CHAR_MASK
)
unsigned
short
*
pblock
=
(
unsigned
short
*
)
&
block
;
Py_UCS4
maxch
;
if
(
native_ordering
)
{
/* Can use buffer directly */
if
(
block
&
FAST_CHAR_MASK
)
break
;
break
;
_p
[
0
]
=
((
unsigned
short
*
)
_q
)[
0
];
_p
[
1
]
=
((
unsigned
short
*
)
_q
)[
1
];
#if (SIZEOF_LONG == 8)
_p
[
2
]
=
((
unsigned
short
*
)
_q
)[
2
];
_p
[
3
]
=
((
unsigned
short
*
)
_q
)[
3
];
#endif
_q
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
/
2
;
}
}
}
else
{
else
{
/* Byteswapped ordering is similar, but we must decompose
/* Need to byte-swap */
the copy bytewise, and take care of zero'ing out the
unsigned
char
*
_p
=
(
unsigned
char
*
)
pblock
;
upper bytes if the target object is in 32-bit units
if
(
block
&
SWAPPED_FAST_CHAR_MASK
)
(that is, in UCS-4 builds). */
break
;
while
(
_q
<
aligned_end
)
{
_p
[
0
]
=
_q
[
1
];
unsigned
long
data
=
*
(
unsigned
long
*
)
_q
;
_p
[
1
]
=
_q
[
0
];
if
(
data
&
SWAPPED_FAST_CHAR_MASK
)
_p
[
2
]
=
_q
[
3
];
break
;
_p
[
3
]
=
_q
[
2
];
/* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
_p
[
0
]
=
0
;
_p
[
1
]
=
0
;
#if (SIZEOF_LONG == 8)
#if (SIZEOF_LONG == 8)
_p
[
2
]
=
0
;
_p
[
4
]
=
_q
[
5
];
_p
[
3
]
=
0
;
_p
[
5
]
=
_q
[
4
];
_p
[
6
]
=
_q
[
7
];
_p
[
7
]
=
_q
[
6
];
#endif
#endif
}
maxch
=
Py_MAX
(
pblock
[
0
],
pblock
[
1
]);
#if SIZEOF_LONG == 8
maxch
=
Py_MAX
(
maxch
,
Py_MAX
(
pblock
[
2
],
pblock
[
3
]));
#endif
#endif
/* Issue #4916; UCS-4 builds on big endian machines must
if
(
maxch
>
PyUnicode_MAX_CHAR_VALUE
(
unicode
))
{
fill the two last bytes of each 4-byte unit. */
if
(
unicode_widen
(
&
unicode
,
maxch
)
<
0
)
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
goto
onError
;
# define OFF 2
kind
=
PyUnicode_KIND
(
unicode
);
#else
data
=
PyUnicode_DATA
(
unicode
);
# define OFF 0
}
#endif
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
pblock
[
0
]);
((
unsigned
char
*
)
_p
)[
OFF
+
1
]
=
_q
[
0
];
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
pblock
[
1
]);
((
unsigned
char
*
)
_p
)[
OFF
+
0
]
=
_q
[
1
];
#if SIZEOF_LONG == 8
((
unsigned
char
*
)
_p
)[
OFF
+
1
+
Py_UNICODE_SIZE
]
=
_q
[
2
];
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
pblock
[
2
]);
((
unsigned
char
*
)
_p
)[
OFF
+
0
+
Py_UNICODE_SIZE
]
=
_q
[
3
];
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
pblock
[
3
]);
#if (SIZEOF_LONG == 8)
((
unsigned
char
*
)
_p
)[
OFF
+
1
+
2
*
Py_UNICODE_SIZE
]
=
_q
[
4
];
((
unsigned
char
*
)
_p
)[
OFF
+
0
+
2
*
Py_UNICODE_SIZE
]
=
_q
[
5
];
((
unsigned
char
*
)
_p
)[
OFF
+
1
+
3
*
Py_UNICODE_SIZE
]
=
_q
[
6
];
((
unsigned
char
*
)
_p
)[
OFF
+
0
+
3
*
Py_UNICODE_SIZE
]
=
_q
[
7
];
#endif
#endif
#undef OFF
_q
+=
SIZEOF_LONG
;
_q
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
/
2
;
}
}
}
p
=
_p
;
q
=
_q
;
q
=
_q
;
if
(
q
>=
e
)
if
(
q
>=
e
)
break
;
break
;
...
@@ -5325,7 +5280,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5325,7 +5280,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
q
+=
2
;
q
+=
2
;
if
(
ch
<
0xD800
||
ch
>
0xDFFF
)
{
if
(
ch
<
0xD800
||
ch
>
0xDFFF
)
{
*
p
++
=
ch
;
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
continue
;
continue
;
}
}
...
@@ -5340,12 +5296,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5340,12 +5296,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_UNICODE
ch2
=
(
q
[
ihi
]
<<
8
)
|
q
[
ilo
];
Py_UNICODE
ch2
=
(
q
[
ihi
]
<<
8
)
|
q
[
ilo
];
q
+=
2
;
q
+=
2
;
if
(
0xDC00
<=
ch2
&&
ch2
<=
0xDFFF
)
{
if
(
0xDC00
<=
ch2
&&
ch2
<=
0xDFFF
)
{
#ifndef Py_UNICODE_WIDE
if
(
unicode_putchar
(
&
unicode
,
&
outpos
,
*
p
++
=
ch
;
(((
ch
&
0x3FF
)
<<
10
)
|
*
p
++
=
ch2
;
(
ch2
&
0x3FF
))
+
0x10000
)
<
0
)
#else
goto
onError
;
*
p
++
=
(((
ch
&
0x3FF
)
<<
10
)
|
(
ch2
&
0x3FF
))
+
0x10000
;
#endif
continue
;
continue
;
}
}
else
{
else
{
...
@@ -5362,7 +5316,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5362,7 +5316,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Fall through to report the error */
/* Fall through to report the error */
utf16Error:
utf16Error:
outpos
=
p
-
PyUnicode_AS_UNICODE
(
unicode
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
errors
,
&
errorHandler
,
&
errorHandler
,
...
@@ -5374,8 +5327,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5374,8 +5327,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&
exc
,
&
exc
,
(
const
char
**
)
&
q
,
(
const
char
**
)
&
q
,
&
unicode
,
&
unicode
,
&
outpos
,
&
outpos
))
&
p
))
goto
onError
;
goto
onError
;
}
}
/* remaining byte at the end? (size should be even) */
/* remaining byte at the end? (size should be even) */
...
@@ -5384,7 +5336,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5384,7 +5336,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
errmsg
=
"truncated data"
;
errmsg
=
"truncated data"
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
startinpos
=
((
const
char
*
)
q
)
-
starts
;
endinpos
=
((
const
char
*
)
e
)
+
1
-
starts
;
endinpos
=
((
const
char
*
)
e
)
+
1
-
starts
;
outpos
=
p
-
PyUnicode_AS_UNICODE
(
unicode
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
errors
,
&
errorHandler
,
&
errorHandler
,
...
@@ -5396,8 +5347,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5396,8 +5347,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&
exc
,
&
exc
,
(
const
char
**
)
&
q
,
(
const
char
**
)
&
q
,
&
unicode
,
&
unicode
,
&
outpos
,
&
outpos
))
&
p
))
goto
onError
;
goto
onError
;
/* The remaining input chars are ignored if the callback
/* The remaining input chars are ignored if the callback
chooses to skip the input */
chooses to skip the input */
...
@@ -5411,17 +5361,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
...
@@ -5411,17 +5361,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
*
consumed
=
(
const
char
*
)
q
-
starts
;
*
consumed
=
(
const
char
*
)
q
-
starts
;
/* Adjust length */
/* Adjust length */
if
(
PyUnicode_Resize
(
&
unicode
,
p
-
PyUnicode_AS_UNICODE
(
unicode
)
)
<
0
)
if
(
PyUnicode_Resize
(
&
unicode
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
unicode
))
{
Py_DECREF
(
unicode
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
unicode
,
1
));
return
unicode
;
return
unicode
;
...
@@ -5613,31 +5557,26 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5613,31 +5557,26 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
int
j
;
int
j
;
PyObject
*
v
;
PyObject
*
v
;
Py_UNICODE
*
p
;
const
char
*
end
;
const
char
*
end
;
char
*
message
;
char
*
message
;
Py_UCS4
chr
=
0xffffffff
;
/* in case 'getcode' messes up */
Py_UCS4
chr
=
0xffffffff
;
/* in case 'getcode' messes up */
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
Py_ssize_t
ascii_length
;
Py_ssize_t
len
;
Py_ssize_t
i
;
Py_ssize_t
i
;
int
kind
;
void
*
data
;
ascii_length
=
length_of_escaped_ascii_string
(
s
,
size
);
len
=
length_of_escaped_ascii_string
(
s
,
size
);
/* After length_of_escaped_ascii_string() there are two alternatives,
/* After length_of_escaped_ascii_string() there are two alternatives,
either the string is pure ASCII with named escapes like \n, etc.
either the string is pure ASCII with named escapes like \n, etc.
and we determined it's exact size (common case)
and we determined it's exact size (common case)
or it contains \x, \u, ... escape sequences. then we create a
or it contains \x, \u, ... escape sequences. then we create a
legacy wchar string and resize it at the end of this function. */
legacy wchar string and resize it at the end of this function. */
if
(
ascii_length
>=
0
)
{
if
(
len
>=
0
)
{
v
=
PyUnicode_New
(
ascii_length
,
127
);
v
=
PyUnicode_New
(
len
,
127
);
if
(
!
v
)
if
(
!
v
)
goto
onError
;
goto
onError
;
assert
(
PyUnicode_KIND
(
v
)
==
PyUnicode_1BYTE_KIND
);
assert
(
PyUnicode_KIND
(
v
)
==
PyUnicode_1BYTE_KIND
);
kind
=
PyUnicode_1BYTE_KIND
;
data
=
PyUnicode_DATA
(
v
);
}
}
else
{
else
{
/* Escaped strings will always be longer than the resulting
/* Escaped strings will always be longer than the resulting
...
@@ -5645,11 +5584,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5645,11 +5584,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
length after conversion to the true value.
length after conversion to the true value.
(but if the error callback returns a long replacement string
(but if the error callback returns a long replacement string
we'll have to allocate more space) */
we'll have to allocate more space) */
v
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
v
=
PyUnicode_New
(
size
,
127
);
if
(
!
v
)
if
(
!
v
)
goto
onError
;
goto
onError
;
kind
=
PyUnicode_WCHAR_KIND
;
len
=
size
;
data
=
PyUnicode_AS_UNICODE
(
v
);
}
}
if
(
size
==
0
)
if
(
size
==
0
)
...
@@ -5662,18 +5600,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5662,18 +5600,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
Py_UNICODE
x
;
Py_UNICODE
x
;
int
digits
;
int
digits
;
if
(
kind
==
PyUnicode_WCHAR_KIND
)
{
assert
(
i
<
_PyUnicode_WSTR_LENGTH
(
v
));
}
else
{
/* The only case in which i == ascii_length is a backslash
/* The only case in which i == ascii_length is a backslash
followed by a newline. */
followed by a newline. */
assert
(
i
<=
ascii_length
);
assert
(
i
<=
len
);
}
/* Non-escape characters are interpreted as Unicode ordinals */
/* Non-escape characters are interpreted as Unicode ordinals */
if
(
*
s
!=
'\\'
)
{
if
(
*
s
!=
'\\'
)
{
WRITE_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
(
unsigned
char
)
*
s
++
);
if
(
unicode_putchar
(
&
v
,
&
i
,
(
unsigned
char
)
*
s
++
)
<
0
)
goto
onError
;
continue
;
continue
;
}
}
...
@@ -5684,32 +5618,33 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5684,32 +5618,33 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
if
(
s
>
end
)
if
(
s
>
end
)
c
=
'\0'
;
/* Invalid after \ */
c
=
'\0'
;
/* Invalid after \ */
if
(
kind
==
PyUnicode_WCHAR_KIND
)
{
assert
(
i
<
_PyUnicode_WSTR_LENGTH
(
v
));
}
else
{
/* The only case in which i == ascii_length is a backslash
/* The only case in which i == ascii_length is a backslash
followed by a newline. */
followed by a newline. */
assert
(
i
<
ascii_length
||
(
i
==
ascii_length
&&
c
==
'\n'
));
assert
(
i
<
len
||
(
i
==
len
&&
c
==
'\n'
));
}
switch
(
c
)
{
switch
(
c
)
{
/* \x escapes */
/* \x escapes */
#define WRITECHAR(ch) \
do { \
if (unicode_putchar(&v, &i, ch) < 0) \
goto onError; \
}while(0)
case
'\n'
:
break
;
case
'\n'
:
break
;
case
'\\'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\\'
);
break
;
case
'\\'
:
WRITE
CHAR
(
'\\'
);
break
;
case
'\''
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\''
);
break
;
case
'\''
:
WRITE
CHAR
(
'\''
);
break
;
case
'\"'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\"'
);
break
;
case
'\"'
:
WRITE
CHAR
(
'\"'
);
break
;
case
'b'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\b'
);
break
;
case
'b'
:
WRITE
CHAR
(
'\b'
);
break
;
/* FF */
/* FF */
case
'f'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\014'
);
break
;
case
'f'
:
WRITE
CHAR
(
'\014'
);
break
;
case
't'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\t'
);
break
;
case
't'
:
WRITE
CHAR
(
'\t'
);
break
;
case
'n'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\n'
);
break
;
case
'n'
:
WRITE
CHAR
(
'\n'
);
break
;
case
'r'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\r'
);
break
;
case
'r'
:
WRITE
CHAR
(
'\r'
);
break
;
/* VT */
/* VT */
case
'v'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\013'
);
break
;
case
'v'
:
WRITE
CHAR
(
'\013'
);
break
;
/* BEL, not classic C */
/* BEL, not classic C */
case
'a'
:
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\007'
);
break
;
case
'a'
:
WRITE
CHAR
(
'\007'
);
break
;
/* \OOO (octal) escapes */
/* \OOO (octal) escapes */
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
case
'0'
:
case
'1'
:
case
'2'
:
case
'3'
:
...
@@ -5720,7 +5655,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5720,7 +5655,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
if
(
s
<
end
&&
'0'
<=
*
s
&&
*
s
<=
'7'
)
if
(
s
<
end
&&
'0'
<=
*
s
&&
*
s
<=
'7'
)
x
=
(
x
<<
3
)
+
*
s
++
-
'0'
;
x
=
(
x
<<
3
)
+
*
s
++
-
'0'
;
}
}
WRITE
_WSTR
(
data
,
i
++
,
x
);
WRITE
CHAR
(
x
);
break
;
break
;
/* hex escapes */
/* hex escapes */
...
@@ -5742,30 +5677,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5742,30 +5677,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
message
=
"truncated
\\
UXXXXXXXX escape"
;
message
=
"truncated
\\
UXXXXXXXX escape"
;
hexescape:
hexescape:
chr
=
0
;
chr
=
0
;
p
=
PyUnicode_AS_UNICODE
(
v
)
+
i
;
if
(
s
+
digits
>
end
)
{
if
(
s
+
digits
>
end
)
{
endinpos
=
size
;
endinpos
=
size
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicodeescape"
,
"end of string in escape sequence"
,
"unicodeescape"
,
"end of string in escape sequence"
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
i
,
&
p
))
&
v
,
&
i
))
goto
onError
;
goto
onError
;
data
=
PyUnicode_AS_UNICODE
(
v
);
goto
nextByte
;
goto
nextByte
;
}
}
for
(
j
=
0
;
j
<
digits
;
++
j
)
{
for
(
j
=
0
;
j
<
digits
;
++
j
)
{
c
=
(
unsigned
char
)
s
[
j
];
c
=
(
unsigned
char
)
s
[
j
];
if
(
!
Py_ISXDIGIT
(
c
))
{
if
(
!
Py_ISXDIGIT
(
c
))
{
endinpos
=
(
s
+
j
+
1
)
-
starts
;
endinpos
=
(
s
+
j
+
1
)
-
starts
;
p
=
PyUnicode_AS_UNICODE
(
v
)
+
i
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicodeescape"
,
message
,
"unicodeescape"
,
message
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
i
,
&
p
))
&
v
,
&
i
))
goto
onError
;
goto
onError
;
data
=
PyUnicode_AS_UNICODE
(
v
);
len
=
PyUnicode_GET_LENGTH
(
v
);
goto
nextByte
;
goto
nextByte
;
}
}
chr
=
(
chr
<<
4
)
&
~
0xF
;
chr
=
(
chr
<<
4
)
&
~
0xF
;
...
@@ -5783,29 +5715,16 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5783,29 +5715,16 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
break
;
break
;
store:
store:
/* when we get here, chr is a 32-bit unicode character */
/* when we get here, chr is a 32-bit unicode character */
if
(
chr
<=
0xffff
)
if
(
chr
<=
0x10ffff
)
{
/* UCS-2 character */
WRITECHAR
(
chr
);
WRITE_WSTR
(
data
,
i
++
,
chr
);
else
if
(
chr
<=
0x10ffff
)
{
/* UCS-4 character. Either store directly, or as
surrogate pair. */
#ifdef Py_UNICODE_WIDE
WRITE_WSTR
(
data
,
i
++
,
chr
);
#else
chr
-=
0x10000L
;
WRITE_WSTR
(
data
,
i
++
,
0xD800
+
(
Py_UNICODE
)
(
chr
>>
10
));
WRITE_WSTR
(
data
,
i
++
,
0xDC00
+
(
Py_UNICODE
)
(
chr
&
0x03FF
));
#endif
}
else
{
}
else
{
endinpos
=
s
-
starts
;
endinpos
=
s
-
starts
;
p
=
PyUnicode_AS_UNICODE
(
v
)
+
i
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicodeescape"
,
"illegal Unicode character"
,
"unicodeescape"
,
"illegal Unicode character"
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
i
,
&
p
))
&
v
,
&
i
))
goto
onError
;
goto
onError
;
data
=
PyUnicode_AS_UNICODE
(
v
);
}
}
break
;
break
;
...
@@ -5834,48 +5753,39 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
...
@@ -5834,48 +5753,39 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
}
}
}
}
endinpos
=
s
-
starts
;
endinpos
=
s
-
starts
;
p
=
PyUnicode_AS_UNICODE
(
v
)
+
i
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicodeescape"
,
message
,
"unicodeescape"
,
message
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
i
,
&
p
))
&
v
,
&
i
))
goto
onError
;
goto
onError
;
data
=
PyUnicode_AS_UNICODE
(
v
);
break
;
break
;
default:
default:
if
(
s
>
end
)
{
if
(
s
>
end
)
{
assert
(
kind
==
PyUnicode_WCHAR_KIND
);
message
=
"
\\
at end of string"
;
message
=
"
\\
at end of string"
;
s
--
;
s
--
;
endinpos
=
s
-
starts
;
endinpos
=
s
-
starts
;
p
=
PyUnicode_AS_UNICODE
(
v
)
+
i
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicodeescape"
,
message
,
"unicodeescape"
,
message
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
i
,
&
p
))
&
v
,
&
i
))
goto
onError
;
goto
onError
;
data
=
PyUnicode_AS_UNICODE
(
v
);
}
}
else
{
else
{
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
'\\'
);
WRITE
CHAR
(
'\\'
);
WRITE
_ASCII_OR_WSTR
(
kind
,
data
,
i
++
,
(
unsigned
char
)
s
[
-
1
]);
WRITE
CHAR
(
s
[
-
1
]);
}
}
break
;
break
;
}
}
nextByte:
nextByte:
;
;
}
}
/* Ensure the length prediction worked in case of ASCII strings */
#undef WRITECHAR
assert
(
kind
==
PyUnicode_WCHAR_KIND
||
i
==
ascii_length
);
if
(
kind
==
PyUnicode_WCHAR_KIND
)
{
if
(
PyUnicode_Resize
(
&
v
,
i
)
<
0
)
if
(
PyUnicode_Resize
(
&
v
,
i
)
<
0
)
goto
onError
;
goto
onError
;
}
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
#ifndef DONT_MAKE_RESULT_READY
...
@@ -6081,7 +5991,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6081,7 +5991,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
PyObject
*
v
;
PyObject
*
v
;
Py_UNICODE
*
p
;
const
char
*
end
;
const
char
*
end
;
const
char
*
bs
;
const
char
*
bs
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
...
@@ -6091,12 +6000,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6091,12 +6000,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Unicode string, so we start with size here and then reduce the
Unicode string, so we start with size here and then reduce the
length after conversion to the true value. (But decoding error
length after conversion to the true value. (But decoding error
handler might have to resize the string) */
handler might have to resize the string) */
v
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
v
=
PyUnicode_New
(
size
,
127
);
if
(
v
==
NULL
)
if
(
v
==
NULL
)
goto
onError
;
goto
onError
;
if
(
size
==
0
)
if
(
size
==
0
)
return
v
;
return
v
;
p
=
PyUnicode_AS_UNICODE
(
v
)
;
outpos
=
0
;
end
=
s
+
size
;
end
=
s
+
size
;
while
(
s
<
end
)
{
while
(
s
<
end
)
{
unsigned
char
c
;
unsigned
char
c
;
...
@@ -6106,7 +6015,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6106,7 +6015,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
/* Non-escape characters are interpreted as Unicode ordinals */
/* Non-escape characters are interpreted as Unicode ordinals */
if
(
*
s
!=
'\\'
)
{
if
(
*
s
!=
'\\'
)
{
*
p
++
=
(
unsigned
char
)
*
s
++
;
if
(
unicode_putchar
(
&
v
,
&
outpos
,
(
unsigned
char
)
*
s
++
)
<
0
)
goto
onError
;
continue
;
continue
;
}
}
startinpos
=
s
-
starts
;
startinpos
=
s
-
starts
;
...
@@ -6117,19 +6027,19 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6117,19 +6027,19 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
for
(;
s
<
end
;)
{
for
(;
s
<
end
;)
{
if
(
*
s
!=
'\\'
)
if
(
*
s
!=
'\\'
)
break
;
break
;
*
p
++
=
(
unsigned
char
)
*
s
++
;
if
(
unicode_putchar
(
&
v
,
&
outpos
,
(
unsigned
char
)
*
s
++
)
<
0
)
goto
onError
;
}
}
if
(((
s
-
bs
)
&
1
)
==
0
||
if
(((
s
-
bs
)
&
1
)
==
0
||
s
>=
end
||
s
>=
end
||
(
*
s
!=
'u'
&&
*
s
!=
'U'
))
{
(
*
s
!=
'u'
&&
*
s
!=
'U'
))
{
continue
;
continue
;
}
}
p
--
;
outpos
--
;
count
=
*
s
==
'u'
?
4
:
8
;
count
=
*
s
==
'u'
?
4
:
8
;
s
++
;
s
++
;
/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
outpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
for
(
x
=
0
,
i
=
0
;
i
<
count
;
++
i
,
++
s
)
{
for
(
x
=
0
,
i
=
0
;
i
<
count
;
++
i
,
++
s
)
{
c
=
(
unsigned
char
)
*
s
;
c
=
(
unsigned
char
)
*
s
;
if
(
!
Py_ISXDIGIT
(
c
))
{
if
(
!
Py_ISXDIGIT
(
c
))
{
...
@@ -6138,7 +6048,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6138,7 +6048,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"rawunicodeescape"
,
"truncated
\\
uXXXX"
,
"rawunicodeescape"
,
"truncated
\\
uXXXX"
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
p
))
&
v
,
&
outpos
))
goto
onError
;
goto
onError
;
goto
nextByte
;
goto
nextByte
;
}
}
...
@@ -6150,42 +6060,25 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
...
@@ -6150,42 +6060,25 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
else
else
x
+=
10
+
c
-
'A'
;
x
+=
10
+
c
-
'A'
;
}
}
if
(
x
<=
0xffff
)
if
(
x
<=
0x10ffff
)
{
/* UCS-2 character */
if
(
unicode_putchar
(
&
v
,
&
outpos
,
x
)
<
0
)
*
p
++
=
(
Py_UNICODE
)
x
;
goto
onError
;
else
if
(
x
<=
0x10ffff
)
{
/* UCS-4 character. Either store directly, or as
surrogate pair. */
#ifdef Py_UNICODE_WIDE
*
p
++
=
(
Py_UNICODE
)
x
;
#else
x
-=
0x10000L
;
*
p
++
=
0xD800
+
(
Py_UNICODE
)
(
x
>>
10
);
*
p
++
=
0xDC00
+
(
Py_UNICODE
)
(
x
&
0x03FF
);
#endif
}
else
{
}
else
{
endinpos
=
s
-
starts
;
endinpos
=
s
-
starts
;
outpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"rawunicodeescape"
,
"
\\
Uxxxxxxxx out of range"
,
"rawunicodeescape"
,
"
\\
Uxxxxxxxx out of range"
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
p
))
&
v
,
&
outpos
))
goto
onError
;
goto
onError
;
}
}
nextByte:
nextByte:
;
;
}
}
if
(
PyUnicode_Resize
(
&
v
,
p
-
PyUnicode_AS_UNICODE
(
v
)
)
<
0
)
if
(
PyUnicode_Resize
(
&
v
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
v
))
{
Py_DECREF
(
v
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
return
v
;
return
v
;
...
@@ -6311,34 +6204,27 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
...
@@ -6311,34 +6204,27 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
PyObject
*
v
;
PyObject
*
v
;
Py_UNICODE
*
p
;
const
char
*
end
;
const
char
*
end
;
const
char
*
reason
;
const
char
*
reason
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
#ifdef Py_UNICODE_WIDE
Py_UNICODE
unimax
=
PyUnicode_GetMax
();
#endif
/* XXX overflow detection missing */
/* XXX overflow detection missing */
v
=
(
PyObject
*
)
_PyUnicode_New
((
size
+
Py_UNICODE_SIZE
-
1
)
/
Py_UNICODE_SIZE
);
v
=
PyUnicode_New
((
size
+
Py_UNICODE_SIZE
-
1
)
/
Py_UNICODE_SIZE
,
127
);
if
(
v
==
NULL
)
if
(
v
==
NULL
)
goto
onError
;
goto
onError
;
/* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
if
(
PyUnicode_GET_LENGTH
(
v
)
==
0
)
as string was created with the old API. */
if
(
PyUnicode_GET_SIZE
(
v
)
==
0
)
return
v
;
return
v
;
p
=
PyUnicode_AS_UNICODE
(
v
)
;
outpos
=
0
;
end
=
s
+
size
;
end
=
s
+
size
;
while
(
s
<
end
)
{
while
(
s
<
end
)
{
memcpy
(
p
,
s
,
sizeof
(
Py_UNICODE
))
;
Py_UCS4
ch
=
*
(
Py_UNICODE
*
)
s
;
/* We have to sanity check the raw data, otherwise doom looms for
/* We have to sanity check the raw data, otherwise doom looms for
some malformed UCS-4 data. */
some malformed UCS-4 data. */
if
(
if
(
#ifdef Py_UNICODE_WIDE
#ifdef Py_UNICODE_WIDE
*
p
>
unimax
||
*
p
<
0
||
ch
>
0x10ffff
||
#endif
#endif
end
-
s
<
Py_UNICODE_SIZE
end
-
s
<
Py_UNICODE_SIZE
)
)
...
@@ -6352,31 +6238,25 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
...
@@ -6352,31 +6238,25 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
endinpos
=
s
-
starts
+
Py_UNICODE_SIZE
;
endinpos
=
s
-
starts
+
Py_UNICODE_SIZE
;
reason
=
"illegal code point (> 0x10FFFF)"
;
reason
=
"illegal code point (> 0x10FFFF)"
;
}
}
outpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"unicode_internal"
,
reason
,
"unicode_internal"
,
reason
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
end
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
p
))
{
&
v
,
&
outpos
))
{
goto
onError
;
goto
onError
;
}
}
}
}
else
{
else
{
p
++
;
if
(
unicode_putchar
(
&
v
,
&
outpos
,
ch
)
<
0
)
goto
onError
;
s
+=
Py_UNICODE_SIZE
;
s
+=
Py_UNICODE_SIZE
;
}
}
}
}
if
(
PyUnicode_Resize
(
&
v
,
p
-
PyUnicode_AS_UNICODE
(
v
)
)
<
0
)
if
(
PyUnicode_Resize
(
&
v
,
outpos
)
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
v
))
{
Py_DECREF
(
v
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
return
v
;
return
v
;
...
@@ -6749,7 +6629,8 @@ PyUnicode_DecodeASCII(const char *s,
...
@@ -6749,7 +6629,8 @@ PyUnicode_DecodeASCII(const char *s,
{
{
const
char
*
starts
=
s
;
const
char
*
starts
=
s
;
PyObject
*
v
;
PyObject
*
v
;
Py_UNICODE
*
u
;
int
kind
;
void
*
data
;
Py_ssize_t
startinpos
;
Py_ssize_t
startinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
endinpos
;
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
...
@@ -6797,42 +6678,38 @@ PyUnicode_DecodeASCII(const char *s,
...
@@ -6797,42 +6678,38 @@ PyUnicode_DecodeASCII(const char *s,
if
(
!
has_error
)
if
(
!
has_error
)
return
unicode_fromascii
((
const
unsigned
char
*
)
s
,
size
);
return
unicode_fromascii
((
const
unsigned
char
*
)
s
,
size
);
v
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
v
=
PyUnicode_New
(
size
,
127
);
if
(
v
==
NULL
)
if
(
v
==
NULL
)
goto
onError
;
goto
onError
;
if
(
size
==
0
)
if
(
size
==
0
)
return
v
;
return
v
;
u
=
PyUnicode_AS_UNICODE
(
v
);
kind
=
PyUnicode_KIND
(
v
);
data
=
PyUnicode_DATA
(
v
);
outpos
=
0
;
e
=
s
+
size
;
e
=
s
+
size
;
while
(
s
<
e
)
{
while
(
s
<
e
)
{
register
unsigned
char
c
=
(
unsigned
char
)
*
s
;
register
unsigned
char
c
=
(
unsigned
char
)
*
s
;
if
(
c
<
128
)
{
if
(
c
<
128
)
{
*
u
++
=
c
;
PyUnicode_WRITE
(
kind
,
data
,
outpos
++
,
c
)
;
++
s
;
++
s
;
}
}
else
{
else
{
startinpos
=
s
-
starts
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
endinpos
=
startinpos
+
1
;
outpos
=
u
-
(
Py_UNICODE
*
)
PyUnicode_AS_UNICODE
(
v
);
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"ascii"
,
"ordinal not in range(128)"
,
"ascii"
,
"ordinal not in range(128)"
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
u
))
&
v
,
&
outpos
))
goto
onError
;
goto
onError
;
kind
=
PyUnicode_KIND
(
v
);
data
=
PyUnicode_DATA
(
v
);
}
}
}
}
if
(
u
-
PyUnicode_AS_UNICODE
(
v
)
<
PyUnicode_GET_SIZE
(
v
))
if
(
PyUnicode_Resize
(
&
v
,
outpos
)
<
0
)
if
(
PyUnicode_Resize
(
&
v
,
u
-
PyUnicode_AS_UNICODE
(
v
))
<
0
)
goto
onError
;
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
v
))
{
Py_DECREF
(
v
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
return
v
;
return
v
;
...
@@ -7648,7 +7525,6 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7648,7 +7525,6 @@ PyUnicode_DecodeCharmap(const char *s,
Py_ssize_t
outpos
;
Py_ssize_t
outpos
;
const
char
*
e
;
const
char
*
e
;
PyObject
*
v
;
PyObject
*
v
;
Py_UNICODE
*
p
;
Py_ssize_t
extrachars
=
0
;
Py_ssize_t
extrachars
=
0
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
errorHandler
=
NULL
;
PyObject
*
exc
=
NULL
;
PyObject
*
exc
=
NULL
;
...
@@ -7659,12 +7535,12 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7659,12 +7535,12 @@ PyUnicode_DecodeCharmap(const char *s,
if
(
mapping
==
NULL
)
if
(
mapping
==
NULL
)
return
PyUnicode_DecodeLatin1
(
s
,
size
,
errors
);
return
PyUnicode_DecodeLatin1
(
s
,
size
,
errors
);
v
=
(
PyObject
*
)
_PyUnicode_New
(
size
);
v
=
PyUnicode_New
(
size
,
127
);
if
(
v
==
NULL
)
if
(
v
==
NULL
)
goto
onError
;
goto
onError
;
if
(
size
==
0
)
if
(
size
==
0
)
return
v
;
return
v
;
p
=
PyUnicode_AS_UNICODE
(
v
)
;
outpos
=
0
;
e
=
s
+
size
;
e
=
s
+
size
;
if
(
PyUnicode_CheckExact
(
mapping
))
{
if
(
PyUnicode_CheckExact
(
mapping
))
{
mapstring
=
PyUnicode_AS_UNICODE
(
mapping
);
mapstring
=
PyUnicode_AS_UNICODE
(
mapping
);
...
@@ -7678,19 +7554,19 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7678,19 +7554,19 @@ PyUnicode_DecodeCharmap(const char *s,
if
(
x
==
0xfffe
)
{
if
(
x
==
0xfffe
)
{
/* undefined mapping */
/* undefined mapping */
outpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
startinpos
=
s
-
starts
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
endinpos
=
startinpos
+
1
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"charmap"
,
"character maps to <undefined>"
,
"charmap"
,
"character maps to <undefined>"
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
p
))
{
&
v
,
&
outpos
))
{
goto
onError
;
goto
onError
;
}
}
continue
;
continue
;
}
}
*
p
++
=
x
;
if
(
unicode_putchar
(
&
v
,
&
outpos
,
x
)
<
0
)
goto
onError
;
++
s
;
++
s
;
}
}
}
}
...
@@ -7724,18 +7600,18 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7724,18 +7600,18 @@ PyUnicode_DecodeCharmap(const char *s,
Py_DECREF
(
x
);
Py_DECREF
(
x
);
goto
onError
;
goto
onError
;
}
}
*
p
++
=
(
Py_UNICODE
)
value
;
if
(
unicode_putchar
(
&
v
,
&
outpos
,
value
)
<
0
)
goto
onError
;
}
}
else
if
(
x
==
Py_None
)
{
else
if
(
x
==
Py_None
)
{
/* undefined mapping */
/* undefined mapping */
outpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
startinpos
=
s
-
starts
;
startinpos
=
s
-
starts
;
endinpos
=
startinpos
+
1
;
endinpos
=
startinpos
+
1
;
if
(
unicode_decode_call_errorhandler
(
if
(
unicode_decode_call_errorhandler
(
errors
,
&
errorHandler
,
errors
,
&
errorHandler
,
"charmap"
,
"character maps to <undefined>"
,
"charmap"
,
"character maps to <undefined>"
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
starts
,
&
e
,
&
startinpos
,
&
endinpos
,
&
exc
,
&
s
,
&
v
,
&
outpos
,
&
p
))
{
&
v
,
&
outpos
))
{
Py_DECREF
(
x
);
Py_DECREF
(
x
);
goto
onError
;
goto
onError
;
}
}
...
@@ -7743,32 +7619,36 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7743,32 +7619,36 @@ PyUnicode_DecodeCharmap(const char *s,
continue
;
continue
;
}
}
else
if
(
PyUnicode_Check
(
x
))
{
else
if
(
PyUnicode_Check
(
x
))
{
Py_ssize_t
targetsize
=
PyUnicode_GET_SIZE
(
x
)
;
Py_ssize_t
targetsize
;
if
(
targetsize
==
1
)
if
(
PyUnicode_READY
(
x
)
<
0
)
/* 1-1 mapping */
goto
onError
;
*
p
++
=
*
PyUnicode_AS_UNICODE
(
x
);
targetsize
=
PyUnicode_GET_LENGTH
(
x
);
if
(
targetsize
==
1
)
{
/* 1-1 mapping */
if
(
unicode_putchar
(
&
v
,
&
outpos
,
PyUnicode_READ_CHAR
(
x
,
0
))
<
0
)
goto
onError
;
}
else
if
(
targetsize
>
1
)
{
else
if
(
targetsize
>
1
)
{
/* 1-n mapping */
/* 1-n mapping */
if
(
targetsize
>
extrachars
)
{
if
(
targetsize
>
extrachars
)
{
/* resize first */
/* resize first */
Py_ssize_t
oldpos
=
p
-
PyUnicode_AS_UNICODE
(
v
);
Py_ssize_t
needed
=
(
targetsize
-
extrachars
)
+
\
Py_ssize_t
needed
=
(
targetsize
-
extrachars
)
+
\
(
targetsize
<<
2
);
(
targetsize
<<
2
);
extrachars
+=
needed
;
extrachars
+=
needed
;
/* XXX overflow detection missing */
/* XXX overflow detection missing */
if
(
PyUnicode_Resize
(
&
v
,
if
(
PyUnicode_Resize
(
&
v
,
PyUnicode_GET_
SIZE
(
v
)
+
needed
)
<
0
)
{
PyUnicode_GET_
LENGTH
(
v
)
+
needed
)
<
0
)
{
Py_DECREF
(
x
);
Py_DECREF
(
x
);
goto
onError
;
goto
onError
;
}
}
p
=
PyUnicode_AS_UNICODE
(
v
)
+
oldpos
;
}
}
Py_UNICODE_COPY
(
p
,
if
(
unicode_widen
(
&
v
,
PyUnicode_MAX_CHAR_VALUE
(
x
))
<
0
)
PyUnicode_AS_UNICODE
(
x
),
goto
onError
;
targetsize
);
PyUnicode_CopyCharacters
(
v
,
outpos
,
x
,
0
,
targetsize
);
p
+=
targetsize
;
outpos
+=
targetsize
;
extrachars
-=
targetsize
;
extrachars
-=
targetsize
;
}
}
/* 1-0 mapping: skip the character */
/* 1-0 mapping: skip the character */
...
@@ -7784,17 +7664,9 @@ PyUnicode_DecodeCharmap(const char *s,
...
@@ -7784,17 +7664,9 @@ PyUnicode_DecodeCharmap(const char *s,
++
s
;
++
s
;
}
}
}
}
if
(
p
-
PyUnicode_AS_UNICODE
(
v
)
<
PyUnicode_GET_SIZE
(
v
))
if
(
PyUnicode_Resize
(
&
v
,
outpos
)
<
0
)
if
(
PyUnicode_Resize
(
&
v
,
p
-
PyUnicode_AS_UNICODE
(
v
))
<
0
)
goto
onError
;
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
errorHandler
);
Py_XDECREF
(
exc
);
Py_XDECREF
(
exc
);
#ifndef DONT_MAKE_RESULT_READY
if
(
_PyUnicode_READY_REPLACE
(
&
v
))
{
Py_DECREF
(
v
);
return
NULL
;
}
#endif
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
assert
(
_PyUnicode_CheckConsistency
(
v
,
1
));
return
v
;
return
v
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment