Commit 18970d35, authored by scoder, committed by GitHub

Optimise float parsing from Unicode strings with non-ASCII spaces (GH-4084)

* Reject invalid underscore placements in float parser.
* Add a proper nan/inf float parser to prevent underscore-mixes like "in_f" from passing through.
parent 747cd2fb
...@@ -664,9 +664,130 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj);/*proto*/ ...@@ -664,9 +664,130 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj);/*proto*/
/////////////// pyunicode_as_double.proto /////////////// /////////////// pyunicode_as_double.proto ///////////////
//@requires: pybytes_as_double //@requires: pybytes_as_double
#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY
// Copy the characters data[start:end] into the byte buffer "buffer" for float parsing,
// dropping PEP-515 underscores on the way.
//
// "end" is exclusive: the caller passes "start + length" and provides room in "buffer"
// for at most "length" characters plus the terminating NUL byte.
//
// Returns a pointer to the NUL byte that terminates the copied text, or NULL if the text
// contains a non-ASCII character or a misplaced '_' / '.' / 'e' / 'E' (two of them in a row,
// or one of them as first or last character).  No exception is set on failure.
// The check is deliberately conservative: it also rejects some valid spellings such as ".5"
// or "1.e5", so a NULL result means "not handled here", not "invalid number".
static const char* __Pyx__PyUnicode_AsDouble_Copy(const void* data, const int kind, char* buffer, Py_ssize_t start, Py_ssize_t end) {
    // number must not start with punctuation
    int last_was_punctuation = 1;
    Py_ssize_t i;
    // "end" is exclusive, so stop before it: reading index "end" would copy one character
    // too many and write past the end of a buffer that was sized for "end - start" characters.
    for (i=start; i < end; i++) {
        Py_UCS4 chr = PyUnicode_READ(kind, data, i);
        // same punctuation set as in __Pyx__PyBytes_AsDouble_Copy()
        int is_punctuation = (chr == '_') | (chr == '.') | (chr == 'e') | (chr == 'E');
        // only plain ASCII can be narrowed to char without losing information
        if (unlikely(chr > 127)) goto parse_failure;
        // reject sequences of '_', '.', 'e' and 'E'
        if (unlikely(last_was_punctuation & is_punctuation)) goto parse_failure;
        last_was_punctuation = is_punctuation;
        // Always store the character, but only advance for non-underscores,
        // so that an underscore gets overwritten by whatever follows it.
        *buffer = (char)chr;
        buffer += (chr != '_');
    }
    // number must not end with punctuation
    if (unlikely(last_was_punctuation)) goto parse_failure;
    *buffer = '\0';
    return buffer;

parse_failure:
    return NULL;
}
// Recognise the special float spellings "nan", "inf" and "infinity" (ASCII letters in any
// case, optionally preceded by '+' or '-') in the characters data[start:start+length].
//
// Returns
//   +/-NaN or +/-infinity  if the text is exactly one of these spellings,
//   0.0                    if the text (after the sign) starts with a digit or '.',
//   -1.0                   for anything else.
// "nan" is only recognised when Py_NAN is defined.
static double __Pyx__PyUnicode_AsDouble_inf_nan(const void* data, int kind, Py_ssize_t start, Py_ssize_t length) {
    // lower case spelling that the text gets compared against
    const char *expected;
    double magnitude;
    Py_ssize_t offset;
    const Py_UCS4 sign = PyUnicode_READ(kind, data, start);

    // skip over an explicit sign
    if ((sign == '-') | (sign == '+')) {
        start++;
        length--;
    }

    switch (PyUnicode_READ(kind, data, start)) {
        #ifdef Py_NAN
        case 'n':
        case 'N':
            if (unlikely(length != 3)) return -1.0;
            expected = "nan";
            magnitude = Py_NAN;
            break;
        #endif
        case 'i':
        case 'I':
            // "inf" is a prefix of "infinity", so one pattern serves both accepted lengths
            if (unlikely(length != 3 && length != 8)) return -1.0;
            expected = "infinity";
            magnitude = Py_HUGE_VAL;
            break;
        case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            // not a special value
            return 0.0;
        default:
            return -1.0;
    }

    // The first letter was already matched by the switch, compare the remaining ones.
    for (offset = 1; offset < length; offset++) {
        const Py_UCS4 chr = PyUnicode_READ(kind, data, start + offset);
        const Py_UCS4 lower = (Py_UCS4) expected[offset];
        // accept the lower case letter and its ASCII upper case counterpart
        if (unlikely(chr != lower && chr != lower - ('a' - 'A'))) return -1.0;
    }
    return (sign == '-') ? -magnitude : magnitude;
}
// Convert the Unicode string "obj" to a C double, ignoring surrounding (Unicode) whitespace.
//
// The stripped text is first checked for "nan" / "inf" / "infinity".  Otherwise it is copied
// into an ASCII char buffer (dropping underscores) and parsed with PyOS_string_to_double().
// Whenever the fast path cannot handle the input (empty text, unexpected characters,
// failed allocation, text that was not consumed completely), the conversion is delegated
// to __Pyx_SlowPyString_AsDouble().
//
// Returns the parsed value.  If PyOS_string_to_double() returned -1 with an exception set,
// that -1 is passed on to the caller with the exception left in place.
static double __Pyx_PyUnicode_AsDouble_WithSpaces(PyObject *obj) {
    // short inputs are parsed from the stack, longer ones from a heap buffer
    char stack_buffer[40];
    char *number = stack_buffer;
    const char *last;
    char *end;
    double value;
    int consumed_all;
    Py_ssize_t start, length = PyUnicode_GET_LENGTH(obj);
    const int kind = PyUnicode_KIND(obj);
    const void* data = PyUnicode_DATA(obj);

    // strip spaces at start and end
    // NOTE(review): the first loop has no bounds check - it presumably relies on the string
    // data being followed by a non-space terminator character; confirm against the C-API docs.
    start = 0;
    while (Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, start)))
        start++;
    while (start < length - 1 && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, length - 1)))
        length--;
    length -= start;
    if (unlikely(length <= 0)) goto fallback;

    // parse NaN / inf
    // -1.0 means "cannot be handled here", 0.0 means "looks like a regular number".
    value = __Pyx__PyUnicode_AsDouble_inf_nan(data, kind, start, length);
    if (unlikely(value == -1.0)) goto fallback;
    if (value != 0.0) return value;

    if (length >= 40) {
        number = (char*) PyMem_Malloc((length + 1) * sizeof(char));
        if (unlikely(!number)) goto fallback;
    }

    last = __Pyx__PyUnicode_AsDouble_Copy(data, kind, number, start, start + length);
    consumed_all = 0;
    if (likely(last)) {
        value = PyOS_string_to_double(number, &end, NULL);
        // Compare the pointers while the buffer they point into is still alive.
        consumed_all = (end == last);
    }
    if (number != stack_buffer) PyMem_Free(number);
    if (unlikely(!last)) goto fallback;

    if (likely(consumed_all) || (value == (double)-1 && PyErr_Occurred())) {
        return value;
    }

fallback:
    return __Pyx_SlowPyString_AsDouble(obj);
}
#endif
static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) { static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) {
// Currently not optimised for 1) Py2.7 and 2) Py3 unicode strings with non-ASCII whitespace. // Currently not optimised for Py2.7.
// See __Pyx__PyBytes_AsDouble() below, the same byte buffer copying could be done here.
#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY #if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY
if (unlikely(PyUnicode_READY(obj) == -1)) if (unlikely(PyUnicode_READY(obj) == -1))
return (double)-1; return (double)-1;
...@@ -676,8 +797,10 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) { ...@@ -676,8 +797,10 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) {
s = PyUnicode_AsUTF8AndSize(obj, &length); s = PyUnicode_AsUTF8AndSize(obj, &length);
return __Pyx__PyBytes_AsDouble(obj, s, length); return __Pyx__PyBytes_AsDouble(obj, s, length);
} }
#endif return __Pyx_PyUnicode_AsDouble_WithSpaces(obj);
#else
return __Pyx_SlowPyString_AsDouble(obj); return __Pyx_SlowPyString_AsDouble(obj);
#endif
} }
...@@ -687,18 +810,10 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj);/*proto*/ ...@@ -687,18 +810,10 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj);/*proto*/
static double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length);/*proto*/ static double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length);/*proto*/
static CYTHON_INLINE double __Pyx_PyBytes_AsDouble(PyObject *obj) { static CYTHON_INLINE double __Pyx_PyBytes_AsDouble(PyObject *obj) {
#if CYTHON_COMPILING_IN_PYPY
return __Pyx_SlowPyString_AsDouble(obj);
#else
return __Pyx__PyBytes_AsDouble(obj, PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)); return __Pyx__PyBytes_AsDouble(obj, PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
#endif
} }
static CYTHON_INLINE double __Pyx_PyByteArray_AsDouble(PyObject *obj) { static CYTHON_INLINE double __Pyx_PyByteArray_AsDouble(PyObject *obj) {
#if CYTHON_COMPILING_IN_PYPY
return __Pyx_SlowPyString_AsDouble(obj);
#else
return __Pyx__PyBytes_AsDouble(obj, PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj)); return __Pyx__PyBytes_AsDouble(obj, PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj));
#endif
} }
...@@ -720,12 +835,66 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj) { ...@@ -720,12 +835,66 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj) {
} }
static const char* __Pyx__PyBytes_AsDouble_Copy(const char* start, char* buffer, Py_ssize_t length) { static const char* __Pyx__PyBytes_AsDouble_Copy(const char* start, char* buffer, Py_ssize_t length) {
// number must not start with punctuation
int last_was_punctuation = 1;
Py_ssize_t i; Py_ssize_t i;
for (i=0; i < length; i++) { for (i=0; i < length; i++) {
if (start[i] != '_') *buffer++ = start[i]; char chr = start[i];
} int is_punctuation = (chr == '_') | (chr == '.') | (chr == 'e') | (chr == 'E');
*buffer = chr;
buffer += (chr != '_');
// reject sequences of '_' and '.'
if (unlikely(last_was_punctuation & is_punctuation)) goto parse_failure;
last_was_punctuation = is_punctuation;
}
if (unlikely(last_was_punctuation)) goto parse_failure;
*buffer = '\0'; *buffer = '\0';
return buffer; return buffer;
parse_failure:
return NULL;
}
// Recognise the special float spellings "nan", "inf" and "infinity" (any letter case,
// optionally preceded by '+' or '-') in the "length" bytes starting at "start".
//
// Returns
//   +/-NaN or +/-infinity  if the bytes are exactly one of these spellings,
//   0.0                    if the bytes (after the sign) start with a digit or '.',
//   -1.0                   for anything else.
// "nan" is only recognised when Py_NAN is defined.
static double __Pyx__PyBytes_AsDouble_inf_nan(const char* start, Py_ssize_t length) {
// stays 1 only while every compared letter matched
int matches = 1;
char sign = start[0];
int is_signed = (sign == '+') | (sign == '-');
// skip over an explicit sign
start += is_signed;
length -= is_signed;
switch (start[0]) {
#ifdef Py_NAN
case 'n':
case 'N':
// "nan" is the only accepted spelling
if (unlikely(length != 3)) goto parse_failure;
matches &= (start[1] == 'a' || start[1] == 'A');
matches &= (start[2] == 'n' || start[2] == 'N');
if (unlikely(!matches)) goto parse_failure;
return (sign == '-') ? -Py_NAN : Py_NAN;
#endif
case 'i':
case 'I':
if (unlikely(length < 3)) goto parse_failure;
matches &= (start[1] == 'n' || start[1] == 'N');
matches &= (start[2] == 'f' || start[2] == 'F');
// short spelling: "inf"
if (likely(length == 3 && matches))
return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
// long spelling: "infinity", any other length is rejected
if (unlikely(length != 8)) goto parse_failure;
matches &= (start[3] == 'i' || start[3] == 'I');
matches &= (start[4] == 'n' || start[4] == 'N');
matches &= (start[5] == 'i' || start[5] == 'I');
matches &= (start[6] == 't' || start[6] == 'T');
matches &= (start[7] == 'y' || start[7] == 'Y');
if (unlikely(!matches)) goto parse_failure;
return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
// a leading digit or '.' is not a special value
case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
break;
default:
goto parse_failure;
}
// 0.0 signals "not nan/inf, but starts like a regular number"
return 0.0;
parse_failure:
// -1.0 signals "does not start like a number and is not nan/inf"
return -1.0;
} }
static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length) { static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length) {
...@@ -733,12 +902,20 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s ...@@ -733,12 +902,20 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s
Py_ssize_t i, digits; Py_ssize_t i, digits;
const char *last = start + length; const char *last = start + length;
char *end; char *end;
// strip spaces at start and end // strip spaces at start and end
while (Py_ISSPACE(*start)) while (Py_ISSPACE(*start))
start++; start++;
while (start < last - 1 && Py_ISSPACE(last[-1])) while (start < last - 1 && Py_ISSPACE(last[-1]))
last--; last--;
length = last - start; length = last - start;
if (unlikely(length <= 0)) goto fallback;
// parse NaN / inf
value = __Pyx__PyBytes_AsDouble_inf_nan(start, length);
if (unlikely(value == -1.0)) goto fallback;
if (value != 0.0) return value;
// look for underscores // look for underscores
digits = 0; digits = 0;
for (i=0; i < length; digits += start[i++] != '_'); for (i=0; i < length; digits += start[i++] != '_');
...@@ -748,11 +925,16 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s ...@@ -748,11 +925,16 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s
} else if (digits < 40) { } else if (digits < 40) {
char number[40]; char number[40];
last = __Pyx__PyBytes_AsDouble_Copy(start, number, length); last = __Pyx__PyBytes_AsDouble_Copy(start, number, length);
if (unlikely(!last)) goto fallback;
value = PyOS_string_to_double(number, &end, NULL); value = PyOS_string_to_double(number, &end, NULL);
} else { } else {
char *number = (char*) PyMem_Malloc((digits + 1) * sizeof(char)); char *number = (char*) PyMem_Malloc((digits + 1) * sizeof(char));
if (unlikely(!number)) goto fallback; if (unlikely(!number)) goto fallback;
last = __Pyx__PyBytes_AsDouble_Copy(start, number, length); last = __Pyx__PyBytes_AsDouble_Copy(start, number, length);
if (unlikely(!last)) {
PyMem_Free(number);
goto fallback;
}
value = PyOS_string_to_double(number, &end, NULL); value = PyOS_string_to_double(number, &end, NULL);
PyMem_Free(number); PyMem_Free(number);
} }
......
...@@ -5,7 +5,7 @@ import cython ...@@ -5,7 +5,7 @@ import cython
import sys import sys
def fix_underscores(s): def fix_underscores(s):
if sys.version_info < (3, 6): if sys.version_info < (3, 6) or getattr(sys, 'pypy_version_info', (9, 9)) < (3, 7, 4):
# Py2 float() does not support PEP-515 underscore literals # Py2 float() does not support PEP-515 underscore literals
if isinstance(s, bytes): if isinstance(s, bytes):
if not cython.compiled and b'_' in s: if not cython.compiled and b'_' in s:
...@@ -60,6 +60,18 @@ def from_bytes(s: bytes): ...@@ -60,6 +60,18 @@ def from_bytes(s: bytes):
1.2413112312318938e+47 1.2413112312318938e+47
>>> from_bytes(b"123E100") >>> from_bytes(b"123E100")
1.23e+102 1.23e+102
>>> from_bytes(b"12__._3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12__._3...
>>> from_bytes(b"_12.3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ..._12.3...
>>> from_bytes(b"12.3_") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12.3_...
>>> from_bytes(b"na_n") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...na_n...
>>> from_bytes(None) # doctest: +ELLIPSIS >>> from_bytes(None) # doctest: +ELLIPSIS
Traceback (most recent call last): Traceback (most recent call last):
TypeError... TypeError...
...@@ -95,6 +107,18 @@ def from_bytearray(s: bytearray): ...@@ -95,6 +107,18 @@ def from_bytearray(s: bytearray):
1.2413112312318938e+47 1.2413112312318938e+47
>>> from_bytearray(bytearray(b"123E100")) >>> from_bytearray(bytearray(b"123E100"))
1.23e+102 1.23e+102
>>> from_bytearray(bytearray(b"12__._3")) # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12__._3...
>>> from_bytearray(bytearray(b"_12.3")) # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ..._12.3...
>>> from_bytearray(bytearray(b"12.3_")) # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12.3_...
>>> from_bytearray(bytearray(b"in_f")) # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...in_f...
>>> from_bytearray(None) # doctest: +ELLIPSIS >>> from_bytearray(None) # doctest: +ELLIPSIS
Traceback (most recent call last): Traceback (most recent call last):
TypeError... TypeError...
...@@ -118,6 +142,18 @@ def from_str(s: 'str'): ...@@ -118,6 +142,18 @@ def from_str(s: 'str'):
1.2413112312318938e+47 1.2413112312318938e+47
>>> from_str("123E100") >>> from_str("123E100")
1.23e+102 1.23e+102
>>> from_str("12__._3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12__._3...
>>> from_str("_12.3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ..._12.3...
>>> from_str("12.3_") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12.3_...
>>> from_str("n_an") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...n_an...
>>> from_str(None) # doctest: +ELLIPSIS >>> from_str(None) # doctest: +ELLIPSIS
Traceback (most recent call last): Traceback (most recent call last):
TypeError... TypeError...
...@@ -155,6 +191,32 @@ def from_unicode(s: 'unicode'): ...@@ -155,6 +191,32 @@ def from_unicode(s: 'unicode'):
1.23e+102 1.23e+102
>>> from_unicode(u"123.23\\N{PUNCTUATION SPACE}") >>> from_unicode(u"123.23\\N{PUNCTUATION SPACE}")
123.23 123.23
>>> from_unicode(u"\\N{PUNCTUATION SPACE} 123.23 \\N{PUNCTUATION SPACE}")
123.23
>>> from_unicode(fix_underscores(u"\\N{PUNCTUATION SPACE} 12_3.2_3 \\N{PUNCTUATION SPACE}"))
123.23
>>> from_unicode(u"\\N{PUNCTUATION SPACE} " * 25 + u"123.54 " + u"\\N{PUNCTUATION SPACE} " * 22) # >= 40 chars
123.54
>>> from_unicode(fix_underscores(u"\\N{PUNCTUATION SPACE} " * 25 + u"1_23.5_4 " + u"\\N{PUNCTUATION SPACE} " * 22))
123.54
>>> from_unicode(u"\\N{PUNCTUATION SPACE} " + u"123.54 " * 2 + u"\\N{PUNCTUATION SPACE}") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...123.54 123.54...
>>> from_unicode(u"\\N{PUNCTUATION SPACE} " * 25 + u"123.54 " * 2 + u"\\N{PUNCTUATION SPACE} " * 22) # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...123.54 123.54...
>>> from_unicode(u"_12__._3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ..._12__._3...
>>> from_unicode(u"_12.3") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ..._12.3...
>>> from_unicode(u"12.3_") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...12.3_...
>>> from_unicode(u"i_nf") # doctest: +ELLIPSIS
Traceback (most recent call last):
ValueError: ...i_nf...
>>> from_unicode(None) # doctest: +ELLIPSIS >>> from_unicode(None) # doctest: +ELLIPSIS
Traceback (most recent call last): Traceback (most recent call last):
TypeError... TypeError...
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment