Optimise float parsing from Unicode strings with non-ASCII spaces (GH-4084)

* Reject invalid underscore placements in float parser.
* Add a proper nan/inf float parser to prevent underscore-mixes like "in_f" from passing through.

Optimise float parsing from Unicode strings with non-ASCII spaces (GH-4084)
* Reject invalid underscore placements in float parser.
* Add a proper nan/inf float parser to prevent underscore-mixes like "in_f" from passing through.
18970d35 · scoder · GitHub · 747cd2fb · 18970d35 · 18970d35
Commit 18970d35 authored Apr 05, 2021 by scoder Committed by GitHub Apr 05, 2021
Show whitespace changes
Inline Side-by-side

Showing with 257 additions and 13 deletions

Cython/Utility/Optimize.c Cython/Utility/Optimize.c +194 -12

tests/run/builtin_float.py tests/run/builtin_float.py +63 -1

No files found.
--- a/Cython/Utility/Optimize.c
+++ b/Cython/Utility/Optimize.c
@@ -664,9 +664,130 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj);/*proto*/
 /////////////// pyunicode_as_double.proto ///////////////
 //@requires: pybytes_as_double
+#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY
+static const char* __Pyx__PyUnicode_AsDouble_Copy(const void* data, const int kind, char* buffer, Py_ssize_t start, Py_ssize_t end) {
+    int last_was_punctuation;
+    Py_ssize_t i;
+    // number must not start with punctuation
+    last_was_punctuation = 1;
+    for (i=start; i <= end; i++) {
+        Py_UCS4 chr = PyUnicode_READ(kind, data, i);
+        int is_punctuation = (chr == '_') | (chr == '.');
+        *buffer = (char)chr;
+        // reject sequences of '_' and '.'
+        buffer += (chr != '_');
+        if (unlikely(chr > 127)) goto parse_failure;
+        if (unlikely(last_was_punctuation & is_punctuation)) goto parse_failure;
+        last_was_punctuation = is_punctuation;
+    }
+    if (unlikely(last_was_punctuation)) goto parse_failure;
+    *buffer = '\0';
+    return buffer;
+parse_failure:
+    return NULL;
+}
+static double __Pyx__PyUnicode_AsDouble_inf_nan(const void* data, int kind, Py_ssize_t start, Py_ssize_t length) {
+    int matches = 1;
+    Py_UCS4 chr;
+    Py_UCS4 sign = PyUnicode_READ(kind, data, start);
+    int is_signed = (sign == '-') | (sign == '+');
+    start += is_signed;
+    length -= is_signed;
+    switch (PyUnicode_READ(kind, data, start)) {
+        #ifdef Py_NAN
+        case 'n':
+        case 'N':
+            if (unlikely(length != 3)) goto parse_failure;
+            chr = PyUnicode_READ(kind, data, start+1);
+            matches &= (chr == 'a') | (chr == 'A');
+            chr = PyUnicode_READ(kind, data, start+2);
+            matches &= (chr == 'n') | (chr == 'N');
+            if (unlikely(!matches)) goto parse_failure;
+            return (sign == '-') ? -Py_NAN : Py_NAN;
+        #endif
+        case 'i':
+        case 'I':
+            if (unlikely(length < 3)) goto parse_failure;
+            chr = PyUnicode_READ(kind, data, start+1);
+            matches &= (chr == 'n') | (chr == 'N');
+            chr = PyUnicode_READ(kind, data, start+2);
+            matches &= (chr == 'f') | (chr == 'F');
+            if (likely(length == 3 && matches))
+                return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
+            if (unlikely(length != 8)) goto parse_failure;
+            chr = PyUnicode_READ(kind, data, start+3);
+            matches &= (chr == 'i') | (chr == 'I');
+            chr = PyUnicode_READ(kind, data, start+4);
+            matches &= (chr == 'n') | (chr == 'N');
+            chr = PyUnicode_READ(kind, data, start+5);
+            matches &= (chr == 'i') | (chr == 'I');
+            chr = PyUnicode_READ(kind, data, start+6);
+            matches &= (chr == 't') | (chr == 'T');
+            chr = PyUnicode_READ(kind, data, start+7);
+            matches &= (chr == 'y') | (chr == 'Y');
+            if (unlikely(!matches)) goto parse_failure;
+            return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
+        case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+            break;
+        default:
+            goto parse_failure;
+    }
+    return 0.0;
+parse_failure:
+    return -1.0;
+}
+static double __Pyx_PyUnicode_AsDouble_WithSpaces(PyObject *obj) {
+    double value;
+    const char *last;
+    char *end;
+    Py_ssize_t start, length = PyUnicode_GET_LENGTH(obj);
+    const int kind = PyUnicode_KIND(obj);
+    const void* data = PyUnicode_DATA(obj);
+    // strip spaces at start and end
+    start = 0;
+    while (Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, start)))
+        start++;
+    while (start < length - 1 && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, length - 1)))
+        length--;
+    length -= start;
+    if (unlikely(length <= 0)) goto fallback;
+    // parse NaN / inf
+    value = __Pyx__PyUnicode_AsDouble_inf_nan(data, kind, start, length);
+    if (unlikely(value == -1.0)) goto fallback;
+    if (value != 0.0) return value;
+    if (length < 40) {
+        char number[40];
+        last = __Pyx__PyUnicode_AsDouble_Copy(data, kind, number, start, start + length);
+        if (unlikely(!last)) goto fallback;
+        value = PyOS_string_to_double(number, &end, NULL);
+    } else {
+        char *number = (char*) PyMem_Malloc((length + 1) * sizeof(char));
+        if (unlikely(!number)) goto fallback;
+        last = __Pyx__PyUnicode_AsDouble_Copy(data, kind, number, start, start + length);
+        if (unlikely(!last)) {
+            PyMem_Free(number);
+            goto fallback;
+        }
+        value = PyOS_string_to_double(number, &end, NULL);
+        PyMem_Free(number);
+    }
+    if (likely(end == last) || (value == (double)-1 && PyErr_Occurred())) {
+        return value;
+    }
+fallback:
+    return __Pyx_SlowPyString_AsDouble(obj);
+}
+#endif
 static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) {
-    // Currently not optimised for 1) Py2.7 and 2) Py3 unicode strings with non-ASCII whitespace.
+    // Currently not optimised for Py2.7.
-    // See __Pyx__PyBytes_AsDouble() below, the same byte buffer copying could be done here.
 #if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY
    if (unlikely(PyUnicode_READY(obj) == -1))
        return (double)-1;
@@ -676,8 +797,10 @@ static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj) {
        s = PyUnicode_AsUTF8AndSize(obj, &length);
        return __Pyx__PyBytes_AsDouble(obj, s, length);
    }
-#endif
+    return __Pyx_PyUnicode_AsDouble_WithSpaces(obj);
+#else
    return __Pyx_SlowPyString_AsDouble(obj);
+#endif
 }
@@ -687,18 +810,10 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj);/*proto*/
 static double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length);/*proto*/
 static CYTHON_INLINE double __Pyx_PyBytes_AsDouble(PyObject *obj) {
-#if CYTHON_COMPILING_IN_PYPY
-    return __Pyx_SlowPyString_AsDouble(obj);
-#else
    return __Pyx__PyBytes_AsDouble(obj, PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
-#endif
 }
 static CYTHON_INLINE double __Pyx_PyByteArray_AsDouble(PyObject *obj) {
-#if CYTHON_COMPILING_IN_PYPY
-    return __Pyx_SlowPyString_AsDouble(obj);
-#else
    return __Pyx__PyBytes_AsDouble(obj, PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj));
-#endif
 }
@@ -720,12 +835,66 @@ static double __Pyx_SlowPyString_AsDouble(PyObject *obj) {
 }
 static const char* __Pyx__PyBytes_AsDouble_Copy(const char* start, char* buffer, Py_ssize_t length) {
+    // number must not start with punctuation
+    int last_was_punctuation = 1;
    Py_ssize_t i;
    for (i=0; i < length; i++) {
-        if (start[i] != '_') *buffer++ = start[i];
+        char chr = start[i];
-    }
+        int is_punctuation = (chr == '_') | (chr == '.') | (chr == 'e') | (chr == 'E');
+        *buffer = chr;
+        buffer += (chr != '_');
+        // reject sequences of '_' and '.'
+        if (unlikely(last_was_punctuation & is_punctuation)) goto parse_failure;
+        last_was_punctuation = is_punctuation;
+    }
+    if (unlikely(last_was_punctuation)) goto parse_failure;
    *buffer = '\0';
    return buffer;
+parse_failure:
+    return NULL;
+}
+static double __Pyx__PyBytes_AsDouble_inf_nan(const char* start, Py_ssize_t length) {
+    int matches = 1;
+    char sign = start[0];
+    int is_signed = (sign == '+') | (sign == '-');
+    start += is_signed;
+    length -= is_signed;
+    switch (start[0]) {
+        #ifdef Py_NAN
+        case 'n':
+        case 'N':
+            if (unlikely(length != 3)) goto parse_failure;
+            matches &= (start[1] == 'a' || start[1] == 'A');
+            matches &= (start[2] == 'n' || start[2] == 'N');
+            if (unlikely(!matches)) goto parse_failure;
+            return (sign == '-') ? -Py_NAN : Py_NAN;
+        #endif
+        case 'i':
+        case 'I':
+            if (unlikely(length < 3)) goto parse_failure;
+            matches &= (start[1] == 'n' || start[1] == 'N');
+            matches &= (start[2] == 'f' || start[2] == 'F');
+            if (likely(length == 3 && matches))
+                return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
+            if (unlikely(length != 8)) goto parse_failure;
+            matches &= (start[3] == 'i' || start[3] == 'I');
+            matches &= (start[4] == 'n' || start[4] == 'N');
+            matches &= (start[5] == 'i' || start[5] == 'I');
+            matches &= (start[6] == 't' || start[6] == 'T');
+            matches &= (start[7] == 'y' || start[7] == 'Y');
+            if (unlikely(!matches)) goto parse_failure;
+            return (sign == '-') ? -Py_HUGE_VAL : Py_HUGE_VAL;
+        case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+            break;
+        default:
+            goto parse_failure;
+    }
+    return 0.0;
+parse_failure:
+    return -1.0;
 }
 static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* start, Py_ssize_t length) {
@@ -733,12 +902,20 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s
    Py_ssize_t i, digits;
    const char *last = start + length;
    char *end;
    // strip spaces at start and end
    while (Py_ISSPACE(*start))
        start++;
    while (start < last - 1 && Py_ISSPACE(last[-1]))
        last--;
    length = last - start;
+    if (unlikely(length <= 0)) goto fallback;
+    // parse NaN / inf
+    value = __Pyx__PyBytes_AsDouble_inf_nan(start, length);
+    if (unlikely(value == -1.0)) goto fallback;
+    if (value != 0.0) return value;
    // look for underscores
    digits = 0;
    for (i=0; i < length; digits += start[i++] != '_');
@@ -748,11 +925,16 @@ static CYTHON_UNUSED double __Pyx__PyBytes_AsDouble(PyObject *obj, const char* s
    } else if (digits < 40) {
        char number[40];
        last = __Pyx__PyBytes_AsDouble_Copy(start, number, length);
+        if (unlikely(!last)) goto fallback;
        value = PyOS_string_to_double(number, &end, NULL);
    } else {
        char *number = (char*) PyMem_Malloc((digits + 1) * sizeof(char));
        if (unlikely(!number)) goto fallback;
        last = __Pyx__PyBytes_AsDouble_Copy(start, number, length);
+        if (unlikely(!last)) {
+            PyMem_Free(number);
+            goto fallback;
+        }
        value = PyOS_string_to_double(number, &end, NULL);
        PyMem_Free(number);
    }

--- a/tests/run/builtin_float.py
+++ b/tests/run/builtin_float.py
@@ -5,7 +5,7 @@ import cython
 import sys
 def fix_underscores(s):
-    if sys.version_info < (3, 6):
+    if sys.version_info < (3, 6) or getattr(sys, 'pypy_version_info', (9, 9)) < (3, 7, 4):
        # Py2 float() does not support PEP-515 underscore literals
        if isinstance(s, bytes):
            if not cython.compiled and b'_' in s:
@@ -60,6 +60,18 @@ def from_bytes(s: bytes):
    1.2413112312318938e+47
    >>> from_bytes(b"123E100")
    1.23e+102
+    >>> from_bytes(b"12__._3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12__._3...
+    >>> from_bytes(b"_12.3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ..._12.3...
+    >>> from_bytes(b"12.3_")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12.3_...
+    >>> from_bytes(b"na_n")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...na_n...
    >>> from_bytes(None)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    TypeError...
@@ -95,6 +107,18 @@ def from_bytearray(s: bytearray):
    1.2413112312318938e+47
    >>> from_bytearray(bytearray(b"123E100"))
    1.23e+102
+    >>> from_bytearray(bytearray(b"12__._3"))  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12__._3...
+    >>> from_bytearray(bytearray(b"_12.3"))  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ..._12.3...
+    >>> from_bytearray(bytearray(b"12.3_"))  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12.3_...
+    >>> from_bytearray(bytearray(b"in_f"))  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...in_f...
    >>> from_bytearray(None)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    TypeError...
@@ -118,6 +142,18 @@ def from_str(s: 'str'):
    1.2413112312318938e+47
    >>> from_str("123E100")
    1.23e+102
+    >>> from_str("12__._3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12__._3...
+    >>> from_str("_12.3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ..._12.3...
+    >>> from_str("12.3_")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12.3_...
+    >>> from_str("n_an")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...n_an...
    >>> from_str(None)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    TypeError...
@@ -155,6 +191,32 @@ def from_unicode(s: 'unicode'):
    1.23e+102
    >>> from_unicode(u"123.23\\N{PUNCTUATION SPACE}")
    123.23
+    >>> from_unicode(u"\\N{PUNCTUATION SPACE} 123.23 \\N{PUNCTUATION SPACE}")
+    123.23
+    >>> from_unicode(fix_underscores(u"\\N{PUNCTUATION SPACE} 12_3.2_3 \\N{PUNCTUATION SPACE}"))
+    123.23
+    >>> from_unicode(u"\\N{PUNCTUATION SPACE} " * 25 + u"123.54 " + u"\\N{PUNCTUATION SPACE} " * 22)  # >= 40 chars
+    123.54
+    >>> from_unicode(fix_underscores(u"\\N{PUNCTUATION SPACE} " * 25 + u"1_23.5_4 " + u"\\N{PUNCTUATION SPACE} " * 22))
+    123.54
+    >>> from_unicode(u"\\N{PUNCTUATION SPACE} " + u"123.54 " * 2 + u"\\N{PUNCTUATION SPACE}")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...123.54 123.54...
+    >>> from_unicode(u"\\N{PUNCTUATION SPACE} " * 25 + u"123.54 " * 2 + u"\\N{PUNCTUATION SPACE} " * 22)  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...123.54 123.54...
+    >>> from_unicode(u"_12__._3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ..._12__._3...
+    >>> from_unicode(u"_12.3")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ..._12.3...
+    >>> from_unicode(u"12.3_")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...12.3_...
+    >>> from_unicode(u"i_nf")  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ValueError: ...i_nf...
    >>> from_unicode(None)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    TypeError...