Issue 26002 and 25974

patches by Upendra Kumar and Stefan Krah speed up median by using bisect, and general speedup for Decimals using as_integer_ratio

Issue 26002 and 25974
patches by Upendra Kumar and Stefan Krah speed up median by using bisect, and general speedup for Decimals using as_integer_ratio
3b06e243 · Steven D'Aprano · ad039f75 · 3b06e243 · 3b06e243
Commit 3b06e243 authored May 05, 2016 by Steven D'Aprano
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 55 deletions

Lib/statistics.py Lib/statistics.py +30 -38

Lib/test/test_statistics.py Lib/test/test_statistics.py +14 -17

No files found.
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -105,6 +105,7 @@ import math
 from fractions import Fraction
 from decimal import Decimal
 from itertools import groupby
+from bisect import bisect_left, bisect_right



@@ -223,56 +224,26 @@ def _exact_ratio(x):
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
-        if type(x) is float:
+        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
-                # x may be a float subclass.
+                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
-                try:
-                    # x may be a Decimal.
-                    return _decimal_to_ratio(x)
-                except AttributeError:
-                    # Just give up?
-                    pass
+                # Just give up?
+                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
-        assert not math.isfinite(x)
+        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


-# FIXME This is faster than Fraction.from_decimal, but still too slow.
-def _decimal_to_ratio(d):
-    """Convert Decimal d to exact integer ratio (numerator, denominator).
-
-    >>> from decimal import Decimal
-    >>> _decimal_to_ratio(Decimal("2.6"))
-    (26, 10)
-
-    """
-    sign, digits, exp = d.as_tuple()
-    if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
-        assert not d.is_finite()
-        return (d, None)
-    num = 0
-    for digit in digits:
-        num = num*10 + digit
-    if exp < 0:
-        den = 10**-exp
-    else:
-        num *= 10**exp
-        den = 1
-    if sign:
-        num = -num
-    return (num, den)
-
-
 def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
@@ -305,6 +276,21 @@ def _counts(data):
    return table


+def _find_lteq(a, x):
+    'Locate the leftmost value exactly equal to x'
+    i = bisect_left(a, x)
+    if i != len(a) and a[i] == x:
+        return i
+    raise ValueError
+
+
+def _find_rteq(a, l, x):
+    'Locate the rightmost value exactly equal to x'
+    i = bisect_right(a, x, lo=l)
+    if i != (len(a)+1) and a[i-1] == x:
+        return i-1
+    raise ValueError
+
 # === Measures of central tendency (averages) ===

 def mean(data):
@@ -442,9 +428,15 @@ def median_grouped(data, interval=1):
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2
-    cf = data.index(x)  # Number of values below the median interval.
-    # FIXME The following line could be more efficient for big lists.
-    f = data.count(x)  # Number of data points in the median interval.
+
+    # Uses bisection search to search for x in data with log(n) time complexity
+    # Find the position of leftmost occurence of x in data
+    l1 = _find_lteq(data, x)
+    # Find the position of rightmost occurence of x in data[l1...len(data)]
+    # Assuming always l1 <= l2
+    l2 = _find_rteq(data, l1, x)
+    cf = l1
+    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f



--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -699,13 +699,12 @@ class ExactRatioTest(unittest.TestCase):
            num, den = statistics._exact_ratio(x)
            self.assertEqual(x, num/den)

-    @unittest.skipIf(True, "temporarily disabled: see #25928")
    def test_decimal(self):
        D = Decimal
        _exact_ratio = statistics._exact_ratio
-        self.assertEqual(_exact_ratio(D("0.125")), (125, 1000))
-        self.assertEqual(_exact_ratio(D("12.345")), (12345, 1000))
-        self.assertEqual(_exact_ratio(D("-1.98")), (-198, 100))
+        self.assertEqual(_exact_ratio(D("0.125")), (1, 8))
+        self.assertEqual(_exact_ratio(D("12.345")), (2469, 200))
+        self.assertEqual(_exact_ratio(D("-1.98")), (-99, 50))

    def test_inf(self):
        INF = float("INF")
@@ -731,7 +730,6 @@ class ExactRatioTest(unittest.TestCase):
            self.assertIs(ratio[1], None)
            self.assertEqual(type(ratio[0]), type(nan))

-    @unittest.skipIf(True, "temporarily disabled: see #25928")
    def test_decimal_nan(self):
        NAN = Decimal("NAN")
        sNAN = Decimal("sNAN")
@@ -745,18 +743,18 @@ class ExactRatioTest(unittest.TestCase):


 class DecimalToRatioTest(unittest.TestCase):
-    # Test _decimal_to_ratio private function.
+    # Test _exact_ratio private function.

    def test_infinity(self):
        # Test that INFs are handled correctly.
        inf = Decimal('INF')
-        self.assertEqual(statistics._decimal_to_ratio(inf), (inf, None))
-        self.assertEqual(statistics._decimal_to_ratio(-inf), (-inf, None))
+        self.assertEqual(statistics._exact_ratio(inf), (inf, None))
+        self.assertEqual(statistics._exact_ratio(-inf), (-inf, None))

    def test_nan(self):
        # Test that NANs are handled correctly.
        for nan in (Decimal('NAN'), Decimal('sNAN')):
-            num, den = statistics._decimal_to_ratio(nan)
+            num, den = statistics._exact_ratio(nan)
            # Because NANs always compare non-equal, we cannot use assertEqual.
            # Nor can we use an identity test, as we don't guarantee anything
            # about the object identity.
@@ -769,30 +767,30 @@ class DecimalToRatioTest(unittest.TestCase):
        for d in numbers:
            # First test positive decimals.
            assert d > 0
-            num, den = statistics._decimal_to_ratio(d)
+            num, den = statistics._exact_ratio(d)
            self.assertGreaterEqual(num, 0)
            self.assertGreater(den, 0)
            # Then test negative decimals.
-            num, den = statistics._decimal_to_ratio(-d)
+            num, den = statistics._exact_ratio(-d)
            self.assertLessEqual(num, 0)
            self.assertGreater(den, 0)

    def test_negative_exponent(self):
        # Test result when the exponent is negative.
-        t = statistics._decimal_to_ratio(Decimal("0.1234"))
-        self.assertEqual(t, (1234, 10000))
+        t = statistics._exact_ratio(Decimal("0.1234"))
+        self.assertEqual(t, (617, 5000))

    def test_positive_exponent(self):
        # Test results when the exponent is positive.
-        t = statistics._decimal_to_ratio(Decimal("1.234e7"))
+        t = statistics._exact_ratio(Decimal("1.234e7"))
        self.assertEqual(t, (12340000, 1))

    def test_regression_20536(self):
        # Regression test for issue 20536.
        # See http://bugs.python.org/issue20536
-        t = statistics._decimal_to_ratio(Decimal("1e2"))
+        t = statistics._exact_ratio(Decimal("1e2"))
        self.assertEqual(t, (100, 1))
-        t = statistics._decimal_to_ratio(Decimal("1.47e5"))
+        t = statistics._exact_ratio(Decimal("1.47e5"))
        self.assertEqual(t, (147000, 1))


@@ -1260,7 +1258,6 @@ class SumSpecialValues(NumericTestCase):
        with decimal.localcontext(decimal.BasicContext):
            self.assertRaises(decimal.InvalidOperation, statistics._sum, data)

-    @unittest.skipIf(True, "temporarily disabled: see #25928")
    def test_decimal_snan_raises(self):
        # Adding sNAN should raise InvalidOperation.
        sNAN = Decimal('sNAN')