Commit 40a841bc authored by Steven D'Aprano's avatar Steven D'Aprano

Fixed issue #25177, problems with the mean of very small and very large numbers.

parent ee1a0e4b
...@@ -104,6 +104,8 @@ import math ...@@ -104,6 +104,8 @@ import math
from fractions import Fraction from fractions import Fraction
from decimal import Decimal from decimal import Decimal
from itertools import groupby
# === Exceptions === # === Exceptions ===
...@@ -115,86 +117,102 @@ class StatisticsError(ValueError): ...@@ -115,86 +117,102 @@ class StatisticsError(ValueError):
# === Private utilities === # === Private utilities ===
def _sum(data, start=0): def _sum(data, start=0):
"""_sum(data [, start]) -> value """_sum(data [, start]) -> (type, sum, count)
Return a high-precision sum of the given numeric data as a fraction,
together with the type to be converted to and the count of items.
Return a high-precision sum of the given numeric data. If optional If optional argument ``start`` is given, it is added to the total.
argument ``start`` is given, it is added to the total. If ``data`` is If ``data`` is empty, ``start`` (defaulting to 0) is returned.
empty, ``start`` (defaulting to 0) is returned.
Examples Examples
-------- --------
>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
11.0 (<class 'float'>, Fraction(11, 1), 5)
Some sources of round-off error will be avoided: Some sources of round-off error will be avoided:
>>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
1000.0 (<class 'float'>, Fraction(1000, 1), 3000)
Fractions and Decimals are also supported: Fractions and Decimals are also supported:
>>> from fractions import Fraction as F >>> from fractions import Fraction as F
>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Fraction(63, 20) (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
>>> from decimal import Decimal as D >>> from decimal import Decimal as D
>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
>>> _sum(data) >>> _sum(data)
Decimal('0.6963') (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Mixed types are currently treated as an error, except that int is Mixed types are currently treated as an error, except that int is
allowed. allowed.
""" """
# We fail as soon as we reach a value that is not an int or the type of count = 0
# the first value which is not an int. E.g. _sum([int, int, float, int])
# is okay, but sum([int, int, float, Fraction]) is not.
allowed_types = {int, type(start)}
n, d = _exact_ratio(start) n, d = _exact_ratio(start)
partials = {d: n} # map {denominator: sum of numerators} partials = {d: n}
# Micro-optimizations.
exact_ratio = _exact_ratio
partials_get = partials.get partials_get = partials.get
# Add numerators for each denominator. T = _coerce(int, type(start))
for x in data: for typ, values in groupby(data, type):
_check_type(type(x), allowed_types) T = _coerce(T, typ) # or raise TypeError
n, d = exact_ratio(x) for n,d in map(_exact_ratio, values):
partials[d] = partials_get(d, 0) + n count += 1
# Find the expected result type. If allowed_types has only one item, it partials[d] = partials_get(d, 0) + n
# will be int; if it has two, use the one which isn't int.
assert len(allowed_types) in (1, 2)
if len(allowed_types) == 1:
assert allowed_types.pop() is int
T = int
else:
T = (allowed_types - {int}).pop()
if None in partials: if None in partials:
assert issubclass(T, (float, Decimal)) # The sum will be a NAN or INF. We can ignore all the finite
assert not math.isfinite(partials[None]) # partials, and just look at this special one.
return T(partials[None]) total = partials[None]
total = Fraction() assert not _isfinite(total)
for d, n in sorted(partials.items()): else:
total += Fraction(n, d) # Sum all the partial sums using builtin sum.
if issubclass(T, int): # FIXME is this faster if we sum them in order of the denominator?
assert total.denominator == 1 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
return T(total.numerator) return (T, total, count)
if issubclass(T, Decimal):
return T(total.numerator)/total.denominator
return T(total) def _isfinite(x):
try:
return x.is_finite() # Likely a Decimal.
def _check_type(T, allowed): except AttributeError:
if T not in allowed: return math.isfinite(x) # Coerces to float first.
if len(allowed) == 1:
allowed.add(T)
else: def _coerce(T, S):
types = ', '.join([t.__name__ for t in allowed] + [T.__name__]) """Coerce types T and S to a common type, or raise TypeError.
raise TypeError("unsupported mixed types: %s" % types)
Coercion rules are currently an implementation detail. See the CoerceTest
test class in test_statistics for details.
"""
# See http://bugs.python.org/issue24068.
assert T is not bool, "initial type T is bool"
# If the types are the same, no need to coerce anything. Put this
# first, so that the usual case (no coercion needed) happens as soon
# as possible.
if T is S: return T
# Mixed int & other coerce to the other type.
if S is int or S is bool: return T
if T is int: return S
# If one is a (strict) subclass of the other, coerce to the subclass.
if issubclass(S, T): return S
if issubclass(T, S): return T
# Ints coerce to the other type.
if issubclass(T, int): return S
if issubclass(S, int): return T
# Mixed fraction & float coerces to float (or float subclass).
if issubclass(T, Fraction) and issubclass(S, float):
return S
if issubclass(T, float) and issubclass(S, Fraction):
return T
# Any other combination is disallowed.
msg = "don't know how to coerce %s and %s"
raise TypeError(msg % (T.__name__, S.__name__))
def _exact_ratio(x): def _exact_ratio(x):
"""Convert Real number x exactly to (numerator, denominator) pair. """Return Real number x to exact (numerator, denominator) pair.
>>> _exact_ratio(0.25) >>> _exact_ratio(0.25)
(1, 4) (1, 4)
...@@ -202,29 +220,31 @@ def _exact_ratio(x): ...@@ -202,29 +220,31 @@ def _exact_ratio(x):
x is expected to be an int, Fraction, Decimal or float. x is expected to be an int, Fraction, Decimal or float.
""" """
try: try:
# Optimise the common case of floats. We expect that the most often
# used numeric type will be builtin floats, so try to make this as
# fast as possible.
if type(x) is float:
return x.as_integer_ratio()
try: try:
# int, Fraction # x may be an int, Fraction, or Integral ABC.
return (x.numerator, x.denominator) return (x.numerator, x.denominator)
except AttributeError: except AttributeError:
# float
try: try:
# x may be a float subclass.
return x.as_integer_ratio() return x.as_integer_ratio()
except AttributeError: except AttributeError:
# Decimal
try: try:
# x may be a Decimal.
return _decimal_to_ratio(x) return _decimal_to_ratio(x)
except AttributeError: except AttributeError:
msg = "can't convert type '{}' to numerator/denominator" # Just give up?
raise TypeError(msg.format(type(x).__name__)) from None pass
except (OverflowError, ValueError): except (OverflowError, ValueError):
# INF or NAN # float NAN or INF.
if __debug__: assert not math.isfinite(x)
# Decimal signalling NANs cannot be converted to float :-(
if isinstance(x, Decimal):
assert not x.is_finite()
else:
assert not math.isfinite(x)
return (x, None) return (x, None)
msg = "can't convert type '{}' to numerator/denominator"
raise TypeError(msg.format(type(x).__name__))
# FIXME This is faster than Fraction.from_decimal, but still too slow. # FIXME This is faster than Fraction.from_decimal, but still too slow.
...@@ -239,7 +259,7 @@ def _decimal_to_ratio(d): ...@@ -239,7 +259,7 @@ def _decimal_to_ratio(d):
sign, digits, exp = d.as_tuple() sign, digits, exp = d.as_tuple()
if exp in ('F', 'n', 'N'): # INF, NAN, sNAN if exp in ('F', 'n', 'N'): # INF, NAN, sNAN
assert not d.is_finite() assert not d.is_finite()
raise ValueError return (d, None)
num = 0 num = 0
for digit in digits: for digit in digits:
num = num*10 + digit num = num*10 + digit
...@@ -253,6 +273,24 @@ def _decimal_to_ratio(d): ...@@ -253,6 +273,24 @@ def _decimal_to_ratio(d):
return (num, den) return (num, den)
def _convert(value, T):
"""Convert value to given numeric type T."""
if type(value) is T:
# This covers the cases where T is Fraction, or where value is
# a NAN or INF (Decimal or float).
return value
if issubclass(T, int) and value.denominator != 1:
T = float
try:
# FIXME: what do we do if this overflows?
return T(value)
except TypeError:
if issubclass(T, Decimal):
return T(value.numerator)/T(value.denominator)
else:
raise
def _counts(data): def _counts(data):
# Generate a table of sorted (value, frequency) pairs. # Generate a table of sorted (value, frequency) pairs.
table = collections.Counter(iter(data)).most_common() table = collections.Counter(iter(data)).most_common()
...@@ -290,7 +328,9 @@ def mean(data): ...@@ -290,7 +328,9 @@ def mean(data):
n = len(data) n = len(data)
if n < 1: if n < 1:
raise StatisticsError('mean requires at least one data point') raise StatisticsError('mean requires at least one data point')
return _sum(data)/n T, total, count = _sum(data)
assert count == n
return _convert(total/n, T)
# FIXME: investigate ways to calculate medians without sorting? Quickselect? # FIXME: investigate ways to calculate medians without sorting? Quickselect?
...@@ -460,12 +500,14 @@ def _ss(data, c=None): ...@@ -460,12 +500,14 @@ def _ss(data, c=None):
""" """
if c is None: if c is None:
c = mean(data) c = mean(data)
ss = _sum((x-c)**2 for x in data) T, total, count = _sum((x-c)**2 for x in data)
# The following sum should mathematically equal zero, but due to rounding # The following sum should mathematically equal zero, but due to rounding
# error may not. # error may not.
ss -= _sum((x-c) for x in data)**2/len(data) U, total2, count2 = _sum((x-c) for x in data)
assert not ss < 0, 'negative sum of square deviations: %f' % ss assert T == U and count == count2
return ss total -= total2**2/len(data)
assert not total < 0, 'negative sum of square deviations: %f' % total
return (T, total)
def variance(data, xbar=None): def variance(data, xbar=None):
...@@ -511,8 +553,8 @@ def variance(data, xbar=None): ...@@ -511,8 +553,8 @@ def variance(data, xbar=None):
n = len(data) n = len(data)
if n < 2: if n < 2:
raise StatisticsError('variance requires at least two data points') raise StatisticsError('variance requires at least two data points')
ss = _ss(data, xbar) T, ss = _ss(data, xbar)
return ss/(n-1) return _convert(ss/(n-1), T)
def pvariance(data, mu=None): def pvariance(data, mu=None):
...@@ -560,7 +602,8 @@ def pvariance(data, mu=None): ...@@ -560,7 +602,8 @@ def pvariance(data, mu=None):
if n < 1: if n < 1:
raise StatisticsError('pvariance requires at least one data point') raise StatisticsError('pvariance requires at least one data point')
ss = _ss(data, mu) ss = _ss(data, mu)
return ss/n T, ss = _ss(data, mu)
return _convert(ss/n, T)
def stdev(data, xbar=None): def stdev(data, xbar=None):
......
This diff is collapsed.
...@@ -20,6 +20,10 @@ Core and Builtins ...@@ -20,6 +20,10 @@ Core and Builtins
Library Library
------- -------
- Issue #25177: Fixed problem with the mean of very small and very large
numbers. As a side effect, statistics.mean and statistics.variance should
be significantly faster.
- Issue #25718: Fixed copying object with state with boolean value is false. - Issue #25718: Fixed copying object with state with boolean value is false.
- Issue #10131: Fixed deep copying of minidom documents. Based on patch - Issue #10131: Fixed deep copying of minidom documents. Based on patch
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment