Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
40a841bc
Commit
40a841bc
authored
Dec 01, 2015
by
Steven D'Aprano
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fixed issue #25177, problems with the mean of very small and very large numbers.
parent
ee1a0e4b
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
433 additions
and
119 deletions
+433
-119
Lib/statistics.py
Lib/statistics.py
+114
-71
Lib/test/test_statistics.py
Lib/test/test_statistics.py
+315
-48
Misc/NEWS
Misc/NEWS
+4
-0
No files found.
Lib/statistics.py
View file @
40a841bc
...
@@ -104,6 +104,8 @@ import math
...
@@ -104,6 +104,8 @@ import math
from
fractions
import
Fraction
from
fractions
import
Fraction
from
decimal
import
Decimal
from
decimal
import
Decimal
from
itertools
import
groupby
# === Exceptions ===
# === Exceptions ===
...
@@ -115,86 +117,102 @@ class StatisticsError(ValueError):
...
@@ -115,86 +117,102 @@ class StatisticsError(ValueError):
# === Private utilities ===
# === Private utilities ===
def
_sum
(
data
,
start
=
0
):
def
_sum
(
data
,
start
=
0
):
"""_sum(data [, start]) -> value
"""_sum(data [, start]) -> (type, sum, count)
Return a high-precision sum of the given numeric data as a fraction,
together with the type to be converted to and the count of items.
Return a high-precision sum of the given numeric data. If optional
If optional argument ``start`` is given, it is added to the total.
argument ``start`` is given, it is added to the total. If ``data`` is
If ``data`` is empty, ``start`` (defaulting to 0) is returned.
empty, ``start`` (defaulting to 0) is returned.
Examples
Examples
--------
--------
>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
11.0
(<class 'float'>, Fraction(11, 1), 5)
Some sources of round-off error will be avoided:
Some sources of round-off error will be avoided:
>>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
>>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
1000.0
(<class 'float'>, Fraction(1000, 1), 3000)
Fractions and Decimals are also supported:
Fractions and Decimals are also supported:
>>> from fractions import Fraction as F
>>> from fractions import Fraction as F
>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Fraction(63, 20
)
(<class 'fractions.Fraction'>, Fraction(63, 20), 4
)
>>> from decimal import Decimal as D
>>> from decimal import Decimal as D
>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
>>> _sum(data)
>>> _sum(data)
Decimal('0.6963'
)
(<class 'decimal.Decimal'>, Fraction(6963, 10000), 4
)
Mixed types are currently treated as an error, except that int is
Mixed types are currently treated as an error, except that int is
allowed.
allowed.
"""
"""
# We fail as soon as we reach a value that is not an int or the type of
count
=
0
# the first value which is not an int. E.g. _sum([int, int, float, int])
# is okay, but sum([int, int, float, Fraction]) is not.
allowed_types
=
{
int
,
type
(
start
)}
n
,
d
=
_exact_ratio
(
start
)
n
,
d
=
_exact_ratio
(
start
)
partials
=
{
d
:
n
}
# map {denominator: sum of numerators}
partials
=
{
d
:
n
}
# Micro-optimizations.
exact_ratio
=
_exact_ratio
partials_get
=
partials
.
get
partials_get
=
partials
.
get
# Add numerators for each denominator.
T
=
_coerce
(
int
,
type
(
start
))
for
x
in
data
:
for
typ
,
values
in
groupby
(
data
,
type
):
_check_type
(
type
(
x
),
allowed_types
)
T
=
_coerce
(
T
,
typ
)
# or raise TypeError
n
,
d
=
exact_ratio
(
x
)
for
n
,
d
in
map
(
_exact_ratio
,
values
):
partials
[
d
]
=
partials_get
(
d
,
0
)
+
n
count
+=
1
# Find the expected result type. If allowed_types has only one item, it
partials
[
d
]
=
partials_get
(
d
,
0
)
+
n
# will be int; if it has two, use the one which isn't int.
assert
len
(
allowed_types
)
in
(
1
,
2
)
if
len
(
allowed_types
)
==
1
:
assert
allowed_types
.
pop
()
is
int
T
=
int
else
:
T
=
(
allowed_types
-
{
int
}).
pop
()
if
None
in
partials
:
if
None
in
partials
:
assert
issubclass
(
T
,
(
float
,
Decimal
))
# The sum will be a NAN or INF. We can ignore all the finite
assert
not
math
.
isfinite
(
partials
[
None
])
# partials, and just look at this special one.
return
T
(
partials
[
None
])
total
=
partials
[
None
]
total
=
Fraction
()
assert
not
_isfinite
(
total
)
for
d
,
n
in
sorted
(
partials
.
items
()):
else
:
total
+=
Fraction
(
n
,
d
)
# Sum all the partial sums using builtin sum.
if
issubclass
(
T
,
int
):
# FIXME is this faster if we sum them in order of the denominator?
assert
total
.
denominator
==
1
total
=
sum
(
Fraction
(
n
,
d
)
for
d
,
n
in
sorted
(
partials
.
items
()))
return
T
(
total
.
numerator
)
return
(
T
,
total
,
count
)
if
issubclass
(
T
,
Decimal
):
return
T
(
total
.
numerator
)
/
total
.
denominator
return
T
(
total
)
def
_isfinite
(
x
):
try
:
return
x
.
is_finite
()
# Likely a Decimal.
def
_check_type
(
T
,
allowed
):
except
AttributeError
:
if
T
not
in
allowed
:
return
math
.
isfinite
(
x
)
# Coerces to float first.
if
len
(
allowed
)
==
1
:
allowed
.
add
(
T
)
else
:
def
_coerce
(
T
,
S
):
types
=
', '
.
join
([
t
.
__name__
for
t
in
allowed
]
+
[
T
.
__name__
])
"""Coerce types T and S to a common type, or raise TypeError.
raise
TypeError
(
"unsupported mixed types: %s"
%
types
)
Coercion rules are currently an implementation detail. See the CoerceTest
test class in test_statistics for details.
"""
# See http://bugs.python.org/issue24068.
assert
T
is
not
bool
,
"initial type T is bool"
# If the types are the same, no need to coerce anything. Put this
# first, so that the usual case (no coercion needed) happens as soon
# as possible.
if
T
is
S
:
return
T
# Mixed int & other coerce to the other type.
if
S
is
int
or
S
is
bool
:
return
T
if
T
is
int
:
return
S
# If one is a (strict) subclass of the other, coerce to the subclass.
if
issubclass
(
S
,
T
):
return
S
if
issubclass
(
T
,
S
):
return
T
# Ints coerce to the other type.
if
issubclass
(
T
,
int
):
return
S
if
issubclass
(
S
,
int
):
return
T
# Mixed fraction & float coerces to float (or float subclass).
if
issubclass
(
T
,
Fraction
)
and
issubclass
(
S
,
float
):
return
S
if
issubclass
(
T
,
float
)
and
issubclass
(
S
,
Fraction
):
return
T
# Any other combination is disallowed.
msg
=
"don't know how to coerce %s and %s"
raise
TypeError
(
msg
%
(
T
.
__name__
,
S
.
__name__
))
def
_exact_ratio
(
x
):
def
_exact_ratio
(
x
):
"""
Convert Real number x exactly to
(numerator, denominator) pair.
"""
Return Real number x to exact
(numerator, denominator) pair.
>>> _exact_ratio(0.25)
>>> _exact_ratio(0.25)
(1, 4)
(1, 4)
...
@@ -202,29 +220,31 @@ def _exact_ratio(x):
...
@@ -202,29 +220,31 @@ def _exact_ratio(x):
x is expected to be an int, Fraction, Decimal or float.
x is expected to be an int, Fraction, Decimal or float.
"""
"""
try
:
try
:
# Optimise the common case of floats. We expect that the most often
# used numeric type will be builtin floats, so try to make this as
# fast as possible.
if
type
(
x
)
is
float
:
return
x
.
as_integer_ratio
()
try
:
try
:
#
int, Fraction
#
x may be an int, Fraction, or Integral ABC.
return
(
x
.
numerator
,
x
.
denominator
)
return
(
x
.
numerator
,
x
.
denominator
)
except
AttributeError
:
except
AttributeError
:
# float
try
:
try
:
# x may be a float subclass.
return
x
.
as_integer_ratio
()
return
x
.
as_integer_ratio
()
except
AttributeError
:
except
AttributeError
:
# Decimal
try
:
try
:
# x may be a Decimal.
return
_decimal_to_ratio
(
x
)
return
_decimal_to_ratio
(
x
)
except
AttributeError
:
except
AttributeError
:
msg
=
"can't convert type '{}' to numerator/denominator"
# Just give up?
raise
TypeError
(
msg
.
format
(
type
(
x
).
__name__
))
from
None
pass
except
(
OverflowError
,
ValueError
):
except
(
OverflowError
,
ValueError
):
# INF or NAN
# float NAN or INF.
if
__debug__
:
assert
not
math
.
isfinite
(
x
)
# Decimal signalling NANs cannot be converted to float :-(
if
isinstance
(
x
,
Decimal
):
assert
not
x
.
is_finite
()
else
:
assert
not
math
.
isfinite
(
x
)
return
(
x
,
None
)
return
(
x
,
None
)
msg
=
"can't convert type '{}' to numerator/denominator"
raise
TypeError
(
msg
.
format
(
type
(
x
).
__name__
))
# FIXME This is faster than Fraction.from_decimal, but still too slow.
# FIXME This is faster than Fraction.from_decimal, but still too slow.
...
@@ -239,7 +259,7 @@ def _decimal_to_ratio(d):
...
@@ -239,7 +259,7 @@ def _decimal_to_ratio(d):
sign
,
digits
,
exp
=
d
.
as_tuple
()
sign
,
digits
,
exp
=
d
.
as_tuple
()
if
exp
in
(
'F'
,
'n'
,
'N'
):
# INF, NAN, sNAN
if
exp
in
(
'F'
,
'n'
,
'N'
):
# INF, NAN, sNAN
assert
not
d
.
is_finite
()
assert
not
d
.
is_finite
()
r
aise
ValueError
r
eturn
(
d
,
None
)
num
=
0
num
=
0
for
digit
in
digits
:
for
digit
in
digits
:
num
=
num
*
10
+
digit
num
=
num
*
10
+
digit
...
@@ -253,6 +273,24 @@ def _decimal_to_ratio(d):
...
@@ -253,6 +273,24 @@ def _decimal_to_ratio(d):
return
(
num
,
den
)
return
(
num
,
den
)
def
_convert
(
value
,
T
):
"""Convert value to given numeric type T."""
if
type
(
value
)
is
T
:
# This covers the cases where T is Fraction, or where value is
# a NAN or INF (Decimal or float).
return
value
if
issubclass
(
T
,
int
)
and
value
.
denominator
!=
1
:
T
=
float
try
:
# FIXME: what do we do if this overflows?
return
T
(
value
)
except
TypeError
:
if
issubclass
(
T
,
Decimal
):
return
T
(
value
.
numerator
)
/
T
(
value
.
denominator
)
else
:
raise
def
_counts
(
data
):
def
_counts
(
data
):
# Generate a table of sorted (value, frequency) pairs.
# Generate a table of sorted (value, frequency) pairs.
table
=
collections
.
Counter
(
iter
(
data
)).
most_common
()
table
=
collections
.
Counter
(
iter
(
data
)).
most_common
()
...
@@ -290,7 +328,9 @@ def mean(data):
...
@@ -290,7 +328,9 @@ def mean(data):
n
=
len
(
data
)
n
=
len
(
data
)
if
n
<
1
:
if
n
<
1
:
raise
StatisticsError
(
'mean requires at least one data point'
)
raise
StatisticsError
(
'mean requires at least one data point'
)
return
_sum
(
data
)
/
n
T
,
total
,
count
=
_sum
(
data
)
assert
count
==
n
return
_convert
(
total
/
n
,
T
)
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
...
@@ -460,12 +500,14 @@ def _ss(data, c=None):
...
@@ -460,12 +500,14 @@ def _ss(data, c=None):
"""
"""
if
c
is
None
:
if
c
is
None
:
c
=
mean
(
data
)
c
=
mean
(
data
)
ss
=
_sum
((
x
-
c
)
**
2
for
x
in
data
)
T
,
total
,
count
=
_sum
((
x
-
c
)
**
2
for
x
in
data
)
# The following sum should mathematically equal zero, but due to rounding
# The following sum should mathematically equal zero, but due to rounding
# error may not.
# error may not.
ss
-=
_sum
((
x
-
c
)
for
x
in
data
)
**
2
/
len
(
data
)
U
,
total2
,
count2
=
_sum
((
x
-
c
)
for
x
in
data
)
assert
not
ss
<
0
,
'negative sum of square deviations: %f'
%
ss
assert
T
==
U
and
count
==
count2
return
ss
total
-=
total2
**
2
/
len
(
data
)
assert
not
total
<
0
,
'negative sum of square deviations: %f'
%
total
return
(
T
,
total
)
def
variance
(
data
,
xbar
=
None
):
def
variance
(
data
,
xbar
=
None
):
...
@@ -511,8 +553,8 @@ def variance(data, xbar=None):
...
@@ -511,8 +553,8 @@ def variance(data, xbar=None):
n
=
len
(
data
)
n
=
len
(
data
)
if
n
<
2
:
if
n
<
2
:
raise
StatisticsError
(
'variance requires at least two data points'
)
raise
StatisticsError
(
'variance requires at least two data points'
)
ss
=
_ss
(
data
,
xbar
)
T
,
ss
=
_ss
(
data
,
xbar
)
return
ss
/
(
n
-
1
)
return
_convert
(
ss
/
(
n
-
1
),
T
)
def
pvariance
(
data
,
mu
=
None
):
def
pvariance
(
data
,
mu
=
None
):
...
@@ -560,7 +602,8 @@ def pvariance(data, mu=None):
...
@@ -560,7 +602,8 @@ def pvariance(data, mu=None):
if
n
<
1
:
if
n
<
1
:
raise
StatisticsError
(
'pvariance requires at least one data point'
)
raise
StatisticsError
(
'pvariance requires at least one data point'
)
ss
=
_ss
(
data
,
mu
)
ss
=
_ss
(
data
,
mu
)
return
ss
/
n
T
,
ss
=
_ss
(
data
,
mu
)
return
_convert
(
ss
/
n
,
T
)
def
stdev
(
data
,
xbar
=
None
):
def
stdev
(
data
,
xbar
=
None
):
...
...
Lib/test/test_statistics.py
View file @
40a841bc
This diff is collapsed.
Click to expand it.
Misc/NEWS
View file @
40a841bc
...
@@ -20,6 +20,10 @@ Core and Builtins
...
@@ -20,6 +20,10 @@ Core and Builtins
Library
Library
-------
-------
- Issue #25177: Fixed problem with the mean of very small and very large
numbers. As a side effect, statistics.mean and statistics.variance should
be significantly faster.
- Issue #25718: Fixed copying object with state with boolean value is false.
- Issue #25718: Fixed copying object with state with boolean value is false.
- Issue #10131: Fixed deep copying of minidom documents. Based on patch
- Issue #10131: Fixed deep copying of minidom documents. Based on patch
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment