bpo-35892: Fix mode() and add multimode() (#12089)

fc06a192 · Raymond Hettinger · GitHub · 3e936431 · fc06a192 · fc06a192
Commit fc06a192 authored Mar 12, 2019 by Raymond Hettinger Committed by GitHub Mar 12, 2019
4 changed files
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -37,7 +37,7 @@ Averages and measures of central location
 These functions calculate an average or typical value from a population
 or sample.
-=======================  =============================================
+=======================  ===============================================================
 :func:`mean`             Arithmetic mean ("average") of data.
 :func:`fmean`            Fast, floating point arithmetic mean.
 :func:`harmonic_mean`    Harmonic mean of data.
@@ -45,8 +45,9 @@ or sample.
 :func:`median_low`       Low median of data.
 :func:`median_high`      High median of data.
 :func:`median_grouped`   Median, or 50th percentile, of grouped data.
-:func:`mode`             Mode (most common value) of discrete data.
+:func:`mode`             Single mode (most common value) of discrete or nominal data.
-=======================  =============================================
+:func:`multimode`        List of modes (most common values) of discrete or nominal data.
+=======================  ===============================================================
 Measures of spread
 ------------------
@@ -287,12 +288,12 @@ However, for reading convenience, most of the examples show sorted sequences.
 .. function:: mode(data)
-   Return the most common data point from discrete or nominal *data*.  The mode
+   Return the single most common data point from discrete or nominal *data*.
-   (when it exists) is the most typical value, and is a robust measure of
+   The mode (when it exists) is the most typical value and serves as a
-   central location.
+   measure of central location.
-   If *data* is empty, or if there is not exactly one most common value,
+   If there are multiple modes, returns the first one encountered in the *data*.
-   :exc:`StatisticsError` is raised.
+   If *data* is empty, :exc:`StatisticsError` is raised.
   ``mode`` assumes discrete data, and returns a single value. This is the
   standard treatment of the mode as commonly taught in schools:
@@ -310,6 +311,27 @@ However, for reading convenience, most of the examples show sorted sequences.
      >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
      'red'
+   .. versionchanged:: 3.8
+      Now handles multimodal datasets by returning the first mode encountered.
+      Formerly, it raised :exc:`StatisticsError` when more than one mode was
+      found.
+.. function:: multimode(data)
+   Return a list of the most frequently occurring values in the order they
+   were first encountered in the *data*.  Will return more than one result if
+   there are multiple modes or an empty list if the *data* is empty:
+   .. doctest::
+        >>> multimode('aabbbbccddddeeffffgg')
+        ['b', 'd', 'f']
+        >>> multimode('')
+        []
+   .. versionadded:: 3.8
 .. function:: pstdev(data, mu=None)

--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -282,6 +282,9 @@ Added :func:`statistics.fmean` as a faster, floating point variant of
 :func:`statistics.mean()`.  (Contributed by Raymond Hettinger and
 Steven D'Aprano in :issue:`35904`.)
+Added :func:`statistics.multimode` that returns a list of the most
+common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
 Added :class:`statistics.NormalDist`, a tool for creating
 and manipulating normal distributions of a random variable.
 (Contributed by Raymond Hettinger in :issue:`36018`.)
@@ -591,6 +594,11 @@ Changes in the Python API
 * The function :func:`platform.popen` has been removed, it was deprecated since
  Python 3.3: use :func:`os.popen` instead.
+* The :func:`statistics.mode` function no longer raises an exception
+  when given multimodal data.  Instead, it returns the first mode
+  encountered in the input data.  (Contributed by Raymond Hettinger
+  in :issue:`35892`.)
 * The :meth:`~tkinter.ttk.Treeview.selection` method of the
  :class:`tkinter.ttk.Treeview` class no longer takes arguments.  Using it with
  arguments for changing the selection was deprecated in Python 3.6.  Use

--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -17,6 +17,7 @@ median_low          Low median of data.
 median_high         High median of data.
 median_grouped      Median, or 50th percentile, of grouped data.
 mode                Mode (most common value) of data.
+multimode           List of modes (most common values) of data.
 ==================  =============================================
 Calculate the arithmetic mean ("the average") of data:
@@ -79,10 +80,9 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
 __all__ = [ 'StatisticsError', 'NormalDist',
            'pstdev', 'pvariance', 'stdev', 'variance',
            'median',  'median_low', 'median_high', 'median_grouped',
-            'mean', 'mode', 'harmonic_mean', 'fmean',
+            'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
          ]
-import collections
 import math
 import numbers
 import random
@@ -92,8 +92,8 @@ from decimal import Decimal
 from itertools import groupby
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
+from operator import itemgetter
+from collections import Counter
 # === Exceptions ===
@@ -249,20 +249,6 @@ def _convert(value, T):
            raise
-def _counts(data):
-    # Generate a table of sorted (value, frequency) pairs.
-    table = collections.Counter(iter(data)).most_common()
-    if not table:
-        return table
-    # Extract the values with the highest frequency.
-    maxfreq = table[0][1]
-    for i in range(1, len(table)):
-        if table[i][1] != maxfreq:
-            table = table[:i]
-            break
-    return table
 def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
@@ -334,9 +320,9 @@ def fmean(data):
            nonlocal n
            n += 1
            return x
-        total = math.fsum(map(count, data))
+        total = fsum(map(count, data))
    else:
-        total = math.fsum(data)
+        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
@@ -523,19 +509,38 @@ def mode(data):
    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'
-    If there is not exactly one most common value, ``mode`` will raise
+    If there are multiple modes, return the first one encountered.
-    StatisticsError.
+        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
+        'red'
+    If *data* is empty, ``mode`` raises StatisticsError.
    """
-    # Generate a table of sorted (value, frequency) pairs.
+    data = iter(data)
-    table = _counts(data)
+    try:
-    if len(table) == 1:
+        return Counter(data).most_common(1)[0][0]
-        return table[0][0]
+    except IndexError:
-    elif table:
+        raise StatisticsError('no mode for empty data') from None
-        raise StatisticsError(
-                'no unique mode; found %d equally common values' % len(table)
-                )
+def multimode(data):
-    else:
+    """ Return a list of the most frequently occurring values.
-        raise StatisticsError('no mode for empty data')
+        Will return more than one result if there are multiple modes
+        or an empty list if *data* is empty.
+        >>> multimode('aabbbbbbbbcc')
+        ['b']
+        >>> multimode('aabbbbccddddeeffffgg')
+        ['b', 'd', 'f']
+        >>> multimode('')
+        []
+    """
+    counts = Counter(iter(data)).most_common()
+    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
+    return list(map(itemgetter(0), mode_items))
 # === Measures of spread ===
@@ -836,6 +841,7 @@ if __name__ == '__main__':
    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
+    import doctest
    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)
@@ -893,3 +899,5 @@ if __name__ == '__main__':
    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)
+    print(doctest.testmod())
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -1769,7 +1769,7 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
    def test_range_data(self):
        # Override test from UnivariateCommonMixin.
        data = range(20, 50, 3)
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        self.assertEqual(self.func(data), 20)
    def test_nominal_data(self):
        # Test mode with nominal data.
@@ -1790,13 +1790,14 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
        # Test mode with bimodal data.
        data = [1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 9, 9]
        assert data.count(2) == data.count(6) == 4
-        # Check for an exception.
+        # mode() should return 2, the first encountered mode
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        self.assertEqual(self.func(data), 2)
-    def test_unique_data_failure(self):
+    def test_unique_data(self):
-        # Test mode exception when data points are all unique.
+        # Test mode when data points are all unique.
        data = list(range(10))
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        # mode() should return 0, the first encountered mode
+        self.assertEqual(self.func(data), 0)
    def test_none_data(self):
        # Test that mode raises TypeError if given None as data.
@@ -1809,8 +1810,18 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
        # Test that a Counter is treated like any other iterable.
        data = collections.Counter([1, 1, 1, 2])
        # Since the keys of the counter are treated as data points, not the
-        # counts, this should raise.
+        # counts, this should return the first mode encountered, 1
-        self.assertRaises(statistics.StatisticsError, self.func, data)
+        self.assertEqual(self.func(data), 1)
+class TestMultiMode(unittest.TestCase):
+    def test_basics(self):
+        multimode = statistics.multimode
+        self.assertEqual(multimode('aabbbbbbbbcc'), ['b'])
+        self.assertEqual(multimode('aabbbbccddddeeffffgg'), ['b', 'd', 'f'])
+        self.assertEqual(multimode(''), [])
 class TestFMean(unittest.TestCase):