Commit 00166c55 authored by Raymond Hettinger's avatar Raymond Hettinger

Add merge() function to heapq.

parent d6fc72a5
...@@ -88,7 +88,18 @@ True ...@@ -88,7 +88,18 @@ True
>>> >>>
\end{verbatim} \end{verbatim}
The module also offers two general purpose functions based on heaps. The module also offers three general purpose functions based on heaps.
\begin{funcdesc}{merge}{*iterables}
Merge multiple sorted inputs into a single sorted output (for example, merge
timestamped entries from multiple log files). Returns an iterator over
over the sorted values.
Similar to \code{sorted(itertools.chain(*iterables))} but returns an iterable,
does not pull the data into memory all at once, and reduces the number of
comparisons by assuming that each of the input streams is already sorted.
\versionadded{2.6}
\end{funcdesc}
\begin{funcdesc}{nlargest}{n, iterable\optional{, key}} \begin{funcdesc}{nlargest}{n, iterable\optional{, key}}
Return a list with the \var{n} largest elements from the dataset defined Return a list with the \var{n} largest elements from the dataset defined
...@@ -110,7 +121,7 @@ Equivalent to: \samp{sorted(iterable, key=key)[:n]} ...@@ -110,7 +121,7 @@ Equivalent to: \samp{sorted(iterable, key=key)[:n]}
\versionchanged[Added the optional \var{key} argument]{2.5} \versionchanged[Added the optional \var{key} argument]{2.5}
\end{funcdesc} \end{funcdesc}
Both functions perform best for smaller values of \var{n}. For larger The latter two functions perform best for smaller values of \var{n}. For larger
values, it is more efficient to use the \function{sorted()} function. Also, values, it is more efficient to use the \function{sorted()} function. Also,
when \code{n==1}, it is more efficient to use the builtin \function{min()} when \code{n==1}, it is more efficient to use the builtin \function{min()}
and \function{max()} functions. and \function{max()} functions.
......
...@@ -126,8 +126,8 @@ Believe me, real good tape sorts were quite spectacular to watch! ...@@ -126,8 +126,8 @@ Believe me, real good tape sorts were quite spectacular to watch!
From all times, sorting has always been a Great Art! :-) From all times, sorting has always been a Great Art! :-)
""" """
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'nlargest', __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nsmallest'] 'nlargest', 'nsmallest']
from itertools import islice, repeat, count, imap, izip, tee from itertools import islice, repeat, count, imap, izip, tee
from operator import itemgetter, neg from operator import itemgetter, neg
...@@ -308,6 +308,41 @@ try: ...@@ -308,6 +308,41 @@ try:
except ImportError: except ImportError:
pass pass
def merge(*iterables):
'''Merge multiple sorted inputs into a single sorted output.
Similar to sorted(itertools.chain(*iterables)) but returns an iterable,
does not pull the data into memory all at once, and reduces the number
of comparisons by assuming that each of the input streams is already sorted.
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
'''
_heappop, siftup, _StopIteration = heappop, _siftup, StopIteration
h = []
h_append = h.append
for it in map(iter, iterables):
try:
next = it.next
h_append([next(), next])
except _StopIteration:
pass
heapify(h)
while 1:
try:
while 1:
v, next = s = h[0] # raises IndexError when h is empty
yield v
s[0] = next() # raises StopIteration when exhausted
siftup(h, 0) # restore heap condition
except _StopIteration:
_heappop(h) # remove empty iterator
except IndexError:
return
# Extend the implementations of nsmallest and nlargest to use a key= argument # Extend the implementations of nsmallest and nlargest to use a key= argument
_nsmallest = nsmallest _nsmallest = nsmallest
def nsmallest(n, iterable, key=None): def nsmallest(n, iterable, key=None):
...@@ -341,3 +376,6 @@ if __name__ == "__main__": ...@@ -341,3 +376,6 @@ if __name__ == "__main__":
while heap: while heap:
sort.append(heappop(heap)) sort.append(heappop(heap))
print sort print sort
import doctest
doctest.testmod()
"""Unittests for heapq.""" """Unittests for heapq."""
from heapq import heappush, heappop, heapify, heapreplace, nlargest, nsmallest from heapq import heappush, heappop, heapify, heapreplace, merge, nlargest, nsmallest
import random import random
import unittest import unittest
from test import test_support from test import test_support
...@@ -103,6 +103,14 @@ class TestHeap(unittest.TestCase): ...@@ -103,6 +103,14 @@ class TestHeap(unittest.TestCase):
heap_sorted = [heappop(heap) for i in range(size)] heap_sorted = [heappop(heap) for i in range(size)]
self.assertEqual(heap_sorted, sorted(data)) self.assertEqual(heap_sorted, sorted(data))
def test_merge(self):
inputs = []
for i in xrange(random.randrange(5)):
row = sorted(random.randrange(1000) for j in range(random.randrange(10)))
inputs.append(row)
self.assertEqual(sorted(chain(*inputs)), list(merge(*inputs)))
self.assertEqual(list(merge()), [])
def test_nsmallest(self): def test_nsmallest(self):
data = [(random.randrange(2000), i) for i in range(1000)] data = [(random.randrange(2000), i) for i in range(1000)]
for f in (None, lambda x: x[0] * 547 % 2000): for f in (None, lambda x: x[0] * 547 % 2000):
......
...@@ -128,6 +128,8 @@ Core and builtins ...@@ -128,6 +128,8 @@ Core and builtins
Library Library
------- -------
- Added heapq.merge() for merging sorted input streams.
- Have the encoding package's search function dynamically import using absolute - Have the encoding package's search function dynamically import using absolute
import semantics. import semantics.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment