Commit 9e28a41b authored by Guido van Rossum

Tim Peters writes:

Attached is a cleaned-up version of ndiff (added useful module
docstring, now echo'ed in case of cmd line mistake); added -q option
to suppress initial file identification lines; + other minor cleanups,
& a slightly faster match engine.
parent 0d36e46e
#! /usr/bin/env python #! /usr/bin/env python
# Released to the public domain $JustDate: 3/16/98 $, # Module ndiff version 1.3.0
# by Tim Peters (email tim_one@email.msn.com). # Released to the public domain 26-Mar-1999,
# by Tim Peters (tim_one@email.msn.com).
# ndiff file1 file2 -- a human-friendly file differencer. # Provided as-is; use at your own risk; no warranty; no promises; enjoy!
# $Revision$ """ndiff [-q] file1 file2
Print a human-friendly file difference report to stdout. Both inter-
and intra-line differences are noted.
If -q ("quiet") is not specified, the first two lines of output are
-: file1
+: file2
Each remaining line begins with a two-letter code:
"- " line unique to file1
"+ " line unique to file2
"  " line common to both files
"? " line not present in either input file
Lines beginning with "? " attempt to guide the eye to intraline
differences, and were not present in either input file.
The first file can be recovered by retaining only lines that begin with
"  " or "- ", and deleting those 2-character prefixes.
The second file can be recovered similarly, but by retaining only "  "
and "+ " lines. On Unix, the second file can be recovered by piping the
output through
sed -n '/^[+ ] /s/^..//p'
Modifications to recover the first file are left as an exercise for
the reader.
See module comments for details and programmatic interface.
"""
__version__ = 1, 3, 0
# SequenceMatcher tries to compute a "human-friendly diff" between # SequenceMatcher tries to compute a "human-friendly diff" between
# two sequences (chiefly picturing a file as a sequence of lines, # two sequences (chiefly picturing a file as a sequence of lines,
# and a line as a sequence of characters, here). Unlike UNIX(tm) diff, # and a line as a sequence of characters, here). Unlike e.g. UNIX(tm)
# e.g., the fundamental notion is the longest *contiguous* & junk-free # diff, the fundamental notion is the longest *contiguous* & junk-free
# matching subsequence. That's what catches peoples' eyes. The # matching subsequence. That's what catches peoples' eyes. The
# Windows(tm) windiff has another interesting notion, pairing up elements # Windows(tm) windiff has another interesting notion, pairing up elements
# that appear uniquely in each sequence. That, and the method here, # that appear uniquely in each sequence. That, and the method here,
...@@ -26,11 +60,11 @@ ...@@ -26,11 +60,11 @@
# apart. Restricting synch points to contiguous matches preserves some # apart. Restricting synch points to contiguous matches preserves some
# notion of locality, at the occasional cost of producing a longer diff. # notion of locality, at the occasional cost of producing a longer diff.
# #
# With respect to junk, an earlier verion of ndiff simply refused to # With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element. The result was cases like this: # *start* a match with a junk element. The result was cases like this:
# before: private Thread currentThread; # before: private Thread currentThread;
# after: private volatile Thread currentThread; # after: private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest continguous match # If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread". So ndiff reported # not starting with junk is "e Thread currentThread". So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private". # that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd. The current version # While an accurate view, to people that's absurd. The current version
...@@ -40,23 +74,9 @@ ...@@ -40,23 +74,9 @@
# preceding blank; then "private" is matched, and extended to suck up the # preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports # following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread". The only quibble # that "volatile " was inserted before "Thread". The only quibble
# remaining is that perhaps it was really the case that " volative" # remaining is that perhaps it was really the case that " volatile"
# was inserted after "private". I can live with that <wink>. # was inserted after "private". I can live with that <wink>.
# #
# NOTE on the output: From an ndiff report,
# 1) The first file can be recovered by retaining only lines that begin
# with "  " or "- ", and deleting those 2-character prefixes.
# 2) The second file can be recovered similarly, but by retaining only
# "  " and "+ " lines.
# 3) Lines beginning with "? " attempt to guide the eye to intraline
# differences, and were not present in either input file.
#
# COROLLARY:
# On Unix, the second file can be recovered by piping the output through
# sed -n '/^[+ ] /s/^..//p'
# Modifications to recover the first file are left as an exercise for
# the reader.
#
# NOTE on junk: the module-level names # NOTE on junk: the module-level names
# IS_LINE_JUNK # IS_LINE_JUNK
# IS_CHARACTER_JUNK # IS_CHARACTER_JUNK
...@@ -70,8 +90,8 @@ ...@@ -70,8 +90,8 @@
# #
# After setting those, you can call fcompare(f1name, f2name) with the # After setting those, you can call fcompare(f1name, f2name) with the
# names of the files you want to compare. The difference report # names of the files you want to compare. The difference report
# is sent to stdout. Or you can call main(), which expects to find # is sent to stdout. Or you can call main(args), passing what would
# (exactly) the two file names in sys.argv. # have been in sys.argv[1:] had the cmd-line form been used.
import string import string
TRACE = 0 TRACE = 0
...@@ -148,7 +168,7 @@ class SequenceMatcher: ...@@ -148,7 +168,7 @@ class SequenceMatcher:
self.fullbcount = None self.fullbcount = None
self.__chain_b() self.__chain_b()
# for each element x in b, set b2j[x] to a list of the indices in # For each element x in b, set b2j[x] to a list of the indices in
# b where x appears; the indices are in increasing order; note that # b where x appears; the indices are in increasing order; note that
# the number of times x appears in b is len(b2j[x]) ... # the number of times x appears in b is len(b2j[x]) ...
# when self.isjunk is defined, junk elements don't show up in this # when self.isjunk is defined, junk elements don't show up in this
...@@ -173,7 +193,7 @@ class SequenceMatcher: ...@@ -173,7 +193,7 @@ class SequenceMatcher:
b = self.b b = self.b
self.b2j = b2j = {} self.b2j = b2j = {}
self.b2jhas = b2jhas = b2j.has_key self.b2jhas = b2jhas = b2j.has_key
for i in xrange(0, len(b)): for i in xrange(len(b)):
elt = b[i] elt = b[i]
if b2jhas(elt): if b2jhas(elt):
b2j[elt].append(i) b2j[elt].append(i)
...@@ -210,9 +230,9 @@ class SequenceMatcher: ...@@ -210,9 +230,9 @@ class SequenceMatcher:
k >= k' k >= k'
i <= i' i <= i'
and if i == i', j <= j' and if i == i', j <= j'
In other words, of all maximal matching blocks, returns one In other words, of all maximal matching blocks, return one
that starts earliest in a, and of all those maximal matching that starts earliest in a, and of all those maximal matching
blocks that start earliest in a, returns the one that starts blocks that start earliest in a, return the one that starts
earliest in b. earliest in b.
If isjunk is defined, first the longest matching block is If isjunk is defined, first the longest matching block is
...@@ -223,7 +243,7 @@ class SequenceMatcher: ...@@ -223,7 +243,7 @@ class SequenceMatcher:
as identical junk happens to be adjacent to an "interesting" as identical junk happens to be adjacent to an "interesting"
match. match.
If no blocks match, returns (alo, blo, 0). If no blocks match, return (alo, blo, 0).
""" """
# CAUTION: stripping common prefix or suffix would be incorrect. # CAUTION: stripping common prefix or suffix would be incorrect.
...@@ -238,40 +258,28 @@ class SequenceMatcher: ...@@ -238,40 +258,28 @@ class SequenceMatcher:
# Windiff ends up at the same place as diff, but by pairing up # Windiff ends up at the same place as diff, but by pairing up
# the unique 'b's and then matching the first two 'a's. # the unique 'b's and then matching the first two 'a's.
# find longest junk-free match
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
besti, bestj, bestsize = alo, blo, 0 besti, bestj, bestsize = alo, blo, 0
# find longest junk-free match
# during an iteration of the loop, j2len[j] = length of longest
# junk-free match ending with a[i-1] and b[j]
j2len = {}
nothing = []
for i in xrange(alo, ahi): for i in xrange(alo, ahi):
# check for longest match starting at a[i]
if i + bestsize >= ahi:
# we're too far right to get a new best
break
# look at all instances of a[i] in b; note that because # look at all instances of a[i] in b; note that because
# b2j has no junk keys, the loop is skipped if a[i] is junk # b2j has no junk keys, the loop is skipped if a[i] is junk
for j in b2j.get(a[i], []): j2lenget = j2len.get
newj2len = {}
for j in b2j.get(a[i], nothing):
# a[i] matches b[j] # a[i] matches b[j]
if j < blo: if j < blo:
continue continue
if j + bestsize >= bhi: if j >= bhi:
# we're too far right to get a new best, here or
# anywhere to the right
break break
if a[i + bestsize] != b[j + bestsize]: k = newj2len[j] = j2lenget(j-1, 0) + 1
# can't be longer match; this test is not necessary
# for correctness, but is a huge win for efficiency
continue
# set k to length of match
k = 1 # a[i] == b[j] already known
while i + k < ahi and j + k < bhi and \
a[i+k] == b[j+k] and not isbjunk(b[j+k]):
k = k + 1
if k > bestsize: if k > bestsize:
besti, bestj, bestsize = i, j, k besti, bestj, bestsize = i-k+1, j-k+1, k
if i + bestsize >= ahi: j2len = newj2len
# only time in my life I really wanted a
# labelled break <wink> -- we're done with
# both loops now
break
# Now that we have a wholly interesting match (albeit possibly # Now that we have a wholly interesting match (albeit possibly
# empty!), we may as well suck up the matching junk on each # empty!), we may as well suck up the matching junk on each
...@@ -294,101 +302,6 @@ class SequenceMatcher: ...@@ -294,101 +302,6 @@ class SequenceMatcher:
print " returns", besti, bestj, bestsize print " returns", besti, bestj, bestsize
return besti, bestj, bestsize return besti, bestj, bestsize
# A different implementation, using a binary doubling technique that
# does far fewer element compares (trades 'em for integer compares),
# and has n*lg n worst-case behavior. Alas, the code is much harder
# to follow (the details are tricky!), and in most cases I've seen,
# it takes at least 50% longer than the "clever dumb" method above;
# probably due to creating layers of small dicts.
# NOTE: this no longer matches the version above wrt junk; remains
# too unpromising to update it; someday, though ...
# def find_longest_match(self, alo, ahi, blo, bhi):
# """Find longest matching block in a[alo:ahi] and b[blo:bhi].
#
# Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
# alo <= i <= i+k <= ahi
# blo <= j <= j+k <= bhi
# and for all (i',j',k') meeting those conditions,
# k >= k'
# i <= i'
# and if i == i', j <= j'
# In other words, of all maximal matching blocks, returns one
# that starts earliest in a, and of all those maximal matching
# blocks that start earliest in a, returns the one that starts
# earliest in b.
#
# If no blocks match, returns (alo, blo, 0).
# """
#
# a, b2j = self.a, self.b2j
# # alljs[size][i] is a set of all j's s.t. a[i:i+len] matches
# # b[j:j+len]
# alljs = {}
# alljs[1] = js = {}
# ahits = {}
# for i in xrange(alo, ahi):
# elt = a[i]
# if ahits.has_key(elt):
# js[i] = ahits[elt]
# continue
# if b2j.has_key(elt):
# in_range = {}
# for j in b2j[elt]:
# if j >= blo:
# if j >= bhi:
# break
# in_range[j] = 1
# if in_range:
# ahits[elt] = js[i] = in_range
# del ahits
# size = 1
# while js:
# oldsize = size
# size = size + size
# oldjs = js
# alljs[size] = js = {}
# for i in oldjs.keys():
# # i has matches of size oldsize
# if not oldjs.has_key(i + oldsize):
# # can't double it
# continue
# second_js = oldjs[i + oldsize]
# answer = {}
# for j in oldjs[i].keys():
# if second_js.has_key(j + oldsize):
# answer[j] = 1
# if answer:
# js[i] = answer
# del alljs[size]
# size = size >> 1 # max power of 2 with a match
# if not size:
# return alo, blo, 0
# besti, bestj, bestsize = alo, blo, 0
# fatis = alljs[size].keys()
# fatis.sort()
# for i in fatis:
# # figure out longest match starting at a[i]
# totalsize = halfsize = size
# # i has matches of len totalsize at the indices in js
# js = alljs[size][i].keys()
# while halfsize > 1:
# halfsize = halfsize >> 1
# # is there a match of len halfsize starting at
# # i + totalsize?
# newjs = []
# if alljs[halfsize].has_key(i + totalsize):
# second_js = alljs[halfsize][i + totalsize]
# for j in js:
# if second_js.has_key(j + totalsize):
# newjs.append(j)
# if newjs:
# totalsize = totalsize + halfsize
# js = newjs
# if totalsize > bestsize:
# besti, bestj, bestsize = i, min(js), totalsize
# return besti, bestj, bestsize
def get_matching_blocks(self): def get_matching_blocks(self):
if self.matching_blocks is not None: if self.matching_blocks is not None:
return self.matching_blocks return self.matching_blocks
...@@ -621,7 +534,7 @@ def fopen(fname): ...@@ -621,7 +534,7 @@ def fopen(fname):
try: try:
return open(fname, 'r') return open(fname, 'r')
except IOError, detail: except IOError, detail:
print "couldn't open " + fname + ": " + `detail` print "couldn't open " + fname + ": " + str(detail)
return 0 return 0
# open two files & spray the diff to stdout; return false iff a problem # open two files & spray the diff to stdout; return false iff a problem
...@@ -649,24 +562,39 @@ def fcompare(f1name, f2name): ...@@ -649,24 +562,39 @@ def fcompare(f1name, f2name):
return 1 return 1
# get file names from argv & compare; return false iff a problem # crack args (sys.argv[1:] is normal) & compare;
def main(): # return false iff a problem
from sys import argv
if len(argv) != 3: def main(args):
import getopt
try:
opts, args = getopt.getopt(args, "q")
except getopt.error, detail:
print str(detail)
print __doc__
return 0
noisy = 1
for opt, val in opts:
if opt == "-q":
noisy = 0
if len(args) != 2:
print 'need 2 args' print 'need 2 args'
print __doc__
return 0 return 0
[f1name, f2name] = argv[1:3] f1name, f2name = args
print '-:', f1name if noisy:
print '+:', f2name print '-:', f1name
print '+:', f2name
return fcompare(f1name, f2name) return fcompare(f1name, f2name)
if __name__ == '__main__': if __name__ == '__main__':
import sys
args = sys.argv[1:]
if 1: if 1:
main() main(args)
else: else:
import profile, pstats import profile, pstats
statf = "ndiff.pro" statf = "ndiff.pro"
profile.run("main()", statf) profile.run("main(args)", statf)
stats = pstats.Stats(statf) stats = pstats.Stats(statf)
stats.strip_dirs().sort_stats('time').print_stats() stats.strip_dirs().sort_stats('time').print_stats()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment