Merge pull request #1 from IanBruene/differ

ianbruene · web-flow · commit ecda353ee369 · 2018-12-10T21:29:16.000-06:00
Added port of Differ class, and tests.
diff --git a/difflib/difflib.go b/difflib/difflib.go
@@ -18,9 +18,11 @@ package difflib
 import (
 	"bufio"
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
+	"unicode"
 )
 
 func min(a, b int) int {
@@ -44,6 +46,14 @@ func calculateRatio(matches, length int) float64 {
 	return 1.0
 }
 
+// listifyString splits str into a slice of single-character strings, one
+// element per Unicode code point, in order. An empty input yields an empty,
+// non-nil slice.
+func listifyString(str string) (lst []string) {
+	// Ranging over a string yields byte offsets, not rune counts, so the
+	// loop index cannot be used as the destination index: every multi-byte
+	// rune would leave empty elements behind it. Appending keeps the result
+	// dense. len(str) is an upper bound on the rune count, so the backing
+	// array never has to grow.
+	lst = make([]string, 0, len(str))
+	for _, c := range str {
+		lst = append(lst, string(c))
+	}
+	return lst
+}
+
 type Match struct {
 	A    int
 	B    int
@@ -511,6 +521,227 @@ func (m *SequenceMatcher) RealQuickRatio() float64 {
 	return calculateRatio(min(la, lb), la+lb)
 }
 
+// count_leading reports how many consecutive `ch` bytes `line` starts with.
+func count_leading(line string, ch byte) (count int) {
+	for pos := 0; pos < len(line); pos++ {
+		if line[pos] != ch {
+			// First byte that differs; everything before it matched.
+			return pos
+		}
+	}
+	// Every byte matched (this also covers the empty line).
+	return len(line)
+}
+
+// Differ compares sequences of lines and produces line-oriented deltas.
+type Differ struct {
+	// Linejunk is passed as the junk predicate to the line-level matcher
+	// built in Compare. NewDiffer leaves it nil.
+	Linejunk func(string) bool
+	// Charjunk is passed as the junk predicate to the character-level
+	// matcher built in FancyReplace. NewDiffer leaves it nil.
+	Charjunk func(string) bool
+}
+
+// NewDiffer returns a Differ with both junk predicates unset.
+func NewDiffer() *Differ {
+	return new(Differ)
+}
+
+// Compare compares two sequences of lines and returns the resulting delta.
+//
+// Each sequence should contain individual single-line strings ending with
+// newlines. The returned delta also consists of newline-terminated strings,
+// ready to be written out as-is.
+//
+// Blocks the matcher reports as replaced go through FancyReplace; deleted,
+// inserted and equal blocks are emitted by Dump with the tags "-", "+" and
+// " " respectively.
+//
+// An error is returned when FancyReplace fails or when the matcher yields an
+// opcode whose tag is not one of 'r', 'd', 'i' or 'e'; the returned slice is
+// nil in that case.
+func (d *Differ) Compare(a []string, b []string) (diffs []string, err error) {
+	diffs = []string{}
+	cruncher := NewMatcherWithJunk(a, b, true, d.Linejunk)
+	for _, current := range cruncher.GetOpCodes() {
+		var g []string
+		switch current.Tag {
+		case 'r':
+			g, err = d.FancyReplace(a, current.I1, current.I2, b, current.J1, current.J2)
+			if err != nil {
+				// Do not hand back a partial delta as if it were complete.
+				return nil, err
+			}
+		case 'd':
+			g = d.Dump("-", a, current.I1, current.I2)
+		case 'i':
+			g = d.Dump("+", b, current.J1, current.J2)
+		case 'e':
+			g = d.Dump(" ", a, current.I1, current.I2)
+		default:
+			return nil, fmt.Errorf("unknown tag %q", current.Tag)
+		}
+		diffs = append(diffs, g...)
+	}
+	return diffs, nil
+}
+
+// Dump renders the elements x[lo] .. x[hi-1] as delta lines, each prefixed
+// with tag followed by a single space. When lo >= hi the result is an empty,
+// non-nil slice.
+func (d *Differ) Dump(tag string, x []string, lo int, hi int) (out []string) {
+	out = []string{}
+	prefix := tag + " "
+	for idx := lo; idx < hi; idx++ {
+		out = append(out, prefix+x[idx])
+	}
+	return out
+}
+
+// PlainReplace emits a[alo:ahi] as "-" lines and b[blo:bhi] as "+" lines
+// without looking for similar lines. Both ranges must be non-empty;
+// otherwise a nil slice and an error are returned.
+func (d *Differ) PlainReplace(a []string, alo int, ahi int, b []string, blo int, bhi int) (out []string, err error) {
+	if alo >= ahi || blo >= bhi {
+		return nil, errors.New("low greater than or equal to high")
+	}
+	removed := d.Dump("-", a, alo, ahi)
+	added := d.Dump("+", b, blo, bhi)
+	// The shorter block goes first: that is easier on the reader's
+	// short-term memory when the two blocks differ a lot in size.
+	if bhi-blo < ahi-alo {
+		return append(added, removed...), nil
+	}
+	return append(removed, added...), nil
+}
+
+// FancyReplace emits the delta for replacing a[alo:ahi] with b[blo:bhi].
+//
+// It searches the two blocks for *similar* lines; the best-matching pair (if
+// any) is used as a synch point, and intraline difference marking is done on
+// that pair. The lines before and after the synch point are handled by
+// fancyHelper. Lots of work, but often worth it.
+//
+// When no pair of lines is similar enough and no pair is identical, the
+// whole replacement is delegated to PlainReplace and its result, including
+// its error for empty ranges, is returned unchanged. An error is also
+// returned when the character-level matcher yields an opcode with an unknown
+// tag; the returned slice is nil in that case.
+func (d *Differ) FancyReplace(a []string, alo int, ahi int, b []string, blo int, bhi int) (out []string, err error) {
+	// Don't synch up unless the lines have a similarity score of at least
+	// cutoff; bestRatio tracks the best score seen so far.
+	const cutoff = 0.75
+	bestRatio := 0.74
+
+	// The matcher only ever compares the characters of one line pair, and
+	// both of its sequences are set before every use below, so it starts
+	// out empty instead of indexing the line slices themselves.
+	cruncher := NewMatcherWithJunk(nil, nil, true, d.Charjunk)
+	eqi, eqj := -1, -1 // 1st indices of equal lines (if any)
+	out = []string{}
+
+	// Search for the pair that matches best without being identical
+	// (identical lines must be junk lines, & we don't want to synch up
+	// on junk -- unless we have to).
+	var bestI, bestJ int
+	for j := blo; j < bhi; j++ {
+		bj := b[j]
+		cruncher.SetSeq2(listifyString(bj))
+		for i := alo; i < ahi; i++ {
+			ai := a[i]
+			if ai == bj {
+				if eqi == -1 {
+					eqi, eqj = i, j
+				}
+				continue
+			}
+			cruncher.SetSeq1(listifyString(ai))
+			// Computing similarity is expensive, so rule candidates out
+			// with the two cheap upper bounds first; only pairs that pass
+			// both get the full ratio computed, and it is computed once.
+			if cruncher.RealQuickRatio() <= bestRatio ||
+				cruncher.QuickRatio() <= bestRatio {
+				continue
+			}
+			if ratio := cruncher.Ratio(); ratio > bestRatio {
+				bestRatio, bestI, bestJ = ratio, i, j
+			}
+		}
+	}
+	if bestRatio < cutoff {
+		// No non-identical "pretty close" pair.
+		if eqi == -1 {
+			// No identical pair either -- treat it as a straight replace
+			// and let the caller see any error it reports.
+			return d.PlainReplace(a, alo, ahi, b, blo, bhi)
+		}
+		// No close pair, but an identical pair -- synch up on that.
+		bestI, bestJ = eqi, eqj
+	} else {
+		// There's a close pair, so forget the identical pair (if any).
+		eqi = -1
+	}
+	// a[bestI] is very similar to b[bestJ]; eqi is -1 iff they are not
+	// identical.
+
+	// Pump out diffs from before the synch point.
+	out = append(out, d.fancyHelper(a, alo, bestI, b, blo, bestJ)...)
+
+	// Do intraline marking on the synch pair.
+	aelt, belt := a[bestI], b[bestJ]
+	if eqi == -1 {
+		// Pump out a '-', '?', '+', '?' quad for the synched lines.
+		var atags, btags string
+		cruncher.SetSeqs(listifyString(aelt), listifyString(belt))
+		for _, current := range cruncher.GetOpCodes() {
+			la, lb := current.I2-current.I1, current.J2-current.J1
+			switch current.Tag {
+			case 'r':
+				atags += strings.Repeat("^", la)
+				btags += strings.Repeat("^", lb)
+			case 'd':
+				atags += strings.Repeat("-", la)
+			case 'i':
+				btags += strings.Repeat("+", lb)
+			case 'e':
+				atags += strings.Repeat(" ", la)
+				btags += strings.Repeat(" ", lb)
+			default:
+				return nil, fmt.Errorf("unknown tag %q", current.Tag)
+			}
+		}
+		out = append(out, d.QFormat(aelt, belt, atags, btags)...)
+	} else {
+		// The synch pair is identical.
+		out = append(out, "  "+aelt)
+	}
+	// Pump out diffs from after the synch point.
+	out = append(out, d.fancyHelper(a, bestI+1, ahi, b, bestJ+1, bhi)...)
+	return out, nil
+}
+
+// fancyHelper emits the delta for the ranges on one side of a synch point.
+// If both ranges are non-empty it defers to FancyReplace; if only one is
+// non-empty that range is dumped as pure deletions or pure insertions; if
+// both are empty the result is an empty, non-nil slice.
+func (d *Differ) fancyHelper(a []string, alo int, ahi int, b []string, blo int, bhi int) (out []string) {
+	haveA, haveB := alo < ahi, blo < bhi
+	switch {
+	case haveA && haveB:
+		// This helper has no error result, so an error from FancyReplace
+		// is dropped here and only its slice is passed on.
+		out, _ = d.FancyReplace(a, alo, ahi, b, blo, bhi)
+	case haveA:
+		out = d.Dump("-", a, alo, ahi)
+	case haveB:
+		out = d.Dump("+", b, blo, bhi)
+	default:
+		out = []string{}
+	}
+	return out
+}
+
+// QFormat formats a "-" / "?" / "+" / "?" group for a pair of similar lines
+// and deals with leading tabs.
+//
+// aline and bline are the two lines; atags and btags are the marker strings
+// to show underneath them. Leading tabs shared by both lines replace the
+// same number of leading spaces in each marker string, provided both marker
+// strings really start with that many spaces, so that the markers stay under
+// the characters they refer to. Trailing whitespace is trimmed from the
+// marker strings, and a "?" line is left out entirely when nothing remains.
+//
+// Can hurt, but will probably help most of the time.
+func (d *Differ) QFormat(aline string, bline string, atags string, btags string) (out []string) {
+	common := min(count_leading(aline, '\t'), count_leading(bline, '\t'))
+	// Bounding common by each marker string's full run of leading spaces
+	// gives the same value as inspecting only its first `common` bytes, but
+	// never slices past the end of a marker string shorter than the tab run.
+	common = min(common, count_leading(atags, ' '))
+	common = min(common, count_leading(btags, ' '))
+	atags = strings.TrimRightFunc(atags[common:], unicode.IsSpace)
+	btags = strings.TrimRightFunc(btags[common:], unicode.IsSpace)
+
+	out = []string{"- " + aline}
+	if len(atags) > 0 {
+		out = append(out, fmt.Sprintf("? %s%s\n",
+			strings.Repeat("\t", common), atags))
+	}
+	out = append(out, "+ "+bline)
+	if len(btags) > 0 {
+		out = append(out, fmt.Sprintf("? %s%s\n",
+			strings.Repeat("\t", common), btags))
+	}
+	return out
+}
+
 // Convert range to the "ed" format
 func formatRangeUnified(start, stop int) string {
 	// Per the diff spec at http://www.unix.org/single_unix_specification/
diff --git a/difflib/difflib_test.go b/difflib/difflib_test.go
@@ -30,6 +30,13 @@ func splitChars(s string) []string {
 	return chars
 }
 
+// TestListifyString checks that listifyString splits an ASCII string into
+// one-character strings. The character after "Test" must not be a lowercase
+// letter, otherwise `go test` does not run the function.
+func TestListifyString(t *testing.T) {
+	lst := listifyString("qwerty")
+	want := []string{"q", "w", "e", "r", "t", "y"}
+	if !reflect.DeepEqual(lst, want) {
+		t.Fatal("listifyString failure:", lst)
+	}
+}
+
 func TestSequenceMatcherRatio(t *testing.T) {
 	s := NewMatcher(splitChars("abcd"), splitChars("bcde"))
 	assertEqual(t, s.Ratio(), 0.75)
@@ -424,3 +431,119 @@ func BenchmarkSplitLines100(b *testing.B) {
 func BenchmarkSplitLines10000(b *testing.B) {
 	benchmarkSplitLines(b, 10000)
 }
+
+// TestDifferCompare runs Compare on two short line lists containing one
+// changed line and one inserted line, and checks the complete delta.
+func TestDifferCompare(t *testing.T) {
+	before := []string{"foo\n", "bar\n", "baz\n"}
+	after := []string{"foo\n", "bar1\n", "asdf\n", "baz\n"}
+	want := []string{
+		"  foo\n",
+		"- bar\n",
+		"+ bar1\n",
+		"?    +\n",
+		"+ asdf\n",
+		"  baz\n",
+	}
+
+	out, err := NewDiffer().Compare(before, after)
+	if err != nil {
+		t.Fatal("Differ Compare() error:", err)
+	}
+	if !reflect.DeepEqual(out, want) {
+		t.Fatal("Differ Compare failure:", out)
+	}
+}
+
+// TestDifferDump checks that Dump tags exactly the elements in [lo, hi).
+func TestDifferDump(t *testing.T) {
+	lines := []string{"foo", "bar", "baz", "quux", "qwerty"}
+	want := []string{"+ bar", "+ baz"}
+
+	out := NewDiffer().Dump("+", lines, 1, 3)
+	if !reflect.DeepEqual(out, want) {
+		t.Fatal("Differ Dump() failure:", out)
+	}
+}
+
+// TestDifferPlainReplace checks both output orders of PlainReplace: the
+// "-" block first when the b range is not shorter than the a range, and the
+// "+" block first when it is.
+func TestDifferPlainReplace(t *testing.T) {
+	diff := NewDiffer()
+	aLst := []string{"one\n", "two\n", "three\n", "four\n", "five\n"}
+	bLst := []string{"one\n", "two2\n", "three\n", "extra\n"}
+
+	cases := []struct {
+		alo, ahi, blo, bhi int
+		want               []string
+	}{
+		// a then b
+		{1, 2, 1, 2, []string{"- two\n", "+ two2\n"}},
+		// b then a
+		{3, 5, 3, 4, []string{"+ extra\n", "- four\n", "- five\n"}},
+	}
+	for _, tc := range cases {
+		out, err := diff.PlainReplace(aLst, tc.alo, tc.ahi, bLst, tc.blo, tc.bhi)
+		if err != nil {
+			t.Fatal("Differ PlainReplace() error:", err)
+		}
+		if !reflect.DeepEqual(out, tc.want) {
+			t.Fatal("Differ PlainReplace() failure:", out)
+		}
+	}
+}
+
+// TestDifferFancyReplaceAndHelper drives FancyReplace (and, through it,
+// fancyHelper) down its three paths: synching on an identical pair, synching
+// on a close pair, and falling back to a plain replace.
+func TestDifferFancyReplaceAndHelper(t *testing.T) {
+	diff := NewDiffer()
+	cases := []struct {
+		a, b               []string
+		alo, ahi, blo, bhi int
+		want               []string
+	}{
+		// identical sync point, both full
+		{
+			a:   []string{"one\n", "asdf\n", "three\n"},
+			b:   []string{"one\n", "two2\n", "three\n"},
+			alo: 0, ahi: 3, blo: 0, bhi: 3,
+			want: []string{"  one\n", "- asdf\n", "+ two2\n", "  three\n"},
+		},
+		// close sync point, both full
+		{
+			a:   []string{"one\n", "two123456\n", "asdf\n", "three\n"},
+			b:   []string{"one\n", "two123457\n", "qwerty\n", "three\n"},
+			alo: 1, ahi: 3, blo: 1, bhi: 3,
+			want: []string{
+				"- two123456\n",
+				"?         ^\n",
+				"+ two123457\n",
+				"?         ^\n",
+				"- asdf\n",
+				"+ qwerty\n",
+			},
+		},
+		// no identical, no close
+		{
+			a:   []string{"one\n", "asdf\n", "three\n"},
+			b:   []string{"one\n", "qwerty\n", "three\n"},
+			alo: 1, ahi: 2, blo: 1, bhi: 2,
+			want: []string{
+				"- asdf\n",
+				"+ qwerty\n",
+			},
+		},
+	}
+	for _, tc := range cases {
+		out, err := diff.FancyReplace(tc.a, tc.alo, tc.ahi, tc.b, tc.blo, tc.bhi)
+		if err != nil {
+			t.Fatal("Differ FancyReplace() error:", err)
+		}
+		if !reflect.DeepEqual(out, tc.want) {
+			t.Fatal("Differ FancyReplace() failure:", out)
+		}
+	}
+}
+
+// TestDifferQFormat checks that QFormat replaces the leading space of each
+// marker string with the tab shared by both lines.
+func TestDifferQFormat(t *testing.T) {
+	const (
+		aStr = "\tfoo2bar\n"
+		aTag = "    ^  ^"
+		bStr = "\tfoo3baz\n"
+		bTag = "    ^  ^"
+	)
+	want := []string{
+		"- \tfoo2bar\n",
+		"? \t   ^  ^\n",
+		"+ \tfoo3baz\n",
+		"? \t   ^  ^\n",
+	}
+
+	out := NewDiffer().QFormat(aStr, bStr, aTag, bTag)
+	if !reflect.DeepEqual(out, want) {
+		t.Fatal("Differ QFormat() failure:", out)
+	}
+}