From a0505d8e3d2cae794e2c935a304dd9d8e96ccfa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Thu, 15 Feb 2024 20:42:10 +0100 Subject: [PATCH] add unified diff lib --- .../processor/builtin/internal/diff/README.md | 16 + .../processor/builtin/internal/diff/diff.go | 176 +++++++ .../builtin/internal/diff/diff_test.go | 207 ++++++++ .../internal/diff/difftest/difftest.go | 324 ++++++++++++ .../internal/diff/difftest/difftest_test.go | 82 +++ .../builtin/internal/diff/export_test.go | 9 + .../builtin/internal/diff/lcs/common.go | 179 +++++++ .../builtin/internal/diff/lcs/common_test.go | 140 +++++ .../builtin/internal/diff/lcs/doc.go | 156 ++++++ .../builtin/internal/diff/lcs/git.sh | 33 ++ .../builtin/internal/diff/lcs/labels.go | 55 ++ .../builtin/internal/diff/lcs/old.go | 480 ++++++++++++++++++ .../builtin/internal/diff/lcs/old_test.go | 251 +++++++++ .../builtin/internal/diff/lcs/sequence.go | 113 +++++ .../builtin/internal/diff/myers/diff.go | 246 +++++++++ .../builtin/internal/diff/myers/diff_test.go | 16 + .../processor/builtin/internal/diff/ndiff.go | 99 ++++ .../builtin/internal/diff/testenv/testenv.go | 199 ++++++++ .../builtin/internal/diff/unified.go | 251 +++++++++ 19 files changed, 3032 insertions(+) create mode 100644 pkg/plugin/processor/builtin/internal/diff/README.md create mode 100644 pkg/plugin/processor/builtin/internal/diff/diff.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/diff_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/difftest/difftest.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/difftest/difftest_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/export_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/common.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/common_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/doc.go create mode 100644 
pkg/plugin/processor/builtin/internal/diff/lcs/git.sh create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/labels.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/old.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/old_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/lcs/sequence.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/myers/diff.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/myers/diff_test.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/ndiff.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/testenv/testenv.go create mode 100644 pkg/plugin/processor/builtin/internal/diff/unified.go diff --git a/pkg/plugin/processor/builtin/internal/diff/README.md b/pkg/plugin/processor/builtin/internal/diff/README.md new file mode 100644 index 000000000..09985b6c8 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/README.md @@ -0,0 +1,16 @@ +# Diff + +This package contains code taken from https://github.com/golang/tools/tree/master/internal/diff +on February 15th, 2024. We need the code to create a unified diff between two strings. + +The code is left as-is, except two changes: + +- The imports were changed to reference the Conduit module path. This was done + using the following command: + + ```sh + find . -type f -exec sed -i '' 's/golang.org\/x\/tools\/internal/github.com\/conduitio\/conduit\/pkg\/plugin\/processor\/builtin\/internal/g' {} + + ``` + +- The package `golang.org/x/tools/internal/testenv` was added into the `diff` package, + as that's the only place it's used. It also only includes the required functions. diff --git a/pkg/plugin/processor/builtin/internal/diff/diff.go b/pkg/plugin/processor/builtin/internal/diff/diff.go new file mode 100644 index 000000000..a13547b7a --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/diff.go @@ -0,0 +1,176 @@ +// Copyright 2019 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package diff computes differences between text files or strings. +package diff + +import ( + "fmt" + "sort" + "strings" +) + +// An Edit describes the replacement of a portion of a text file. +type Edit struct { + Start, End int // byte offsets of the region to replace + New string // the replacement +} + +func (e Edit) String() string { + return fmt.Sprintf("{Start:%d,End:%d,New:%q}", e.Start, e.End, e.New) +} + +// Apply applies a sequence of edits to the src buffer and returns the +// result. Edits are applied in order of start offset; edits with the +// same start offset are applied in the order they were provided. +// +// Apply returns an error if any edit is out of bounds, +// or if any pair of edits is overlapping. +func Apply(src string, edits []Edit) (string, error) { + edits, size, err := validate(src, edits) + if err != nil { + return "", err + } + + // Apply edits. + out := make([]byte, 0, size) + lastEnd := 0 + for _, edit := range edits { + if lastEnd < edit.Start { + out = append(out, src[lastEnd:edit.Start]...) + } + out = append(out, edit.New...) + lastEnd = edit.End + } + out = append(out, src[lastEnd:]...) + + if len(out) != size { + panic("wrong size") + } + + return string(out), nil +} + +// ApplyBytes is like Apply, but it accepts a byte slice. +// The result is always a new array. +func ApplyBytes(src []byte, edits []Edit) ([]byte, error) { + res, err := Apply(string(src), edits) + return []byte(res), err +} + +// validate checks that edits are consistent with src, +// and returns the size of the patched output. +// It may return a different slice. +func validate(src string, edits []Edit) ([]Edit, int, error) { + if !sort.IsSorted(editsSort(edits)) { + edits = append([]Edit(nil), edits...) + SortEdits(edits) + } + + // Check validity of edits and compute final size.
+ size := len(src) + lastEnd := 0 + for _, edit := range edits { + if !(0 <= edit.Start && edit.Start <= edit.End && edit.End <= len(src)) { + return nil, 0, fmt.Errorf("diff has out-of-bounds edits") + } + if edit.Start < lastEnd { + return nil, 0, fmt.Errorf("diff has overlapping edits") + } + size += len(edit.New) + edit.Start - edit.End + lastEnd = edit.End + } + + return edits, size, nil +} + +// SortEdits orders a slice of Edits by (start, end) offset. +// This ordering puts insertions (end = start) before deletions +// (end > start) at the same point, but uses a stable sort to preserve +// the order of multiple insertions at the same point. +// (Apply detects multiple deletions at the same point as an error.) +func SortEdits(edits []Edit) { + sort.Stable(editsSort(edits)) +} + +type editsSort []Edit + +func (a editsSort) Len() int { return len(a) } +func (a editsSort) Less(i, j int) bool { + if cmp := a[i].Start - a[j].Start; cmp != 0 { + return cmp < 0 + } + return a[i].End < a[j].End +} +func (a editsSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] } + +// lineEdits expands and merges a sequence of edits so that each +// resulting edit replaces one or more complete lines. +// See Apply for preconditions. +func lineEdits(src string, edits []Edit) ([]Edit, error) { + edits, _, err := validate(src, edits) + if err != nil { + return nil, err + } + + // Do all deletions begin and end at the start of a line, + // and all insertions end with a newline? + // (This is merely a fast path.)
+ for _, edit := range edits { + if edit.Start >= len(src) || // insertion at EOF + edit.Start > 0 && src[edit.Start-1] != '\n' || // not at line start + edit.End > 0 && src[edit.End-1] != '\n' || // not at line start + edit.New != "" && edit.New[len(edit.New)-1] != '\n' { // partial insert + goto expand // slow path + } + } + return edits, nil // aligned + +expand: + if len(edits) == 0 { + return edits, nil // no edits (unreachable due to fast path) + } + expanded := make([]Edit, 0, len(edits)) // a guess + prev := edits[0] + // TODO(adonovan): opt: start from the first misaligned edit. + // TODO(adonovan): opt: avoid quadratic cost of string += string. + for _, edit := range edits[1:] { + between := src[prev.End:edit.Start] + if !strings.Contains(between, "\n") { + // overlapping lines: combine with previous edit. + prev.New += between + edit.New + prev.End = edit.End + } else { + // non-overlapping lines: flush previous edit. + expanded = append(expanded, expandEdit(prev, src)) + prev = edit + } + } + return append(expanded, expandEdit(prev, src)), nil // flush final edit +} + +// expandEdit returns edit expanded to complete whole lines. +func expandEdit(edit Edit, src string) Edit { + // Expand start left to start of line. + // (delta is the zero-based column number of start.) + start := edit.Start + if delta := start - 1 - strings.LastIndex(src[:start], "\n"); delta > 0 { + edit.Start -= delta + edit.New = src[start-delta:start] + edit.New + } + + // Expand end right to end of line. 
+ end := edit.End + if end > 0 && src[end-1] != '\n' || + edit.New != "" && edit.New[len(edit.New)-1] != '\n' { + if nl := strings.IndexByte(src[end:], '\n'); nl < 0 { + edit.End = len(src) // extend to EOF + } else { + edit.End = end + nl + 1 // extend beyond \n + } + } + edit.New += src[end:edit.End] + + return edit +} diff --git a/pkg/plugin/processor/builtin/internal/diff/diff_test.go b/pkg/plugin/processor/builtin/internal/diff/diff_test.go new file mode 100644 index 000000000..055384679 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/diff_test.go @@ -0,0 +1,207 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff_test + +import ( + "bytes" + "math/rand" + "os" + "os/exec" + "path/filepath" + "reflect" + "strings" + "testing" + "unicode/utf8" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff" + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest" + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/testenv" +) + +func TestApply(t *testing.T) { + for _, tc := range difftest.TestCases { + t.Run(tc.Name, func(t *testing.T) { + got, err := diff.Apply(tc.In, tc.Edits) + if err != nil { + t.Fatalf("Apply(Edits) failed: %v", err) + } + if got != tc.Out { + t.Errorf("Apply(Edits): got %q, want %q", got, tc.Out) + } + if tc.LineEdits != nil { + got, err := diff.Apply(tc.In, tc.LineEdits) + if err != nil { + t.Fatalf("Apply(LineEdits) failed: %v", err) + } + if got != tc.Out { + t.Errorf("Apply(LineEdits): got %q, want %q", got, tc.Out) + } + } + }) + } +} + +func TestNEdits(t *testing.T) { + for _, tc := range difftest.TestCases { + edits := diff.Strings(tc.In, tc.Out) + got, err := diff.Apply(tc.In, edits) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + if got != tc.Out { + t.Fatalf("%s: got %q wanted %q", tc.Name, got, tc.Out) + } + if 
len(edits) < len(tc.Edits) { // should find subline edits + t.Errorf("got %v, expected %v for %#v", edits, tc.Edits, tc) + } + } +} + +func TestNRandom(t *testing.T) { + rand.Seed(1) + for i := 0; i < 1000; i++ { + a := randstr("abω", 16) + b := randstr("abωc", 16) + edits := diff.Strings(a, b) + got, err := diff.Apply(a, edits) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + if got != b { + t.Fatalf("%d: got %q, wanted %q, starting with %q", i, got, b, a) + } + } +} + +// $ go test -fuzz=FuzzRoundTrip ./internal/diff +func FuzzRoundTrip(f *testing.F) { + f.Fuzz(func(t *testing.T, a, b string) { + if !utf8.ValidString(a) || !utf8.ValidString(b) { + return // inputs must be text + } + edits := diff.Strings(a, b) + got, err := diff.Apply(a, edits) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + if got != b { + t.Fatalf("applying diff(%q, %q) gives %q; edits=%v", a, b, got, edits) + } + }) +} + +func TestLineEdits(t *testing.T) { + for _, tc := range difftest.TestCases { + t.Run(tc.Name, func(t *testing.T) { + want := tc.LineEdits + if want == nil { + want = tc.Edits // already line-aligned + } + got, err := diff.LineEdits(tc.In, tc.Edits) + if err != nil { + t.Fatalf("LineEdits: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Errorf("in=<<%s>>\nout=<<%s>>\nraw edits=%s\nline edits=%s\nwant: %s", + tc.In, tc.Out, tc.Edits, got, want) + } + // make sure that applying the edits gives the expected result + fixed, err := diff.Apply(tc.In, got) + if err != nil { + t.Error(err) + } + if fixed != tc.Out { + t.Errorf("Apply(LineEdits): got %q, want %q", fixed, tc.Out) + } + }) + } +} + +func TestToUnified(t *testing.T) { + testenv.NeedsTool(t, "patch") + for _, tc := range difftest.TestCases { + t.Run(tc.Name, func(t *testing.T) { + unified, err := diff.ToUnified(difftest.FileA, difftest.FileB, tc.In, tc.Edits, diff.DefaultContextLines) + if err != nil { + t.Fatal(err) + } + if unified == "" { + return + } + orig := filepath.Join(t.TempDir(), 
"original") + err = os.WriteFile(orig, []byte(tc.In), 0644) + if err != nil { + t.Fatal(err) + } + temp := filepath.Join(t.TempDir(), "patched") + err = os.WriteFile(temp, []byte(tc.In), 0644) + if err != nil { + t.Fatal(err) + } + cmd := exec.Command("patch", "-p0", "-u", "-s", "-o", temp, orig) + cmd.Stdin = strings.NewReader(unified) + cmd.Stdout = new(bytes.Buffer) + cmd.Stderr = new(bytes.Buffer) + if err = cmd.Run(); err != nil { + t.Fatalf("%v: %q (%q) (%q)", err, cmd.String(), + cmd.Stderr, cmd.Stdout) + } + got, err := os.ReadFile(temp) + if err != nil { + t.Fatal(err) + } + if string(got) != tc.Out { + t.Errorf("applying unified failed: got\n%q, wanted\n%q unified\n%q", + got, tc.Out, unified) + } + + }) + } +} + +func TestRegressionOld001(t *testing.T) { + a := "// Copyright 2019 The Go Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license that can be found in the LICENSE file.\n\npackage diff_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"golang.org/x/tools/gopls/internal/lsp/diff\"\n\t\"github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest\"\n\t\"golang.org/x/tools/gopls/internal/span\"\n)\n" + + b := "// Copyright 2019 The Go Authors. 
All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license that can be found in the LICENSE file.\n\npackage diff_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/google/safehtml/template\"\n\t\"golang.org/x/tools/gopls/internal/lsp/diff\"\n\t\"github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest\"\n\t\"golang.org/x/tools/gopls/internal/span\"\n)\n" + diffs := diff.Strings(a, b) + got, err := diff.Apply(a, diffs) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + if got != b { + i := 0 + for ; i < len(a) && i < len(b) && got[i] == b[i]; i++ { + } + t.Errorf("oops %vd\n%q\n%q", diffs, got, b) + t.Errorf("\n%q\n%q", got[i:], b[i:]) + } +} + +func TestRegressionOld002(t *testing.T) { + a := "n\"\n)\n" + b := "n\"\n\t\"golang.org/x//nnal/stack\"\n)\n" + diffs := diff.Strings(a, b) + got, err := diff.Apply(a, diffs) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + if got != b { + i := 0 + for ; i < len(a) && i < len(b) && got[i] == b[i]; i++ { + } + t.Errorf("oops %vd\n%q\n%q", diffs, got, b) + t.Errorf("\n%q\n%q", got[i:], b[i:]) + } +} + +// return a random string of length n made of characters from s +func randstr(s string, n int) string { + src := []rune(s) + x := make([]rune, n) + for i := 0; i < n; i++ { + x[i] = src[rand.Intn(len(src))] + } + return string(x) +} diff --git a/pkg/plugin/processor/builtin/internal/diff/difftest/difftest.go b/pkg/plugin/processor/builtin/internal/diff/difftest/difftest.go new file mode 100644 index 000000000..bdb51cfa6 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/difftest/difftest.go @@ -0,0 +1,324 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Package difftest supplies a set of tests that will operate on any +// implementation of a diff algorithm as exposed by +// "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff" +package difftest + +// There are two kinds of tests, semantic tests, and 'golden data' tests. +// The semantic tests check that the computed diffs transform the input to +// the output, and that 'patch' accepts the computed unified diffs. +// The other tests just check that Edits and LineEdits haven't changed +// unexpectedly. These fields may need to be changed when the diff algorithm +// changes. + +import ( + "testing" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff" +) + +const ( + FileA = "from" + FileB = "to" + UnifiedPrefix = "--- " + FileA + "\n+++ " + FileB + "\n" +) + +var TestCases = []struct { + Name, In, Out, Unified string + Edits, LineEdits []diff.Edit // expectation (LineEdits=nil => already line-aligned) + NoDiff bool +}{{ + Name: "empty", + In: "", + Out: "", +}, { + Name: "no_diff", + In: "gargantuan\n", + Out: "gargantuan\n", +}, { + Name: "replace_all", + In: "fruit\n", + Out: "cheese\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-fruit ++cheese +`[1:], + Edits: []diff.Edit{{Start: 0, End: 5, New: "cheese"}}, + LineEdits: []diff.Edit{{Start: 0, End: 6, New: "cheese\n"}}, +}, { + Name: "insert_rune", + In: "gord\n", + Out: "gourd\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-gord ++gourd +`[1:], + Edits: []diff.Edit{{Start: 2, End: 2, New: "u"}}, + LineEdits: []diff.Edit{{Start: 0, End: 5, New: "gourd\n"}}, +}, { + Name: "delete_rune", + In: "groat\n", + Out: "goat\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-groat ++goat +`[1:], + Edits: []diff.Edit{{Start: 1, End: 2, New: ""}}, + LineEdits: []diff.Edit{{Start: 0, End: 6, New: "goat\n"}}, +}, { + Name: "replace_rune", + In: "loud\n", + Out: "lord\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-loud ++lord +`[1:], + Edits: []diff.Edit{{Start: 2, End: 3, New: 
"r"}}, + LineEdits: []diff.Edit{{Start: 0, End: 5, New: "lord\n"}}, +}, { + Name: "replace_partials", + In: "blanket\n", + Out: "bunker\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-blanket ++bunker +`[1:], + Edits: []diff.Edit{ + {Start: 1, End: 3, New: "u"}, + {Start: 6, End: 7, New: "r"}, + }, + LineEdits: []diff.Edit{{Start: 0, End: 8, New: "bunker\n"}}, +}, { + Name: "insert_line", + In: "1: one\n3: three\n", + Out: "1: one\n2: two\n3: three\n", + Unified: UnifiedPrefix + ` +@@ -1,2 +1,3 @@ + 1: one ++2: two + 3: three +`[1:], + Edits: []diff.Edit{{Start: 7, End: 7, New: "2: two\n"}}, +}, { + Name: "replace_no_newline", + In: "A", + Out: "B", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-A +\ No newline at end of file ++B +\ No newline at end of file +`[1:], + Edits: []diff.Edit{{Start: 0, End: 1, New: "B"}}, +}, { + Name: "delete_empty", + In: "meow", + Out: "", // GNU diff -u special case: +0,0 + Unified: UnifiedPrefix + ` +@@ -1 +0,0 @@ +-meow +\ No newline at end of file +`[1:], + Edits: []diff.Edit{{Start: 0, End: 4, New: ""}}, + LineEdits: []diff.Edit{{Start: 0, End: 4, New: ""}}, +}, { + Name: "append_empty", + In: "", // GNU diff -u special case: -0,0 + Out: "AB\nC", + Unified: UnifiedPrefix + ` +@@ -0,0 +1,2 @@ ++AB ++C +\ No newline at end of file +`[1:], + Edits: []diff.Edit{{Start: 0, End: 0, New: "AB\nC"}}, + LineEdits: []diff.Edit{{Start: 0, End: 0, New: "AB\nC"}}, +}, + // TODO(adonovan): fix this test: GNU diff -u prints "+1,2", Unifies prints "+1,3". 
+ // { + // Name: "add_start", + // In: "A", + // Out: "B\nCA", + // Unified: UnifiedPrefix + ` + // @@ -1 +1,2 @@ + // -A + // \ No newline at end of file + // +B + // +CA + // \ No newline at end of file + // `[1:], + // Edits: []diff.TextEdit{{Span: newSpan(0, 0), NewText: "B\nC"}}, + // LineEdits: []diff.TextEdit{{Span: newSpan(0, 0), NewText: "B\nC"}}, + // }, + { + Name: "add_end", + In: "A", + Out: "AB", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-A +\ No newline at end of file ++AB +\ No newline at end of file +`[1:], + Edits: []diff.Edit{{Start: 1, End: 1, New: "B"}}, + LineEdits: []diff.Edit{{Start: 0, End: 1, New: "AB"}}, + }, { + Name: "add_empty", + In: "", + Out: "AB\nC", + Unified: UnifiedPrefix + ` +@@ -0,0 +1,2 @@ ++AB ++C +\ No newline at end of file +`[1:], + Edits: []diff.Edit{{Start: 0, End: 0, New: "AB\nC"}}, + LineEdits: []diff.Edit{{Start: 0, End: 0, New: "AB\nC"}}, + }, { + Name: "add_newline", + In: "A", + Out: "A\n", + Unified: UnifiedPrefix + ` +@@ -1 +1 @@ +-A +\ No newline at end of file ++A +`[1:], + Edits: []diff.Edit{{Start: 1, End: 1, New: "\n"}}, + LineEdits: []diff.Edit{{Start: 0, End: 1, New: "A\n"}}, + }, { + Name: "delete_front", + In: "A\nB\nC\nA\nB\nB\nA\n", + Out: "C\nB\nA\nB\nA\nC\n", + Unified: UnifiedPrefix + ` +@@ -1,7 +1,6 @@ +-A +-B + C ++B + A + B +-B + A ++C +`[1:], + NoDiff: true, // unified diff is different but valid + Edits: []diff.Edit{ + {Start: 0, End: 4, New: ""}, + {Start: 6, End: 6, New: "B\n"}, + {Start: 10, End: 12, New: ""}, + {Start: 14, End: 14, New: "C\n"}, + }, + LineEdits: []diff.Edit{ + {Start: 0, End: 4, New: ""}, + {Start: 6, End: 6, New: "B\n"}, + {Start: 10, End: 12, New: ""}, + {Start: 14, End: 14, New: "C\n"}, + }, + }, { + Name: "replace_last_line", + In: "A\nB\n", + Out: "A\nC\n\n", + Unified: UnifiedPrefix + ` +@@ -1,2 +1,3 @@ + A +-B ++C ++ +`[1:], + Edits: []diff.Edit{{Start: 2, End: 3, New: "C\n"}}, + LineEdits: []diff.Edit{{Start: 2, End: 4, New: "C\n\n"}}, + }, + { + Name: 
"multiple_replace", + In: "A\nB\nC\nD\nE\nF\nG\n", + Out: "A\nH\nI\nJ\nE\nF\nK\n", + Unified: UnifiedPrefix + ` +@@ -1,7 +1,7 @@ + A +-B +-C +-D ++H ++I ++J + E + F +-G ++K +`[1:], + Edits: []diff.Edit{ + {Start: 2, End: 8, New: "H\nI\nJ\n"}, + {Start: 12, End: 14, New: "K\n"}, + }, + NoDiff: true, // diff algorithm produces different delete/insert pattern + }, + { + Name: "extra_newline", + In: "\nA\n", + Out: "A\n", + Edits: []diff.Edit{{Start: 0, End: 1, New: ""}}, + Unified: UnifiedPrefix + `@@ -1,2 +1 @@ +- + A +`, + }, { + Name: "unified_lines", + In: "aaa\nccc\n", + Out: "aaa\nbbb\nccc\n", + Edits: []diff.Edit{{Start: 3, End: 3, New: "\nbbb"}}, + LineEdits: []diff.Edit{{Start: 0, End: 4, New: "aaa\nbbb\n"}}, + Unified: UnifiedPrefix + "@@ -1,2 +1,3 @@\n aaa\n+bbb\n ccc\n", + }, { + Name: "60379", + In: `package a + +type S struct { +s fmt.Stringer +} +`, + Out: `package a + +type S struct { + s fmt.Stringer +} +`, + Edits: []diff.Edit{{Start: 27, End: 27, New: "\t"}}, + LineEdits: []diff.Edit{{Start: 27, End: 42, New: "\ts fmt.Stringer\n"}}, + Unified: UnifiedPrefix + "@@ -1,5 +1,5 @@\n package a\n \n type S struct {\n-s fmt.Stringer\n+\ts fmt.Stringer\n }\n", + }, +} + +func DiffTest(t *testing.T, compute func(before, after string) []diff.Edit) { + for _, test := range TestCases { + t.Run(test.Name, func(t *testing.T) { + edits := compute(test.In, test.Out) + got, err := diff.Apply(test.In, edits) + if err != nil { + t.Fatalf("Apply failed: %v", err) + } + unified, err := diff.ToUnified(FileA, FileB, test.In, edits, diff.DefaultContextLines) + if err != nil { + t.Fatalf("ToUnified: %v", err) + } + if got != test.Out { + t.Errorf("Apply: got patched:\n%v\nfrom diff:\n%v\nexpected:\n%v", + got, unified, test.Out) + } + if !test.NoDiff && unified != test.Unified { + t.Errorf("Unified: got diff:\n%q\nexpected:\n%q diffs:%v", + unified, test.Unified, edits) + } + }) + } +} diff --git a/pkg/plugin/processor/builtin/internal/diff/difftest/difftest_test.go 
b/pkg/plugin/processor/builtin/internal/diff/difftest/difftest_test.go new file mode 100644 index 000000000..5ff4aae05 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/difftest/difftest_test.go @@ -0,0 +1,82 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package difftest supplies a set of tests that will operate on any +// implementation of a diff algorithm as exposed by +// "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff" +package difftest_test + +import ( + "fmt" + "os" + "os/exec" + "strings" + "testing" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest" + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/testenv" +) + +func TestVerifyUnified(t *testing.T) { + testenv.NeedsTool(t, "diff") + for _, test := range difftest.TestCases { + t.Run(test.Name, func(t *testing.T) { + if test.NoDiff { + t.Skip("diff tool produces expected different results") + } + diff, err := getDiffOutput(test.In, test.Out) + if err != nil { + t.Fatal(err) + } + if len(diff) > 0 { + diff = difftest.UnifiedPrefix + diff + } + if diff != test.Unified { + t.Errorf("unified:\n%s\ndiff -u:\n%s", test.Unified, diff) + } + }) + } +} + +func getDiffOutput(a, b string) (string, error) { + fileA, err := os.CreateTemp("", "myers.in") + if err != nil { + return "", err + } + defer os.Remove(fileA.Name()) + if _, err := fileA.Write([]byte(a)); err != nil { + return "", err + } + if err := fileA.Close(); err != nil { + return "", err + } + fileB, err := os.CreateTemp("", "myers.in") + if err != nil { + return "", err + } + defer os.Remove(fileB.Name()) + if _, err := fileB.Write([]byte(b)); err != nil { + return "", err + } + if err := fileB.Close(); err != nil { + return "", err + } + cmd := exec.Command("diff", "-u", fileA.Name(), fileB.Name()) + cmd.Env = append(cmd.Env, 
"LANG=en_US.UTF-8") + out, err := cmd.CombinedOutput() + if err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return "", fmt.Errorf("failed to run diff -u %v %v: %v\n%v", fileA.Name(), fileB.Name(), err, string(out)) + } + } + diff := string(out) + if len(diff) <= 0 { + return diff, nil + } + bits := strings.SplitN(diff, "\n", 3) + if len(bits) != 3 { + return "", fmt.Errorf("diff output did not have file prefix:\n%s", diff) + } + return bits[2], nil +} diff --git a/pkg/plugin/processor/builtin/internal/diff/export_test.go b/pkg/plugin/processor/builtin/internal/diff/export_test.go new file mode 100644 index 000000000..eedf0dd77 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/export_test.go @@ -0,0 +1,9 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +// This file exports some private declarations to tests. + +var LineEdits = lineEdits diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/common.go b/pkg/plugin/processor/builtin/internal/diff/lcs/common.go new file mode 100644 index 000000000..c3e82dd26 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/common.go @@ -0,0 +1,179 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package lcs + +import ( + "log" + "sort" +) + +// lcs is a longest common sequence +type lcs []diag + +// A diag is a piece of the edit graph where A[X+i] == B[Y+i], for 0<=i l[j].Len + }) + return l +} + +// validate that the elements of the lcs do not overlap +// (can only happen when the two-sided algorithm ends early) +// expects the lcs to be sorted +func (l lcs) valid() bool { + for i := 1; i < len(l); i++ { + if l[i-1].X+l[i-1].Len > l[i].X { + return false + } + if l[i-1].Y+l[i-1].Len > l[i].Y { + return false + } + } + return true +} + +// repair overlapping lcs +// only called if two-sided stops early +func (l lcs) fix() lcs { + // from the set of diagonals in l, find a maximal non-conflicting set + // this problem may be NP-complete, but we use a greedy heuristic, + // which is quadratic, but with a better data structure, could be D log D. + // indepedent is not enough: {0,3,1} and {3,0,2} can't both occur in an lcs + // which has to have monotone x and y + if len(l) == 0 { + return nil + } + sort.Slice(l, func(i, j int) bool { return l[i].Len > l[j].Len }) + tmp := make(lcs, 0, len(l)) + tmp = append(tmp, l[0]) + for i := 1; i < len(l); i++ { + var dir direction + nxt := l[i] + for _, in := range tmp { + if dir, nxt = overlap(in, nxt); dir == empty || dir == bad { + break + } + } + if nxt.Len > 0 && dir != bad { + tmp = append(tmp, nxt) + } + } + tmp.sort() + if false && !tmp.valid() { // debug checking + log.Fatalf("here %d", len(tmp)) + } + return tmp +} + +type direction int + +const ( + empty direction = iota // diag is empty (so not in lcs) + leftdown // proposed acceptably to the left and below + rightup // proposed diag is acceptably to the right and above + bad // proposed diag is inconsistent with the lcs so far +) + +// overlap trims the proposed diag prop so it doesn't overlap with +// the existing diag that has already been added to the lcs. 
+func overlap(exist, prop diag) (direction, diag) { + if prop.X <= exist.X && exist.X < prop.X+prop.Len { + // remove the end of prop where it overlaps with the X end of exist + delta := prop.X + prop.Len - exist.X + prop.Len -= delta + if prop.Len <= 0 { + return empty, prop + } + } + if exist.X <= prop.X && prop.X < exist.X+exist.Len { + // remove the beginning of prop where it overlaps with exist + delta := exist.X + exist.Len - prop.X + prop.Len -= delta + if prop.Len <= 0 { + return empty, prop + } + prop.X += delta + prop.Y += delta + } + if prop.Y <= exist.Y && exist.Y < prop.Y+prop.Len { + // remove the end of prop that overlaps (in Y) with exist + delta := prop.Y + prop.Len - exist.Y + prop.Len -= delta + if prop.Len <= 0 { + return empty, prop + } + } + if exist.Y <= prop.Y && prop.Y < exist.Y+exist.Len { + // remove the beginning of prop that overlaps with exist + delta := exist.Y + exist.Len - prop.Y + prop.Len -= delta + if prop.Len <= 0 { + return empty, prop + } + prop.X += delta // no test reaches this code + prop.Y += delta + } + if prop.X+prop.Len <= exist.X && prop.Y+prop.Len <= exist.Y { + return leftdown, prop + } + if exist.X+exist.Len <= prop.X && exist.Y+exist.Len <= prop.Y { + return rightup, prop + } + // prop can't be in an lcs that contains exist + return bad, prop +} + +// manipulating Diag and lcs + +// prepend a diagonal (x,y)-(x+1,y+1) segment either to an empty lcs +// or to its first Diag. prepend is only called to extend diagonals +// in the backward direction. +func (lcs lcs) prepend(x, y int) lcs { + if len(lcs) > 0 { + d := &lcs[0] + if int(d.X) == x+1 && int(d.Y) == y+1 { + // extend the diagonal down and to the left + d.X, d.Y = int(x), int(y) + d.Len++ + return lcs + } + } + + r := diag{X: int(x), Y: int(y), Len: 1} + lcs = append([]diag{r}, lcs...) + return lcs +} + +// append appends a diagonal, or extends the existing one. +// by adding the edge (x,y)-(x+1,y+1).
append is only called +// to extend diagonals in the forward direction. +func (lcs lcs) append(x, y int) lcs { + if len(lcs) > 0 { + last := &lcs[len(lcs)-1] + // Expand last element if adjoining. + if last.X+last.Len == x && last.Y+last.Len == y { + last.Len++ + return lcs + } + } + + return append(lcs, diag{X: x, Y: y, Len: 1}) +} + +// enforce constraint on d, k +func ok(d, k int) bool { + return d >= 0 && -d <= k && k <= d +} diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/common_test.go b/pkg/plugin/processor/builtin/internal/diff/lcs/common_test.go new file mode 100644 index 000000000..f19245e40 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/common_test.go @@ -0,0 +1,140 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +import ( + "log" + "math/rand" + "strings" + "testing" +) + +type Btest struct { + a, b string + lcs []string +} + +var Btests = []Btest{ + {"aaabab", "abaab", []string{"abab", "aaab"}}, + {"aabbba", "baaba", []string{"aaba"}}, + {"cabbx", "cbabx", []string{"cabx", "cbbx"}}, + {"c", "cb", []string{"c"}}, + {"aaba", "bbb", []string{"b"}}, + {"bbaabb", "b", []string{"b"}}, + {"baaabb", "bbaba", []string{"bbb", "baa", "bab"}}, + {"baaabb", "abbab", []string{"abb", "bab", "aab"}}, + {"baaba", "aaabba", []string{"aaba"}}, + {"ca", "cba", []string{"ca"}}, + {"ccbcbc", "abba", []string{"bb"}}, + {"ccbcbc", "aabba", []string{"bb"}}, + {"ccb", "cba", []string{"cb"}}, + {"caef", "axe", []string{"ae"}}, + {"bbaabb", "baabb", []string{"baabb"}}, + // Example from Myers: + {"abcabba", "cbabac", []string{"caba", "baba", "cbba"}}, + {"3456aaa", "aaa", []string{"aaa"}}, + {"aaa", "aaa123", []string{"aaa"}}, + {"aabaa", "aacaa", []string{"aaaa"}}, + {"1a", "a", []string{"a"}}, + {"abab", "bb", []string{"bb"}}, + {"123", "ab", []string{""}}, + {"a", "b", []string{""}}, + {"abc", "123", 
[]string{""}}, + {"aa", "aa", []string{"aa"}}, + {"abcde", "12345", []string{""}}, + {"aaa3456", "aaa", []string{"aaa"}}, + {"abcde", "12345a", []string{"a"}}, + {"ab", "123", []string{""}}, + {"1a2", "a", []string{"a"}}, + // for two-sided + {"babaab", "cccaba", []string{"aba"}}, + {"aabbab", "cbcabc", []string{"bab"}}, + {"abaabb", "bcacab", []string{"baab"}}, + {"abaabb", "abaaaa", []string{"abaa"}}, + {"bababb", "baaabb", []string{"baabb"}}, + {"abbbaa", "cabacc", []string{"aba"}}, + {"aabbaa", "aacaba", []string{"aaaa", "aaba"}}, +} + +func init() { + log.SetFlags(log.Lshortfile) +} + +func check(t *testing.T, str string, lcs lcs, want []string) { + t.Helper() + if !lcs.valid() { + t.Errorf("bad lcs %v", lcs) + } + var got strings.Builder + for _, dd := range lcs { + got.WriteString(str[dd.X : dd.X+dd.Len]) + } + ans := got.String() + for _, w := range want { + if ans == w { + return + } + } + t.Fatalf("str=%q lcs=%v want=%q got=%q", str, lcs, want, ans) +} + +func checkDiffs(t *testing.T, before string, diffs []Diff, after string) { + t.Helper() + var ans strings.Builder + sofar := 0 // index of position in before + for _, d := range diffs { + if sofar < d.Start { + ans.WriteString(before[sofar:d.Start]) + } + ans.WriteString(after[d.ReplStart:d.ReplEnd]) + sofar = d.End + } + ans.WriteString(before[sofar:]) + if ans.String() != after { + t.Fatalf("diff %v took %q to %q, not to %q", diffs, before, ans.String(), after) + } +} + +func lcslen(l lcs) int { + ans := 0 + for _, d := range l { + ans += int(d.Len) + } + return ans +} + +// return a random string of length n made of characters from s +func randstr(s string, n int) string { + src := []rune(s) + x := make([]rune, n) + for i := 0; i < n; i++ { + x[i] = src[rand.Intn(len(src))] + } + return string(x) +} + +func TestLcsFix(t *testing.T) { + tests := []struct{ before, after lcs }{ + {lcs{diag{0, 0, 3}, diag{2, 2, 5}, diag{3, 4, 5}, diag{8, 9, 4}}, lcs{diag{0, 0, 2}, diag{2, 2, 1}, diag{3, 4, 5}, diag{8, 9, 
4}}}, + {lcs{diag{1, 1, 6}, diag{6, 12, 3}}, lcs{diag{1, 1, 5}, diag{6, 12, 3}}}, + {lcs{diag{0, 0, 4}, diag{3, 5, 4}}, lcs{diag{0, 0, 3}, diag{3, 5, 4}}}, + {lcs{diag{0, 20, 1}, diag{0, 0, 3}, diag{1, 20, 4}}, lcs{diag{0, 0, 3}, diag{3, 22, 2}}}, + {lcs{diag{0, 0, 4}, diag{1, 1, 2}}, lcs{diag{0, 0, 4}}}, + {lcs{diag{0, 0, 4}}, lcs{diag{0, 0, 4}}}, + {lcs{}, lcs{}}, + {lcs{diag{0, 0, 4}, diag{1, 1, 6}, diag{3, 3, 2}}, lcs{diag{0, 0, 1}, diag{1, 1, 6}}}, + } + for n, x := range tests { + got := x.before.fix() + if len(got) != len(x.after) { + t.Errorf("got %v, expected %v, for %v", got, x.after, x.before) + } + olen := lcslen(x.after) + glen := lcslen(got) + if olen != glen { + t.Errorf("%d: lens(%d,%d) differ, %v, %v, %v", n, glen, olen, got, x.after, x.before) + } + } +} diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/doc.go b/pkg/plugin/processor/builtin/internal/diff/lcs/doc.go new file mode 100644 index 000000000..9029dd20b --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/doc.go @@ -0,0 +1,156 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// package lcs contains code to find longest-common-subsequences +// (and diffs) +package lcs + +/* +Compute longest-common-subsequences of two slices A, B using +algorithms from Myers' paper. A longest-common-subsequence +(LCS from now on) of A and B is a maximal set of lexically increasing +pairs of subscripts (x,y) with A[x]==B[y]. There may be many LCS, but +they all have the same length. An LCS determines a sequence of edits +that changes A into B. + +The key concept is the edit graph of A and B. +If A has length N and B has length M, then the edit graph has +vertices v[i][j] for 0 <= i <= N, 0 <= j <= M. There is a +horizontal edge from v[i][j] to v[i+1][j] whenever both are in +the graph, and a vertical edge from v[i][j] to f[i][j+1] similarly. 
+When A[i] == B[j] there is a diagonal edge from v[i][j] to v[i+1][j+1]. + +A path in the graph between (0,0) and (N,M) determines a sequence +of edits converting A into B: each horizontal edge corresponds to removing +an element of A, and each vertical edge corresponds to inserting an +element of B. + +A vertex (x,y) is on (forward) diagonal k if x-y=k. A path in the graph +is of length D if it has D non-diagonal edges. The algorithms generate +forward paths (in which at least one of x,y increases at each edge), +or backward paths (in which at least one of x,y decreases at each edge), +or a combination. (Note that the orientation is the traditional mathematical one, +with the origin in the lower-left corner.) + +Here is the edit graph for A:"aabbaa", B:"aacaba". (I know the diagonals look weird.) + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + b | | | ___/‾‾‾ | ___/‾‾‾ | | | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + c | | | | | | | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a a b b a a + + +The algorithm labels a vertex (x,y) with D,k if it is on diagonal k and at +the end of a maximal path of length D. (Because x-y=k it suffices to remember +only the x coordinate of the vertex.) + +The forward algorithm: Find the longest diagonal starting at (0,0) and +label its end with D=0,k=0. From that vertex take a vertical step and +then follow the longest diagonal (up and to the right), and label that vertex +with D=1,k=-1. 
From the D=0,k=0 point take a horizontal step and then follow +the longest diagonal (up and to the right) and label that vertex +D=1,k=1. In the same way, having labelled all the D vertices, +from a vertex labelled D,k find two vertices +tentatively labelled D+1,k-1 and D+1,k+1. There may be two on the same +diagonal, in which case take the one with the larger x. + +Eventually the path gets to (N,M), and the diagonals on it are the LCS. + +Here is the edit graph with the ends of D-paths labelled. (So, for instance, +0/2,2 indicates that x=2,y=2 is labelled with 0, as it should be, since the first +step is to go up the longest diagonal from (0,0).) +A:"aabbaa", B:"aacaba" + ⊙ ------- ⊙ ------- ⊙ -------(3/3,6)------- ⊙ -------(3/5,6)-------(4/6,6) + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ -------(2/3,5)------- ⊙ ------- ⊙ ------- ⊙ + b | | | ___/‾‾‾ | ___/‾‾‾ | | | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ -------(3/5,4)------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ -------(1/2,3)-------(2/3,3)------- ⊙ ------- ⊙ ------- ⊙ + c | | | | | | | + ⊙ ------- ⊙ -------(0/2,2)-------(1/3,2)-------(2/4,2)-------(3/5,2)-------(4/6,2) + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a | ___/‾‾‾ | ___/‾‾‾ | | | ___/‾‾‾ | ___/‾‾‾ | + ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ ------- ⊙ + a a b b a a + +The 4-path is reconstructed starting at (4/6,6), horizontal to (3/5,6), diagonal to (3,4), vertical +to (2/3,3), horizontal to (1/2,3), vertical to (0/2,2), and diagonal to (0,0). As expected, +there are 4 non-diagonal steps, and the diagonals form an LCS. 
+ +There is a symmetric backward algorithm, which gives (backwards labels are prefixed with a colon): +A:"aabbaa", B:"aacaba" + ⊙ -------- ⊙ -------- ⊙ -------- ⊙ -------- ⊙ -------- ⊙ -------- ⊙ + a | ____/‾‾‾ | ____/‾‾‾ | | | ____/‾‾‾ | ____/‾‾‾ | + ⊙ -------- ⊙ -------- ⊙ -------- ⊙ -------- ⊙ --------(:0/5,5)-------- ⊙ + b | | | ____/‾‾‾ | ____/‾‾‾ | | | + ⊙ -------- ⊙ -------- ⊙ --------(:1/3,4)-------- ⊙ -------- ⊙ -------- ⊙ + a | ____/‾‾‾ | ____/‾‾‾ | | | ____/‾‾‾ | ____/‾‾‾ | + (:3/0,3)--------(:2/1,3)-------- ⊙ --------(:2/3,3)--------(:1/4,3)-------- ⊙ -------- ⊙ + c | | | | | | | + ⊙ -------- ⊙ -------- ⊙ --------(:3/3,2)--------(:2/4,2)-------- ⊙ -------- ⊙ + a | ____/‾‾‾ | ____/‾‾‾ | | | ____/‾‾‾ | ____/‾‾‾ | + (:3/0,1)-------- ⊙ -------- ⊙ -------- ⊙ --------(:3/4,1)-------- ⊙ -------- ⊙ + a | ____/‾‾‾ | ____/‾‾‾ | | | ____/‾‾‾ | ____/‾‾‾ | + (:4/0,0)-------- ⊙ -------- ⊙ -------- ⊙ --------(:4/4,0)-------- ⊙ -------- ⊙ + a a b b a a + +Neither of these is ideal for use in an editor, where it is undesirable to send very long diffs to the +front end. It's tricky to decide exactly what 'very long diffs' means, as "replace A by B" is very short. +We want to control how big D can be, by stopping when it gets too large. The forward algorithm then +privileges common prefixes, and the backward algorithm privileges common suffixes. Either is an undesirable +asymmetry. + +Fortunately there is a two-sided algorithm, implied by results in Myers' paper. Here's what the labels in +the edit graph look like. 
+A:"aabbaa", B:"aacaba" + ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ + a | ____/‾‾‾‾ | ____/‾‾‾‾ | | | ____/‾‾‾‾ | ____/‾‾‾‾ | + ⊙ --------- ⊙ --------- ⊙ --------- (2/3,5) --------- ⊙ --------- (:0/5,5)--------- ⊙ + b | | | ____/‾‾‾‾ | ____/‾‾‾‾ | | | + ⊙ --------- ⊙ --------- ⊙ --------- (:1/3,4)--------- ⊙ --------- ⊙ --------- ⊙ + a | ____/‾‾‾‾ | ____/‾‾‾‾ | | | ____/‾‾‾‾ | ____/‾‾‾‾ | + ⊙ --------- (:2/1,3)--------- (1/2,3) ---------(2:2/3,3)--------- (:1/4,3)--------- ⊙ --------- ⊙ + c | | | | | | | + ⊙ --------- ⊙ --------- (0/2,2) --------- (1/3,2) ---------(2:2/4,2)--------- ⊙ --------- ⊙ + a | ____/‾‾‾‾ | ____/‾‾‾‾ | | | ____/‾‾‾‾ | ____/‾‾‾‾ | + ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ + a | ____/‾‾‾‾ | ____/‾‾‾‾ | | | ____/‾‾‾‾ | ____/‾‾‾‾ | + ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ --------- ⊙ + a a b b a a + +The algorithm stopped when it saw the backwards 2-path ending at (1,3) and the forwards 2-path ending at (3,5). The criterion +is a backwards path ending at (u,v) and a forward path ending at (x,y), where u <= x and the two points are on the same +diagonal. (Here the edgegraph has a diagonal, but the criterion is x-y=u-v.) Myers proves there is a forward +2-path from (0,0) to (1,3), and that together with the backwards 2-path ending at (1,3) gives the expected 4-path. +Unfortunately the forward path has to be constructed by another run of the forward algorithm; it can't be found from the +computed labels. That is the worst case. Had the code noticed (x,y)=(u,v)=(3,3) the whole path could be reconstructed +from the edgegraph. The implementation looks for a number of special cases to try to avoid computing an extra forward path. + +If the two-sided algorithm has to stop early (because D has become too large) it will have found a forward LCS and a +backwards LCS. 
Ideally these go with disjoint prefixes and suffixes of A and B, but disjointness may fail and the two +computed LCS may conflict. (An easy example is where A is a suffix of B, and shares a short prefix. The backwards LCS +is all of A, and the forward LCS is a prefix of A.) The algorithm combines the two +to form a best-effort LCS. In the worst case the forward partial LCS may have to +be recomputed. +*/ + +/* Eugene Myers paper is titled +"An O(ND) Difference Algorithm and Its Variations" +and can be found at +http://www.xmailserver.org/diff2.pdf + +(There is a generic implementation of the algorithm the repository with git hash +b9ad7e4ade3a686d608e44475390ad428e60e7fc) +*/ diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/git.sh b/pkg/plugin/processor/builtin/internal/diff/lcs/git.sh new file mode 100644 index 000000000..b25ba4aac --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/git.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Copyright 2022 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. +# +# Creates a zip file containing all numbered versions +# of the commit history of a large source file, for use +# as input data for the tests of the diff algorithm. +# +# Run script from root of the x/tools repo. + +set -eu + +# WARNING: This script will install the latest version of $file +# The largest real source file in the x/tools repo. 
+# file=internal/golang/completion/completion.go +# file=internal/golang/diagnostics.go +file=internal/protocol/tsprotocol.go + +tmp=$(mktemp -d) +git log $file | + awk '/^commit / {print $2}' | + nl -ba -nrz | + while read n hash; do + git checkout --quiet $hash $file + cp -f $file $tmp/$n + done +(cd $tmp && zip -q - *) > testdata.zip +rm -fr $tmp +git restore --staged $file +git restore $file +echo "Created testdata.zip" diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/labels.go b/pkg/plugin/processor/builtin/internal/diff/lcs/labels.go new file mode 100644 index 000000000..504913d1d --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/labels.go @@ -0,0 +1,55 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +import ( + "fmt" +) + +// For each D, vec[D] has length D+1, +// and the label for (D, k) is stored in vec[D][(D+k)/2]. +type label struct { + vec [][]int +} + +// Temporary checking DO NOT COMMIT true TO PRODUCTION CODE +const debug = false + +// debugging. check that the (d,k) pair is valid +// (that is, -d<=k<=d and d+k even) +func checkDK(D, k int) { + if k >= -D && k <= D && (D+k)%2 == 0 { + return + } + panic(fmt.Sprintf("out of range, d=%d,k=%d", D, k)) +} + +func (t *label) set(D, k, x int) { + if debug { + checkDK(D, k) + } + for len(t.vec) <= D { + t.vec = append(t.vec, nil) + } + if t.vec[D] == nil { + t.vec[D] = make([]int, D+1) + } + t.vec[D][(D+k)/2] = x // known that D+k is even +} + +func (t *label) get(d, k int) int { + if debug { + checkDK(d, k) + } + return int(t.vec[d][(d+k)/2]) +} + +func newtriang(limit int) label { + if limit < 100 { + // Preallocate if limit is not large. 
+ return label{vec: make([][]int, limit)} + } + return label{} +} diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/old.go b/pkg/plugin/processor/builtin/internal/diff/lcs/old.go new file mode 100644 index 000000000..a14ae9119 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/old.go @@ -0,0 +1,480 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +// TODO(adonovan): remove unclear references to "old" in this package. + +import ( + "fmt" +) + +// A Diff is a replacement of a portion of A by a portion of B. +type Diff struct { + Start, End int // offsets of portion to delete in A + ReplStart, ReplEnd int // offset of replacement text in B +} + +// DiffStrings returns the differences between two strings. +// It does not respect rune boundaries. +func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) } + +// DiffBytes returns the differences between two byte sequences. +// It does not respect rune boundaries. +func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) } + +// DiffRunes returns the differences between two rune sequences. +func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) } + +func diff(seqs sequences) []Diff { + // A limit on how deeply the LCS algorithm should search. The value is just a guess. + const maxDiffs = 100 + diff, _ := compute(seqs, twosided, maxDiffs/2) + return diff +} + +// compute computes the list of differences between two sequences, +// along with the LCS. It is exercised directly by tests. +// The algorithm is one of {forward, backward, twosided}. 
+func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) { + if limit <= 0 { + limit = 1 << 25 // effectively infinity + } + alen, blen := seqs.lengths() + g := &editGraph{ + seqs: seqs, + vf: newtriang(limit), + vb: newtriang(limit), + limit: limit, + ux: alen, + uy: blen, + delta: alen - blen, + } + lcs := algo(g) + diffs := lcs.toDiffs(alen, blen) + return diffs, lcs +} + +// editGraph carries the information for computing the lcs of two sequences. +type editGraph struct { + seqs sequences + vf, vb label // forward and backward labels + + limit int // maximal value of D + // the bounding rectangle of the current edit graph + lx, ly, ux, uy int + delta int // common subexpression: (ux-lx)-(uy-ly) +} + +// toDiffs converts an LCS to a list of edits. +func (lcs lcs) toDiffs(alen, blen int) []Diff { + var diffs []Diff + var pa, pb int // offsets in a, b + for _, l := range lcs { + if pa < l.X || pb < l.Y { + diffs = append(diffs, Diff{pa, l.X, pb, l.Y}) + } + pa = l.X + l.Len + pb = l.Y + l.Len + } + if pa < alen || pb < blen { + diffs = append(diffs, Diff{pa, alen, pb, blen}) + } + return diffs +} + +// --- FORWARD --- + +// fdone decides if the forwward path has reached the upper right +// corner of the rectangle. If so, it also returns the computed lcs. +func (e *editGraph) fdone(D, k int) (bool, lcs) { + // x, y, k are relative to the rectangle + x := e.vf.get(D, k) + y := x - k + if x == e.ux && y == e.uy { + return true, e.forwardlcs(D, k) + } + return false, nil +} + +// run the forward algorithm, until success or up to the limit on D. 
+func forward(e *editGraph) lcs { + e.setForward(0, 0, e.lx) + if ok, ans := e.fdone(0, 0); ok { + return ans + } + // from D to D+1 + for D := 0; D < e.limit; D++ { + e.setForward(D+1, -(D + 1), e.getForward(D, -D)) + if ok, ans := e.fdone(D+1, -(D + 1)); ok { + return ans + } + e.setForward(D+1, D+1, e.getForward(D, D)+1) + if ok, ans := e.fdone(D+1, D+1); ok { + return ans + } + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get backwards + lookv := e.lookForward(k, e.getForward(D, k-1)+1) + lookh := e.lookForward(k, e.getForward(D, k+1)) + if lookv > lookh { + e.setForward(D+1, k, lookv) + } else { + e.setForward(D+1, k, lookh) + } + if ok, ans := e.fdone(D+1, k); ok { + return ans + } + } + } + // D is too large + // find the D path with maximal x+y inside the rectangle and + // use that to compute the found part of the lcs + kmax := -e.limit - 1 + diagmax := -1 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getForward(e.limit, k) + y := x - k + if x+y > diagmax && x <= e.ux && y <= e.uy { + diagmax, kmax = x+y, k + } + } + return e.forwardlcs(e.limit, kmax) +} + +// recover the lcs by backtracking from the farthest point reached +func (e *editGraph) forwardlcs(D, k int) lcs { + var ans lcs + for x := e.getForward(D, k); x != 0 || x-k != 0; { + if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) { + // if (x-1,y) is labelled D-1, x--,D--,k--,continue + D, k, x = D-1, k-1, x-1 + continue + } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) { + // if (x,y-1) is labelled D-1, x, D--,k++, continue + D, k = D-1, k+1 + continue + } + // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue + y := x - k + ans = ans.prepend(x+e.lx-1, y+e.ly-1) + x-- + } + return ans +} + +// start at (x,y), go up the diagonal as far as possible, +// and label the result with d +func (e *editGraph) lookForward(k, relx int) int { + rely := relx - k + x, y := relx+e.lx, rely+e.ly + if x < e.ux && y < e.uy { + x += e.seqs.commonPrefixLen(x, e.ux, y, 
e.uy) + } + return x +} + +func (e *editGraph) setForward(d, k, relx int) { + x := e.lookForward(k, relx) + e.vf.set(d, k, x-e.lx) +} + +func (e *editGraph) getForward(d, k int) int { + x := e.vf.get(d, k) + return x +} + +// --- BACKWARD --- + +// bdone decides if the backward path has reached the lower left corner +func (e *editGraph) bdone(D, k int) (bool, lcs) { + // x, y, k are relative to the rectangle + x := e.vb.get(D, k) + y := x - (k + e.delta) + if x == 0 && y == 0 { + return true, e.backwardlcs(D, k) + } + return false, nil +} + +// run the backward algorithm, until success or up to the limit on D. +func backward(e *editGraph) lcs { + e.setBackward(0, 0, e.ux) + if ok, ans := e.bdone(0, 0); ok { + return ans + } + // from D to D+1 + for D := 0; D < e.limit; D++ { + e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) + if ok, ans := e.bdone(D+1, -(D + 1)); ok { + return ans + } + e.setBackward(D+1, D+1, e.getBackward(D, D)) + if ok, ans := e.bdone(D+1, D+1); ok { + return ans + } + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get wrong + lookv := e.lookBackward(k, e.getBackward(D, k-1)) + lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) + if lookv < lookh { + e.setBackward(D+1, k, lookv) + } else { + e.setBackward(D+1, k, lookh) + } + if ok, ans := e.bdone(D+1, k); ok { + return ans + } + } + } + + // D is too large + // find the D path with minimal x+y inside the rectangle and + // use that to compute the part of the lcs found + kmax := -e.limit - 1 + diagmin := 1 << 25 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getBackward(e.limit, k) + y := x - (k + e.delta) + if x+y < diagmin && x >= 0 && y >= 0 { + diagmin, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no paths when limit=%d?", e.limit)) + } + return e.backwardlcs(e.limit, kmax) +} + +// recover the lcs by backtracking +func (e *editGraph) backwardlcs(D, k int) lcs { + var ans lcs + for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != 
e.uy; { + if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) { + // D--, k--, x unchanged + D, k = D-1, k-1 + continue + } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) { + // D--, k++, x++ + D, k, x = D-1, k+1, x+1 + continue + } + y := x - (k + e.delta) + ans = ans.append(x+e.lx, y+e.ly) + x++ + } + return ans +} + +// start at (x,y), go down the diagonal as far as possible, +func (e *editGraph) lookBackward(k, relx int) int { + rely := relx - (k + e.delta) // forward k = k + e.delta + x, y := relx+e.lx, rely+e.ly + if x > 0 && y > 0 { + x -= e.seqs.commonSuffixLen(0, x, 0, y) + } + return x +} + +// convert to rectangle, and label the result with d +func (e *editGraph) setBackward(d, k, relx int) { + x := e.lookBackward(k, relx) + e.vb.set(d, k, x-e.lx) +} + +func (e *editGraph) getBackward(d, k int) int { + x := e.vb.get(d, k) + return x +} + +// -- TWOSIDED --- + +func twosided(e *editGraph) lcs { + // The termination condition could be improved, as either the forward + // or backward pass could succeed before Myers' Lemma applies. + // Aside from questions of efficiency (is the extra testing cost-effective) + // this is more likely to matter when e.limit is reached. 
+ e.setForward(0, 0, e.lx) + e.setBackward(0, 0, e.ux) + + // from D to D+1 + for D := 0; D < e.limit; D++ { + // just finished a backwards pass, so check + if got, ok := e.twoDone(D, D); ok { + return e.twolcs(D, D, got) + } + // do a forwards pass (D to D+1) + e.setForward(D+1, -(D + 1), e.getForward(D, -D)) + e.setForward(D+1, D+1, e.getForward(D, D)+1) + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get backwards + lookv := e.lookForward(k, e.getForward(D, k-1)+1) + lookh := e.lookForward(k, e.getForward(D, k+1)) + if lookv > lookh { + e.setForward(D+1, k, lookv) + } else { + e.setForward(D+1, k, lookh) + } + } + // just did a forward pass, so check + if got, ok := e.twoDone(D+1, D); ok { + return e.twolcs(D+1, D, got) + } + // do a backward pass, D to D+1 + e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) + e.setBackward(D+1, D+1, e.getBackward(D, D)) + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get wrong + lookv := e.lookBackward(k, e.getBackward(D, k-1)) + lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) + if lookv < lookh { + e.setBackward(D+1, k, lookv) + } else { + e.setBackward(D+1, k, lookh) + } + } + } + + // D too large. 
combine a forward and backward partial lcs + // first, a forward one + kmax := -e.limit - 1 + diagmax := -1 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getForward(e.limit, k) + y := x - k + if x+y > diagmax && x <= e.ux && y <= e.uy { + diagmax, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit)) + } + lcs := e.forwardlcs(e.limit, kmax) + // now a backward one + // find the D path with minimal x+y inside the rectangle and + // use that to compute the lcs + diagmin := 1 << 25 // infinity + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getBackward(e.limit, k) + y := x - (k + e.delta) + if x+y < diagmin && x >= 0 && y >= 0 { + diagmin, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit)) + } + lcs = append(lcs, e.backwardlcs(e.limit, kmax)...) + // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs) + ans := lcs.fix() + return ans +} + +// Does Myers' Lemma apply? +func (e *editGraph) twoDone(df, db int) (int, bool) { + if (df+db+e.delta)%2 != 0 { + return 0, false // diagonals cannot overlap + } + kmin := -db + e.delta + if -df > kmin { + kmin = -df + } + kmax := db + e.delta + if df < kmax { + kmax = df + } + for k := kmin; k <= kmax; k += 2 { + x := e.vf.get(df, k) + u := e.vb.get(db, k-e.delta) + if u <= x { + // is it worth looking at all the other k? + for l := k; l <= kmax; l += 2 { + x := e.vf.get(df, l) + y := x - l + u := e.vb.get(db, l-e.delta) + v := u - l + if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux { + return l, true + } + } + return k, true + } + } + return 0, false +} + +func (e *editGraph) twolcs(df, db, kf int) lcs { + // db==df || db+1==df + x := e.vf.get(df, kf) + y := x - kf + kb := kf - e.delta + u := e.vb.get(db, kb) + v := u - kf + + // Myers proved there is a df-path from (0,0) to (u,v) + // and a db-path from (x,y) to (N,M). 
+ // In the first case the overall path is the forward path + // to (u,v) followed by the backward path to (N,M). + // In the second case the path is the backward path to (x,y) + // followed by the forward path to (x,y) from (0,0). + + // Look for some special cases to avoid computing either of these paths. + if x == u { + // "babaab" "cccaba" + // already patched together + lcs := e.forwardlcs(df, kf) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + + // is (u-1,v) or (u,v-1) labelled df-1? + // if so, that forward df-1-path plus a horizontal or vertical edge + // is the df-path to (u,v), then plus the db-path to (N,M) + if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 { + // "aabbab" "cbcabc" + lcs := e.forwardlcs(df-1, u-1-v) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u { + // "abaabb" "bcacab" + lcs := e.forwardlcs(df-1, u-(v-1)) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + + // The path can't possibly contribute to the lcs because it + // is all horizontal or vertical edges + if u == 0 || v == 0 || x == e.ux || y == e.uy { + // "abaabb" "abaaaa" + if u == 0 || v == 0 { + return e.backwardlcs(db, kb) + } + return e.forwardlcs(df, kf) + } + + // is (x+1,y) or (x,y+1) labelled db-1? + if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 { + // "bababb" "baaabb" + lcs := e.backwardlcs(db-1, kb+1) + lcs = append(lcs, e.forwardlcs(df, kf)...) + return lcs.sort() + } + if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x { + // "abbbaa" "cabacc" + lcs := e.backwardlcs(db-1, kb-1) + lcs = append(lcs, e.forwardlcs(df, kf)...) + return lcs.sort() + } + + // need to compute another path + // "aabbaa" "aacaba" + lcs := e.backwardlcs(db, kb) + oldx, oldy := e.ux, e.uy + e.ux = u + e.uy = v + lcs = append(lcs, forward(e)...) 
+ e.ux, e.uy = oldx, oldy + return lcs.sort() +} diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/old_test.go b/pkg/plugin/processor/builtin/internal/diff/lcs/old_test.go new file mode 100644 index 000000000..789e9bc6c --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/old_test.go @@ -0,0 +1,251 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +import ( + "fmt" + "log" + "math/rand" + "os" + "strings" + "testing" +) + +func TestAlgosOld(t *testing.T) { + for i, algo := range []func(*editGraph) lcs{forward, backward, twosided} { + t.Run(strings.Fields("forward backward twosided")[i], func(t *testing.T) { + for _, tx := range Btests { + lim := len(tx.a) + len(tx.b) + + diffs, lcs := compute(stringSeqs{tx.a, tx.b}, algo, lim) + check(t, tx.a, lcs, tx.lcs) + checkDiffs(t, tx.a, diffs, tx.b) + + diffs, lcs = compute(stringSeqs{tx.b, tx.a}, algo, lim) + check(t, tx.b, lcs, tx.lcs) + checkDiffs(t, tx.b, diffs, tx.a) + } + }) + } +} + +func TestIntOld(t *testing.T) { + // need to avoid any characters in btests + lfill, rfill := "AAAAAAAAAAAA", "BBBBBBBBBBBB" + for _, tx := range Btests { + if len(tx.a) < 2 || len(tx.b) < 2 { + continue + } + left := tx.a + lfill + right := tx.b + rfill + lim := len(tx.a) + len(tx.b) + diffs, lcs := compute(stringSeqs{left, right}, twosided, lim) + check(t, left, lcs, tx.lcs) + checkDiffs(t, left, diffs, right) + diffs, lcs = compute(stringSeqs{right, left}, twosided, lim) + check(t, right, lcs, tx.lcs) + checkDiffs(t, right, diffs, left) + + left = lfill + tx.a + right = rfill + tx.b + diffs, lcs = compute(stringSeqs{left, right}, twosided, lim) + check(t, left, lcs, tx.lcs) + checkDiffs(t, left, diffs, right) + diffs, lcs = compute(stringSeqs{right, left}, twosided, lim) + check(t, right, lcs, tx.lcs) + checkDiffs(t, right, diffs, left) + } +} + +func TestSpecialOld(t *testing.T) { 
// exercises lcs.fix + a := "golang.org/x/tools/intern" + b := "github.com/google/safehtml/template\"\n\t\"golang.org/x/tools/intern" + diffs, lcs := compute(stringSeqs{a, b}, twosided, 4) + if !lcs.valid() { + t.Errorf("%d,%v", len(diffs), lcs) + } +} + +func TestRegressionOld001(t *testing.T) { + a := "// Copyright 2019 The Go Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license that can be found in the LICENSE file.\n\npackage diff_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"golang.org/x/tools/gopls/internal/lsp/diff\"\n\t\"github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest\"\n\t\"golang.org/x/tools/gopls/internal/span\"\n)\n" + + b := "// Copyright 2019 The Go Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license that can be found in the LICENSE file.\n\npackage diff_test\n\nimport (\n\t\"fmt\"\n\t\"math/rand\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/google/safehtml/template\"\n\t\"golang.org/x/tools/gopls/internal/lsp/diff\"\n\t\"github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest\"\n\t\"golang.org/x/tools/gopls/internal/span\"\n)\n" + for i := 1; i < len(b); i++ { + diffs, lcs := compute(stringSeqs{a, b}, twosided, i) // 14 from gopls + if !lcs.valid() { + t.Errorf("%d,%v", len(diffs), lcs) + } + checkDiffs(t, a, diffs, b) + } +} + +func TestRegressionOld002(t *testing.T) { + a := "n\"\n)\n" + b := "n\"\n\t\"golang.org/x//nnal/stack\"\n)\n" + for i := 1; i <= len(b); i++ { + diffs, lcs := compute(stringSeqs{a, b}, twosided, i) + if !lcs.valid() { + t.Errorf("%d,%v", len(diffs), lcs) + } + checkDiffs(t, a, diffs, b) + } +} + +func TestRegressionOld003(t *testing.T) { + a := "golang.org/x/hello v1.0.0\nrequire golang.org/x/unused v1" + b := "golang.org/x/hello v1" + for i := 1; i <= len(a); i++ { + diffs, lcs := compute(stringSeqs{a, b}, twosided, i) + if !lcs.valid() { + 
t.Errorf("%d,%v", len(diffs), lcs) + } + checkDiffs(t, a, diffs, b) + } +} + +func TestRandOld(t *testing.T) { + rand.Seed(1) + for i := 0; i < 1000; i++ { + // TODO(adonovan): use ASCII and bytesSeqs here? The use of + // non-ASCII isn't relevant to the property exercised by the test. + a := []rune(randstr("abω", 16)) + b := []rune(randstr("abωc", 16)) + seq := runesSeqs{a, b} + + const lim = 24 // large enough to get true lcs + _, forw := compute(seq, forward, lim) + _, back := compute(seq, backward, lim) + _, two := compute(seq, twosided, lim) + if lcslen(two) != lcslen(forw) || lcslen(forw) != lcslen(back) { + t.Logf("\n%v\n%v\n%v", forw, back, two) + t.Fatalf("%d forw:%d back:%d two:%d", i, lcslen(forw), lcslen(back), lcslen(two)) + } + if !two.valid() || !forw.valid() || !back.valid() { + t.Errorf("check failure") + } + } +} + +// TestDiffAPI tests the public API functions (Diff{Bytes,Strings,Runes}) +// to ensure at least miminal parity of the three representations. +func TestDiffAPI(t *testing.T) { + for _, test := range []struct { + a, b string + wantStrings, wantBytes, wantRunes string + }{ + {"abcXdef", "abcxdef", "[{3 4 3 4}]", "[{3 4 3 4}]", "[{3 4 3 4}]"}, // ASCII + {"abcωdef", "abcΩdef", "[{3 5 3 5}]", "[{3 5 3 5}]", "[{3 4 3 4}]"}, // non-ASCII + } { + + gotStrings := fmt.Sprint(DiffStrings(test.a, test.b)) + if gotStrings != test.wantStrings { + t.Errorf("DiffStrings(%q, %q) = %v, want %v", + test.a, test.b, gotStrings, test.wantStrings) + } + gotBytes := fmt.Sprint(DiffBytes([]byte(test.a), []byte(test.b))) + if gotBytes != test.wantBytes { + t.Errorf("DiffBytes(%q, %q) = %v, want %v", + test.a, test.b, gotBytes, test.wantBytes) + } + gotRunes := fmt.Sprint(DiffRunes([]rune(test.a), []rune(test.b))) + if gotRunes != test.wantRunes { + t.Errorf("DiffRunes(%q, %q) = %v, want %v", + test.a, test.b, gotRunes, test.wantRunes) + } + } +} + +func BenchmarkTwoOld(b *testing.B) { + tests := genBench("abc", 96) + for i := 0; i < b.N; i++ { + for _, tt := 
range tests { + _, two := compute(stringSeqs{tt.before, tt.after}, twosided, 100) + if !two.valid() { + b.Error("check failed") + } + } + } +} + +func BenchmarkForwOld(b *testing.B) { + tests := genBench("abc", 96) + for i := 0; i < b.N; i++ { + for _, tt := range tests { + _, two := compute(stringSeqs{tt.before, tt.after}, forward, 100) + if !two.valid() { + b.Error("check failed") + } + } + } +} + +func genBench(set string, n int) []struct{ before, after string } { + // before and after for benchmarks. 24 strings of length n with + // before and after differing at least once, and about 5% + rand.Seed(3) + var ans []struct{ before, after string } + for i := 0; i < 24; i++ { + // maybe b should have an approximately known number of diffs + a := randstr(set, n) + cnt := 0 + bb := make([]rune, 0, n) + for _, r := range a { + if rand.Float64() < .05 { + cnt++ + r = 'N' + } + bb = append(bb, r) + } + if cnt == 0 { + // avoid == shortcut + bb[n/2] = 'N' + } + ans = append(ans, struct{ before, after string }{a, string(bb)}) + } + return ans +} + +// This benchmark represents a common case for a diff command: +// large file with a single relatively small diff in the middle. +// (It's not clear whether this is representative of gopls workloads +// or whether it is important to gopls diff performance.) +// +// TODO(adonovan) opt: it could be much faster. For example, +// comparing a file against itself is about 10x faster than with the +// small deletion in the middle. Strangely, comparing a file against +// itself minus the last byte is faster still; I don't know why. +// There is much low-hanging fruit here for further improvement. 
+func BenchmarkLargeFileSmallDiff(b *testing.B) { + data, err := os.ReadFile("old.go") // large file + if err != nil { + log.Fatal(err) + } + + n := len(data) + + src := string(data) + dst := src[:n*49/100] + src[n*51/100:] // remove 2% from the middle + b.Run("string", func(b *testing.B) { + for i := 0; i < b.N; i++ { + compute(stringSeqs{src, dst}, twosided, len(src)+len(dst)) + } + }) + + srcBytes := []byte(src) + dstBytes := []byte(dst) + b.Run("bytes", func(b *testing.B) { + for i := 0; i < b.N; i++ { + compute(bytesSeqs{srcBytes, dstBytes}, twosided, len(srcBytes)+len(dstBytes)) + } + }) + + srcRunes := []rune(src) + dstRunes := []rune(dst) + b.Run("runes", func(b *testing.B) { + for i := 0; i < b.N; i++ { + compute(runesSeqs{srcRunes, dstRunes}, twosided, len(srcRunes)+len(dstRunes)) + } + }) +} diff --git a/pkg/plugin/processor/builtin/internal/diff/lcs/sequence.go b/pkg/plugin/processor/builtin/internal/diff/lcs/sequence.go new file mode 100644 index 000000000..2d72d2630 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/lcs/sequence.go @@ -0,0 +1,113 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +// This file defines the abstract sequence over which the LCS algorithm operates. + +// sequences abstracts a pair of sequences, A and B. 
+type sequences interface { + lengths() (int, int) // len(A), len(B) + commonPrefixLen(ai, aj, bi, bj int) int // len(commonPrefix(A[ai:aj], B[bi:bj])) + commonSuffixLen(ai, aj, bi, bj int) int // len(commonSuffix(A[ai:aj], B[bi:bj])) +} + +type stringSeqs struct{ a, b string } + +func (s stringSeqs) lengths() (int, int) { return len(s.a), len(s.b) } +func (s stringSeqs) commonPrefixLen(ai, aj, bi, bj int) int { + return commonPrefixLenString(s.a[ai:aj], s.b[bi:bj]) +} +func (s stringSeqs) commonSuffixLen(ai, aj, bi, bj int) int { + return commonSuffixLenString(s.a[ai:aj], s.b[bi:bj]) +} + +// The explicit capacity in s[i:j:j] leads to more efficient code. + +type bytesSeqs struct{ a, b []byte } + +func (s bytesSeqs) lengths() (int, int) { return len(s.a), len(s.b) } +func (s bytesSeqs) commonPrefixLen(ai, aj, bi, bj int) int { + return commonPrefixLenBytes(s.a[ai:aj:aj], s.b[bi:bj:bj]) +} +func (s bytesSeqs) commonSuffixLen(ai, aj, bi, bj int) int { + return commonSuffixLenBytes(s.a[ai:aj:aj], s.b[bi:bj:bj]) +} + +type runesSeqs struct{ a, b []rune } + +func (s runesSeqs) lengths() (int, int) { return len(s.a), len(s.b) } +func (s runesSeqs) commonPrefixLen(ai, aj, bi, bj int) int { + return commonPrefixLenRunes(s.a[ai:aj:aj], s.b[bi:bj:bj]) +} +func (s runesSeqs) commonSuffixLen(ai, aj, bi, bj int) int { + return commonSuffixLenRunes(s.a[ai:aj:aj], s.b[bi:bj:bj]) +} + +// TODO(adonovan): optimize these functions using ideas from: +// - https://go.dev/cl/408116 common.go +// - https://go.dev/cl/421435 xor_generic.go + +// TODO(adonovan): factor using generics when available, +// but measure performance impact. + +// commonPrefixLen* returns the length of the common prefix of a[ai:aj] and b[bi:bj]. 
+func commonPrefixLenBytes(a, b []byte) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[i] == b[i] { + i++ + } + return i +} +func commonPrefixLenRunes(a, b []rune) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[i] == b[i] { + i++ + } + return i +} +func commonPrefixLenString(a, b string) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[i] == b[i] { + i++ + } + return i +} + +// commonSuffixLen* returns the length of the common suffix of a[ai:aj] and b[bi:bj]. +func commonSuffixLenBytes(a, b []byte) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[len(a)-1-i] == b[len(b)-1-i] { + i++ + } + return i +} +func commonSuffixLenRunes(a, b []rune) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[len(a)-1-i] == b[len(b)-1-i] { + i++ + } + return i +} +func commonSuffixLenString(a, b string) int { + n := min(len(a), len(b)) + i := 0 + for i < n && a[len(a)-1-i] == b[len(b)-1-i] { + i++ + } + return i +} + +func min(x, y int) int { + if x < y { + return x + } else { + return y + } +} diff --git a/pkg/plugin/processor/builtin/internal/diff/myers/diff.go b/pkg/plugin/processor/builtin/internal/diff/myers/diff.go new file mode 100644 index 000000000..d2b8d1ee6 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/myers/diff.go @@ -0,0 +1,246 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package myers implements the Myers diff algorithm. +package myers + +import ( + "strings" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff" +) + +// Sources: +// https://blog.jcoglan.com/2017/02/17/the-myers-diff-algorithm-part-3/ +// https://www.codeproject.com/Articles/42279/%2FArticles%2F42279%2FInvestigating-Myers-diff-algorithm-Part-1-of-2 + +// ComputeEdits returns the diffs of two strings using a simple +// line-based implementation, like [diff.Strings]. 
+// +// Deprecated: this implementation is moribund. However, when diffs +// appear in marker test expectations, they are the particular diffs +// produced by this implementation. The marker test framework +// asserts diff(orig, got)==wantDiff, but ideally it would compute +// got==apply(orig, wantDiff) so that the notation of the diff +// is immaterial. +func ComputeEdits(before, after string) []diff.Edit { + beforeLines := splitLines(before) + ops := operations(beforeLines, splitLines(after)) + + // Build a table mapping line number to offset. + lineOffsets := make([]int, 0, len(beforeLines)+1) + total := 0 + for i := range beforeLines { + lineOffsets = append(lineOffsets, total) + total += len(beforeLines[i]) + } + lineOffsets = append(lineOffsets, total) // EOF + + edits := make([]diff.Edit, 0, len(ops)) + for _, op := range ops { + start, end := lineOffsets[op.I1], lineOffsets[op.I2] + switch op.Kind { + case opDelete: + // Delete: before[I1:I2] is deleted. + edits = append(edits, diff.Edit{Start: start, End: end}) + case opInsert: + // Insert: after[J1:J2] is inserted at before[I1:I1]. + if content := strings.Join(op.Content, ""); content != "" { + edits = append(edits, diff.Edit{Start: start, End: end, New: content}) + } + } + } + return edits +} + +// opKind is used to denote the type of operation a line represents. 
+type opKind int + +const ( + opDelete opKind = iota // line deleted from input (-) + opInsert // line inserted into output (+) + opEqual // line present in input and output +) + +func (kind opKind) String() string { + switch kind { + case opDelete: + return "delete" + case opInsert: + return "insert" + case opEqual: + return "equal" + default: + panic("unknown opKind") + } +} + +type operation struct { + Kind opKind + Content []string // content from b + I1, I2 int // indices of the line in a + J1 int // indices of the line in b, J2 implied by len(Content) +} + +// operations returns the list of operations to convert a into b, consolidating +// operations for multiple lines and not including equal lines. +func operations(a, b []string) []*operation { + if len(a) == 0 && len(b) == 0 { + return nil + } + + trace, offset := shortestEditSequence(a, b) + snakes := backtrack(trace, len(a), len(b), offset) + + M, N := len(a), len(b) + + var i int + solution := make([]*operation, len(a)+len(b)) + + add := func(op *operation, i2, j2 int) { + if op == nil { + return + } + op.I2 = i2 + if op.Kind == opInsert { + op.Content = b[op.J1:j2] + } + solution[i] = op + i++ + } + x, y := 0, 0 + for _, snake := range snakes { + if len(snake) < 2 { + continue + } + var op *operation + // delete (horizontal) + for snake[0]-snake[1] > x-y { + if op == nil { + op = &operation{ + Kind: opDelete, + I1: x, + J1: y, + } + } + x++ + if x == M { + break + } + } + add(op, x, y) + op = nil + // insert (vertical) + for snake[0]-snake[1] < x-y { + if op == nil { + op = &operation{ + Kind: opInsert, + I1: x, + J1: y, + } + } + y++ + } + add(op, x, y) + op = nil + // equal (diagonal) + for x < snake[0] { + x++ + y++ + } + if x >= M && y >= N { + break + } + } + return solution[:i] +} + +// backtrack uses the trace for the edit sequence computation and returns the +// "snakes" that make up the solution. A "snake" is a single deletion or +// insertion followed by zero or diagonals. 
+func backtrack(trace [][]int, x, y, offset int) [][]int { + snakes := make([][]int, len(trace)) + d := len(trace) - 1 + for ; x > 0 && y > 0 && d > 0; d-- { + V := trace[d] + if len(V) == 0 { + continue + } + snakes[d] = []int{x, y} + + k := x - y + + var kPrev int + if k == -d || (k != d && V[k-1+offset] < V[k+1+offset]) { + kPrev = k + 1 + } else { + kPrev = k - 1 + } + + x = V[kPrev+offset] + y = x - kPrev + } + if x < 0 || y < 0 { + return snakes + } + snakes[d] = []int{x, y} + return snakes +} + +// shortestEditSequence returns the shortest edit sequence that converts a into b. +func shortestEditSequence(a, b []string) ([][]int, int) { + M, N := len(a), len(b) + V := make([]int, 2*(N+M)+1) + offset := N + M + trace := make([][]int, N+M+1) + + // Iterate through the maximum possible length of the SES (N+M). + for d := 0; d <= N+M; d++ { + copyV := make([]int, len(V)) + // k lines are represented by the equation y = x - k. We move in + // increments of 2 because end points for even d are on even k lines. + for k := -d; k <= d; k += 2 { + // At each point, we either go down or to the right. We go down if + // k == -d, and we go to the right if k == d. We also prioritize + // the maximum x value, because we prefer deletions to insertions. + var x int + if k == -d || (k != d && V[k-1+offset] < V[k+1+offset]) { + x = V[k+1+offset] // down + } else { + x = V[k-1+offset] + 1 // right + } + + y := x - k + + // Diagonal moves while we have equal contents. + for x < M && y < N && a[x] == b[y] { + x++ + y++ + } + + V[k+offset] = x + + // Return if we've exceeded the maximum values. + if x == M && y == N { + // Makes sure to save the state of the array before returning. + copy(copyV, V) + trace[d] = copyV + return trace, offset + } + } + + // Save the state of the array. 
+ copy(copyV, V) + trace[d] = copyV + } + return nil, 0 +} + +func splitLines(text string) []string { + lines := strings.SplitAfter(text, "\n") + if lines[len(lines)-1] == "" { + lines = lines[:len(lines)-1] + } + return lines +} diff --git a/pkg/plugin/processor/builtin/internal/diff/myers/diff_test.go b/pkg/plugin/processor/builtin/internal/diff/myers/diff_test.go new file mode 100644 index 000000000..98fb250c9 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/myers/diff_test.go @@ -0,0 +1,16 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package myers_test + +import ( + "testing" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/difftest" + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/myers" +) + +func TestDiff(t *testing.T) { + difftest.DiffTest(t, myers.ComputeEdits) +} diff --git a/pkg/plugin/processor/builtin/internal/diff/ndiff.go b/pkg/plugin/processor/builtin/internal/diff/ndiff.go new file mode 100644 index 000000000..65b3fdb2f --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/ndiff.go @@ -0,0 +1,99 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +import ( + "bytes" + "unicode/utf8" + + "github.com/conduitio/conduit/pkg/plugin/processor/builtin/internal/diff/lcs" +) + +// Strings computes the differences between two strings. +// The resulting edits respect rune boundaries. +func Strings(before, after string) []Edit { + if before == after { + return nil // common case + } + + if isASCII(before) && isASCII(after) { + // TODO(adonovan): opt: specialize diffASCII for strings. 
+ return diffASCII([]byte(before), []byte(after)) + } + return diffRunes([]rune(before), []rune(after)) +} + +// Bytes computes the differences between two byte slices. +// The resulting edits respect rune boundaries. +func Bytes(before, after []byte) []Edit { + if bytes.Equal(before, after) { + return nil // common case + } + + if isASCII(before) && isASCII(after) { + return diffASCII(before, after) + } + return diffRunes(runes(before), runes(after)) +} + +func diffASCII(before, after []byte) []Edit { + diffs := lcs.DiffBytes(before, after) + + // Convert from LCS diffs. + res := make([]Edit, len(diffs)) + for i, d := range diffs { + res[i] = Edit{d.Start, d.End, string(after[d.ReplStart:d.ReplEnd])} + } + return res +} + +func diffRunes(before, after []rune) []Edit { + diffs := lcs.DiffRunes(before, after) + + // The diffs returned by the lcs package use indexes + // into whatever slice was passed in. + // Convert rune offsets to byte offsets. + res := make([]Edit, len(diffs)) + lastEnd := 0 + utf8Len := 0 + for i, d := range diffs { + utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits + start := utf8Len + utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit + res[i] = Edit{start, utf8Len, string(after[d.ReplStart:d.ReplEnd])} + lastEnd = d.End + } + return res +} + +// runes is like []rune(string(bytes)) without the duplicate allocation. +func runes(bytes []byte) []rune { + n := utf8.RuneCount(bytes) + runes := make([]rune, n) + for i := 0; i < n; i++ { + r, sz := utf8.DecodeRune(bytes) + bytes = bytes[sz:] + runes[i] = r + } + return runes +} + +// runesLen returns the length in bytes of the UTF-8 encoding of runes. +func runesLen(runes []rune) (len int) { + for _, r := range runes { + len += utf8.RuneLen(r) + } + return len +} + +// isASCII reports whether s contains only ASCII. 
+func isASCII[S string | []byte](s S) bool { + for i := 0; i < len(s); i++ { + if s[i] >= utf8.RuneSelf { + return false + } + } + return true +} diff --git a/pkg/plugin/processor/builtin/internal/diff/testenv/testenv.go b/pkg/plugin/processor/builtin/internal/diff/testenv/testenv.go new file mode 100644 index 000000000..d142e9356 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/testenv/testenv.go @@ -0,0 +1,199 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package testenv + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "runtime/debug" + "strings" + "sync" + "testing" +) + +// packageMainIsDevel reports whether the module containing package main +// is a development version (if module information is available). +func packageMainIsDevel() bool { + info, ok := debug.ReadBuildInfo() + if !ok { + // Most test binaries currently lack build info, but this should become more + // permissive once https://golang.org/issue/33976 is fixed. + return true + } + + // Note: info.Main.Version describes the version of the module containing + // package main, not the version of “the main module”. + // See https://golang.org/issue/33975. 
+ return info.Main.Version == "(devel)" +} + +var checkGoBuild struct { + once sync.Once + err error +} + +func hasTool(tool string) error { + if tool == "cgo" { + enabled, err := cgoEnabled(false) + if err != nil { + return fmt.Errorf("checking cgo: %v", err) + } + if !enabled { + return fmt.Errorf("cgo not enabled") + } + return nil + } + + _, err := exec.LookPath(tool) + if err != nil { + return err + } + + switch tool { + case "patch": + // check that the patch tools supports the -o argument + temp, err := os.CreateTemp("", "patch-test") + if err != nil { + return err + } + temp.Close() + defer os.Remove(temp.Name()) + cmd := exec.Command(tool, "-o", temp.Name()) + if err := cmd.Run(); err != nil { + return err + } + + case "go": + checkGoBuild.once.Do(func() { + if runtime.GOROOT() != "" { + // Ensure that the 'go' command found by exec.LookPath is from the correct + // GOROOT. Otherwise, 'some/path/go test ./...' will test against some + // version of the 'go' binary other than 'some/path/go', which is almost + // certainly not what the user intended. 
+ out, err := exec.Command(tool, "env", "GOROOT").CombinedOutput() + if err != nil { + checkGoBuild.err = err + return + } + GOROOT := strings.TrimSpace(string(out)) + if GOROOT != runtime.GOROOT() { + checkGoBuild.err = fmt.Errorf("'go env GOROOT' does not match runtime.GOROOT:\n\tgo env: %s\n\tGOROOT: %s", GOROOT, runtime.GOROOT()) + return + } + } + + dir, err := os.MkdirTemp("", "testenv-*") + if err != nil { + checkGoBuild.err = err + return + } + defer os.RemoveAll(dir) + + mainGo := filepath.Join(dir, "main.go") + if err := os.WriteFile(mainGo, []byte("package main\nfunc main() {}\n"), 0644); err != nil { + checkGoBuild.err = err + return + } + cmd := exec.Command("go", "build", "-o", os.DevNull, mainGo) + cmd.Dir = dir + if out, err := cmd.CombinedOutput(); err != nil { + if len(out) > 0 { + checkGoBuild.err = fmt.Errorf("%v: %v\n%s", cmd, err, out) + } else { + checkGoBuild.err = fmt.Errorf("%v: %v", cmd, err) + } + } + }) + if checkGoBuild.err != nil { + return checkGoBuild.err + } + + case "diff": + // Check that diff is the GNU version, needed for the -u argument and + // to report missing newlines at the end of files. + out, err := exec.Command(tool, "-version").Output() + if err != nil { + return err + } + if !bytes.Contains(out, []byte("GNU diffutils")) { + return fmt.Errorf("diff is not the GNU version") + } + } + + return nil +} + +func cgoEnabled(bypassEnvironment bool) (bool, error) { + cmd := exec.Command("go", "env", "CGO_ENABLED") + if bypassEnvironment { + cmd.Env = append(append([]string(nil), os.Environ()...), "CGO_ENABLED=") + } + out, err := cmd.CombinedOutput() + if err != nil { + return false, err + } + enabled := strings.TrimSpace(string(out)) + return enabled == "1", nil +} + +func allowMissingTool(tool string) bool { + switch runtime.GOOS { + case "aix", "darwin", "dragonfly", "freebsd", "illumos", "linux", "netbsd", "openbsd", "plan9", "solaris", "windows": + // Known non-mobile OS. Expect a reasonably complete environment. 
+ default: + return true + } + + switch tool { + case "cgo": + if strings.HasSuffix(os.Getenv("GO_BUILDER_NAME"), "-nocgo") { + // Explicitly disabled on -nocgo builders. + return true + } + if enabled, err := cgoEnabled(true); err == nil && !enabled { + // No platform support. + return true + } + case "go": + if os.Getenv("GO_BUILDER_NAME") == "illumos-amd64-joyent" { + // Work around a misconfigured builder (see https://golang.org/issue/33950). + return true + } + case "diff": + if os.Getenv("GO_BUILDER_NAME") != "" { + return true + } + case "patch": + if os.Getenv("GO_BUILDER_NAME") != "" { + return true + } + } + + // If a developer is actively working on this test, we expect them to have all + // of its dependencies installed. However, if it's just a dependency of some + // other module (for example, being run via 'go test all'), we should be more + // tolerant of unusual environments. + return !packageMainIsDevel() +} + +// NeedsTool skips t if the named tool is not present in the path. +// As a special case, "cgo" means "go" is present and can compile cgo programs. +func NeedsTool(t testing.TB, tool string) { + err := hasTool(tool) + if err == nil { + return + } + + t.Helper() + if allowMissingTool(tool) { + t.Skipf("skipping because %s tool not available: %v", tool, err) + } else { + t.Fatalf("%s tool not available: %v", tool, err) + } +} diff --git a/pkg/plugin/processor/builtin/internal/diff/unified.go b/pkg/plugin/processor/builtin/internal/diff/unified.go new file mode 100644 index 000000000..cfbda6102 --- /dev/null +++ b/pkg/plugin/processor/builtin/internal/diff/unified.go @@ -0,0 +1,251 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +import ( + "fmt" + "log" + "strings" +) + +// DefaultContextLines is the number of unchanged lines of surrounding +// context displayed by Unified. 
Use ToUnified to specify a different value. +const DefaultContextLines = 3 + +// Unified returns a unified diff of the old and new strings. +// The old and new labels are the names of the old and new files. +// If the strings are equal, it returns the empty string. +func Unified(oldLabel, newLabel, old, new string) string { + edits := Strings(old, new) + unified, err := ToUnified(oldLabel, newLabel, old, edits, DefaultContextLines) + if err != nil { + // Can't happen: edits are consistent. + log.Fatalf("internal error in diff.Unified: %v", err) + } + return unified +} + +// ToUnified applies the edits to content and returns a unified diff, +// with contextLines lines of (unchanged) context around each diff hunk. +// The old and new labels are the names of the content and result files. +// It returns an error if the edits are inconsistent; see ApplyEdits. +func ToUnified(oldLabel, newLabel, content string, edits []Edit, contextLines int) (string, error) { + u, err := toUnified(oldLabel, newLabel, content, edits, contextLines) + if err != nil { + return "", err + } + return u.String(), nil +} + +// unified represents a set of edits as a unified diff. +type unified struct { + // from is the name of the original file. + from string + // to is the name of the modified file. + to string + // hunks is the set of edit hunks needed to transform the file content. + hunks []*hunk +} + +// Hunk represents a contiguous set of line edits to apply. +type hunk struct { + // The line in the original source where the hunk starts. + fromLine int + // The line in the original source where the hunk finishes. + toLine int + // The set of line based edits to apply. + lines []line +} + +// Line represents a single line operation to apply as part of a Hunk. +type line struct { + // kind is the type of line this represents, deletion, insertion or copy. + kind opKind + // content is the content of this line. 
+ // For deletion it is the line being removed, for all others it is the line + // to put in the output. + content string +} + +// opKind is used to denote the type of operation a line represents. +type opKind int + +const ( + // opDelete is the operation kind for a line that is present in the input + // but not in the output. + opDelete opKind = iota + // opInsert is the operation kind for a line that is new in the output. + opInsert + // opEqual is the operation kind for a line that is the same in the input and + // output, often used to provide context around edited lines. + opEqual +) + +// String returns a human readable representation of an OpKind. It is not +// intended for machine processing. +func (k opKind) String() string { + switch k { + case opDelete: + return "delete" + case opInsert: + return "insert" + case opEqual: + return "equal" + default: + panic("unknown operation kind") + } +} + +// toUnified takes a file contents and a sequence of edits, and calculates +// a unified diff that represents those edits. +func toUnified(fromName, toName string, content string, edits []Edit, contextLines int) (unified, error) { + gap := contextLines * 2 + u := unified{ + from: fromName, + to: toName, + } + if len(edits) == 0 { + return u, nil + } + var err error + edits, err = lineEdits(content, edits) // expand to whole lines + if err != nil { + return u, err + } + lines := splitLines(content) + var h *hunk + last := 0 + toLine := 0 + for _, edit := range edits { + // Compute the zero-based line numbers of the edit start and end. + // TODO(adonovan): opt: compute incrementally, avoid O(n^2). 
+ start := strings.Count(content[:edit.Start], "\n") + end := strings.Count(content[:edit.End], "\n") + if edit.End == len(content) && len(content) > 0 && content[len(content)-1] != '\n' { + end++ // EOF counts as an implicit newline + } + + switch { + case h != nil && start == last: + //direct extension + case h != nil && start <= last+gap: + //within range of previous lines, add the joiners + addEqualLines(h, lines, last, start) + default: + //need to start a new hunk + if h != nil { + // add the edge to the previous hunk + addEqualLines(h, lines, last, last+contextLines) + u.hunks = append(u.hunks, h) + } + toLine += start - last + h = &hunk{ + fromLine: start + 1, + toLine: toLine + 1, + } + // add the edge to the new hunk + delta := addEqualLines(h, lines, start-contextLines, start) + h.fromLine -= delta + h.toLine -= delta + } + last = start + for i := start; i < end; i++ { + h.lines = append(h.lines, line{kind: opDelete, content: lines[i]}) + last++ + } + if edit.New != "" { + for _, content := range splitLines(edit.New) { + h.lines = append(h.lines, line{kind: opInsert, content: content}) + toLine++ + } + } + } + if h != nil { + // add the edge to the final hunk + addEqualLines(h, lines, last, last+contextLines) + u.hunks = append(u.hunks, h) + } + return u, nil +} + +func splitLines(text string) []string { + lines := strings.SplitAfter(text, "\n") + if lines[len(lines)-1] == "" { + lines = lines[:len(lines)-1] + } + return lines +} + +func addEqualLines(h *hunk, lines []string, start, end int) int { + delta := 0 + for i := start; i < end; i++ { + if i < 0 { + continue + } + if i >= len(lines) { + return delta + } + h.lines = append(h.lines, line{kind: opEqual, content: lines[i]}) + delta++ + } + return delta +} + +// String converts a unified diff to the standard textual form for that diff. +// The output of this function can be passed to tools like patch. 
+func (u unified) String() string { + if len(u.hunks) == 0 { + return "" + } + b := new(strings.Builder) + fmt.Fprintf(b, "--- %s\n", u.from) + fmt.Fprintf(b, "+++ %s\n", u.to) + for _, hunk := range u.hunks { + fromCount, toCount := 0, 0 + for _, l := range hunk.lines { + switch l.kind { + case opDelete: + fromCount++ + case opInsert: + toCount++ + default: + fromCount++ + toCount++ + } + } + fmt.Fprint(b, "@@") + if fromCount > 1 { + fmt.Fprintf(b, " -%d,%d", hunk.fromLine, fromCount) + } else if hunk.fromLine == 1 && fromCount == 0 { + // Match odd GNU diff -u behavior adding to empty file. + fmt.Fprintf(b, " -0,0") + } else { + fmt.Fprintf(b, " -%d", hunk.fromLine) + } + if toCount > 1 { + fmt.Fprintf(b, " +%d,%d", hunk.toLine, toCount) + } else if hunk.toLine == 1 && toCount == 0 { + // Match odd GNU diff -u behavior adding to empty file. + fmt.Fprintf(b, " +0,0") + } else { + fmt.Fprintf(b, " +%d", hunk.toLine) + } + fmt.Fprint(b, " @@\n") + for _, l := range hunk.lines { + switch l.kind { + case opDelete: + fmt.Fprintf(b, "-%s", l.content) + case opInsert: + fmt.Fprintf(b, "+%s", l.content) + default: + fmt.Fprintf(b, " %s", l.content) + } + if !strings.HasSuffix(l.content, "\n") { + fmt.Fprintf(b, "\n\\ No newline at end of file\n") + } + } + } + return b.String() +}