From a181dd9c6443297acf619a6a8c2aba3e8cc30f7d Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Thu, 25 Jul 2024 01:33:11 -0700 Subject: [PATCH] tools/internal/parser: add diff support (#2071) * tool/internal/parser: add a change marker to blocks As part of that, refactor slightly to hide fields that should not be editable outside of the package, and provide read-only accessors instead. * tool/internal/parser: add diff support List.SetBaseVersion takes an older base List, and makes Block.Changed() reflect whether or not each block has changed compared to that base. * tools: remove toolchain directive from go.mod Some stdlib features require Go 1.22, we don't need to mandate a specific minimum version of 1.22. --- tools/go.mod | 2 - tools/internal/parser/diff.go | 326 ++++++++++++++++++ tools/internal/parser/diff_test.go | 480 +++++++++++++++++++++++++++ tools/internal/parser/file.go | 61 +++- tools/internal/parser/parser.go | 24 +- tools/internal/parser/parser_test.go | 34 +- tools/internal/parser/text.go | 6 - tools/internal/parser/text_test.go | 23 +- 8 files changed, 911 insertions(+), 45 deletions(-) create mode 100644 tools/internal/parser/diff.go create mode 100644 tools/internal/parser/diff_test.go diff --git a/tools/go.mod b/tools/go.mod index 834cc24c7..75a047efc 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -2,8 +2,6 @@ module github.com/publicsuffix/list/tools go 1.22 -toolchain go1.22.5 - require ( github.com/google/go-cmp v0.6.0 golang.org/x/net v0.26.0 diff --git a/tools/internal/parser/diff.go b/tools/internal/parser/diff.go new file mode 100644 index 000000000..69f5df335 --- /dev/null +++ b/tools/internal/parser/diff.go @@ -0,0 +1,326 @@ +package parser + +import ( + "fmt" + "strings" +) + +// SetBaseVersion sets the list's base of comparison to old, and +// updates the changed/unchanged annotations on all Blocks to match. +// +// If wholeSuffixBlocks is true, any changed Suffix or Wildcard within +// a Suffixes block marks all suffixes and wildcards in that block as +// changed. +// +// Precise marking (wholeSuffixBlocks=false) is intended for +// maintainer and machine edits, where change-aware validators should +// exaine only the specific changed items. +// +// Expansive marking (wholeSuffixBlocks=true) is intended for external +// PRs from suffix block owners, to opportunistically point out more +// issues that they have the knowledge and authority to fix. +func (l *List) SetBaseVersion(old *List, wholeSuffixBlocks bool) { + diff := differ{ + oldCnt: map[string]int{}, + inCurrent: map[string][][]Block{}, + keys: map[Block]string{}, + + wholeSuffixBlocks: wholeSuffixBlocks, + } + + // Tree diff is an open area of research, and it's possible to use + // extremely fancy (and slow) algorithms. Thankfully, the PSL has + // some additional domain-specific properties that let us take + // shortcuts and implement something O(n). + // + // First, academic tree_diff(OLD,NEW) produces an "edit script" as + // the output, which describes how to add, delete, move and mutate + // tree nodes to transform the OLD tree into the NEW tree. For the + // PSL, we don't care about the exact structural changes, we just + // need to know if we can skip validation checks. So we have to + // answer a simple question: is a given block in NEW also present + // in OLD? + // + // Second, all nodes in a well-formed list have a stable unique + // identity. We can use this to answer the previous question in + // constant time, instead of having to do complex tree analysis to + // locate equivalent nodes. + // + // Node identities may be duplicated in an ill-formed List, for + // example a suffix block that lists the same suffix twice. We + // deal with this using brute force, and mark all duplicate + // identities as changed. This means that a malformed PSL file + // might report more changes than the strict minimum, but in + // practice it's not much more, and in exchange we don't have to + // do anything complex to decide what to revalidate. + // + // Third, how do we propagate child changes to parents? This is + // where academic algorithms quickly go into O(n^3) + // territory. Once again, we avoid this with brute force: a + // changed tree node marks all its parents as changed as + // well. That means that if you fix a typo in one Suffix, we say + // that the Suffix changed, but also its parent Suffixes, Section, + // and List nodes. + // + // We could theoretically dirty fewer nodes in some cases, but + // that introduces a risk of false negatives (we forget to re-run + // a necessary validation), and it makes the diff harder to reason + // about when writing validators. In practice, this slightly + // pessimistic dirtying is cheap for the currently-planned + // validators, so we stick with the behavior that is easy to + // reason about and simple to implement. + // + // Finally, we need to do something about deleted nodes. We can + // handle that with a single additional pass through the OLD list, + // thanks to the node identity property. Again for simplicity, we + // treat deletions similar to edits: all the parents of a deleted + // node are marked dirty. Again we could be more precise here, but + // in practice it's currently cheap to be pessimistic, and makes + // the code and mental model simpler. + // + // There are various optimizations possible for this code. The + // biggest would be doing something more efficient to track block + // identities, which are currently expressed as big strings + // because that makes them convenient to compare and use as map + // keys. However, this algorithm as currently implemented takes + // <100ms to diff a full PSL file, so for now we err on the side + // of simplicity. + + // Compile the identities of all the blocks in old. + diff.scanOld(old, "") + // Mark unchanged blocks. Thanks to the previous step, each tree + // node can be checked in O(1) time. + diff.scanCurrent(l, "", nil) + // Dirty the parents of deleted blocks. + diff.markDeletions(old, "") +} + +type differ struct { + // wholeSuffixBlocks is whether Suffix/Wildcard changes propagate + // to all children of the parent Suffixes block. + wholeSuffixBlocks bool + + // oldCnt counts the number of blocks in the old list with a given + // identity key. + oldCnt map[string]int + + // inCurrent maps block identity keys to the tree paths in of the + // current list with that identity. Given a block with identity K, + // inCurrent[K] is a list of paths. In each path, path[0] is a + // block with identity K, and path[1..n] are its parents going + // back to the root of the tree. + // + // In a well-formed List, each cache entry has a single path, but + // we track duplicates in order to function correctly on malformed + // lists as well. + inCurrent map[string][][]Block + + // keys caches identity keys by block pointer. There are several + // passes of traversal through trees, and when old and current are + // nearly identical (the common case) this can save significant + // CPU time. + keys map[Block]string +} + +// scanOld records b and its children in d.oldCnt. +func (d *differ) scanOld(b Block, parentKey string) { + k := d.getKey(b, parentKey) + d.oldCnt[k]++ + for _, child := range b.Children() { + d.scanOld(child, k) + } +} + +// scanCurrent adds b and all its children to b.inCurrent, and updates +// their isUnchanged annotation based on the information in d.oldCnt. +func (d *differ) scanCurrent(curBlock Block, parentKey string, parents []Block) { + k := d.getKey(curBlock, parentKey) + + path := make([]Block, 0, len(parents)+1) + path = append(path, curBlock) + path = append(path, parents...) + + // Assume we're unchanged to start with. The job of the remaining + // diff code is to falsify this claim and mark the node as changed + // if needed. + // + // Setting this early and unconditionally lets us optimize the + // logic in markChanged, by ensuring that each node transitions + // false->true only once, before any possible true->false + // transitions that affect it. + curBlock.info().isUnchanged = true + + // Record the path to the current block, and if it's a + // doppelganger of some other Block, mark changed. Tracking diffs + // of duplicates requires solving some hard theoretical problems + // of tree diff, so we don't bother. + // + // Duplicate identities only happens on a malformed PSL, and we + // can save a lot of pain by just over-rechecking such PSLs + // slightly. + d.inCurrent[k] = append(d.inCurrent[k], path) + if l := len(d.inCurrent[k]); l == 2 { + // This is the first duplicate, previous path didn't know it + // wasn't unique. Mark both the current and earlier path as + // changed. + d.markChanged(d.inCurrent[k]...) + } else if l > 2 { + // Previous paths already marked, only curBlock's one needs + // updating. + d.markChanged(path) + } + + // This covers both the case where a block is new (oldCnt of 0), + // and the case where this block isn't a dupe in current, but was + // a dupe in old. In that case, like above we avoid algorithmic + // headaches by just dirtying the block instead of trying to + // resolve which version of the old dupes we're looking at. + if d.oldCnt[k] != 1 { + d.markChanged(path) + } + + // Scan through child subtrees. These subtrees may call + // markChanged and set Unchanged=false on us. + for _, child := range curBlock.Children() { + d.scanCurrent(child, k, path) + } + + // If the caller requested, and we're changed anyway, see if we + // should propagate the change back downwards again. + if !curBlock.info().isUnchanged { + d.maybeMarkWholeSuffixBlock(path) + } +} + +// markDeletions marks parents of deleted nodes as changed in current. +// +// For example, if the diff contains a suffix deletion, this will mark +// the enclosing Suffixes block as changed. +func (d *differ) markDeletions(oldBlock Block, parentKey string) bool { + k := d.getKey(oldBlock, parentKey) + + pathsInCurrent, ok := d.inCurrent[k] + if !ok { + // oldBlock was deleted, report to caller. + return true + } + + childDeleted := false + for _, child := range oldBlock.Children() { + if d.markDeletions(child, k) { + // Note, can't short-circuit here because there may be + // other paths under this block that also need to be + // updated. We're not only trying to update oldBlock, but + // also all of its children. + childDeleted = true + } + } + + // Children were deleted, mark ourselves changed. This implicitly + // also marks the parent as changed, so no need to tell it that a + // change happened, it'll just do extra no-op work. + if childDeleted { + d.markChanged(pathsInCurrent...) + } + + return false +} + +// maybeMarkWholeSuffixBlock calls markSuffixAndWildcardChanged on all +// Suffixes in path, if the caller of MarkUnchanged requested +// expansive marking. +func (d *differ) maybeMarkWholeSuffixBlock(path []Block) { + if !d.wholeSuffixBlocks { + return + } + + switch path[0].(type) { + case *Suffixes, *Suffix, *Wildcard: + for i, parent := range path { + if _, ok := parent.(*Suffixes); ok { + d.markSuffixAndWildcardChanged(parent, path[i+1:]) + } + } + } +} + +// markSuffixAndWildcardChanged marks as changed all Suffix and +// Wildcard blocks in the tree rooted at curBlock. +func (d *differ) markSuffixAndWildcardChanged(curBlock Block, parents []Block) { + path := append([]Block{curBlock}, parents...) + + switch curBlock.(type) { + case *Suffix, *Wildcard: + d.markChanged(path) + default: + for _, child := range curBlock.Children() { + d.markSuffixAndWildcardChanged(child, path) + } + } +} + +// markChanged marks as changed all the blocks in paths. +func (d *differ) markChanged(paths ...[]Block) { +pathLoop: + for _, path := range paths { + for _, b := range path { + if b.info().isUnchanged == false { + // We never mark a node as changed in isolation, we + // always propagate the change to all its + // parents. Therefore, we can stop the upwards + // traversal in this path as soon as we find any node + // that's already in the correct state. + continue pathLoop + } + b.info().isUnchanged = false + } + } +} + +// getKey returns the identity key for blk, which must be a direct +// child of parentKey. getKey keeps a cache of all keys built in the +// lifetime of this differ, to make future calls more efficient. +func (d *differ) getKey(blk Block, parentKey string) string { + ret, ok := d.keys[blk] + if !ok { + ret = d.makeKey(blk, parentKey) + d.keys[blk] = ret + } + return ret +} + +// makeKey builds the identity key of blk, which must be a child node +// of parentKey. +func (d *differ) makeKey(b Block, parentKey string) string { + switch v := b.(type) { + case *List: + return fmt.Sprintf("%s;List", parentKey) + case *Section: + return fmt.Sprintf("%s;Section,%q", parentKey, v.Name) + case *Suffixes: + // Note parsed suffix metadata isn't included in the identity, + // to avoid marking all suffixes in a block changed when + // someone adjusts their URL or email. Such edits will still + // indirectly dirty the block, because the metadata comment + // includes the entire comment text in its identity, and will + // dirty the parent Suffixes. + // + // Two temporary exceptions: TransIP and MetaCentrum both have + // two blocks each, with different contact emails. Until those + // are fixed, also include the maintainer email in the + // identity to avoid constant false positives. + ret := fmt.Sprintf("%s;Suffixes,%q", parentKey, v.Info.Name) + if strings.Contains(v.Info.Name, "MetaCentrum") || strings.Contains(v.Info.Name, "TransIP") { + ret += fmt.Sprintf(",%v", v.Info.Maintainers) + } + return ret + case *Suffix: + return fmt.Sprintf("%s;Suffix,%q", parentKey, v.Domain) + case *Wildcard: + return fmt.Sprintf("%s;Wildcard,%q,%#v", parentKey, v.Domain, v.Exceptions) + case *Comment: + return fmt.Sprintf("%s;Comment,%#v", parentKey, v.Text) + default: + panic("unknown ast node") + } +} diff --git a/tools/internal/parser/diff_test.go b/tools/internal/parser/diff_test.go new file mode 100644 index 000000000..7596318c0 --- /dev/null +++ b/tools/internal/parser/diff_test.go @@ -0,0 +1,480 @@ +package parser + +import ( + "slices" + "testing" +) + +func TestDiff(t *testing.T) { + tests := []struct { + name string + before *List + now *List + expansive bool + changed *List + }{ + { + name: "new_suffix", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + suffix(2, "example.net"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(2, "example.net"), + ), + ), + ), + }, + + { + name: "new_wildcard", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + wildcard(2, 2, "example.net"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + wildcard(2, 2, "example.net"), + ), + ), + ), + }, + + { + name: "new_exception", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + wildcard(2, 2, "example.net", "exception1"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + wildcard(2, 2, "example.net", "exception1", "exception2"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + wildcard(2, 2, "example.net", "exception1", "exception2"), + ), + ), + ), + }, + + { + name: "new_suffix_block", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + ), + suffixes(1, 1, info("Zork Ltd", nil, nil, nil, true), + suffix(1, "zork.example.com"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Zork Ltd", nil, nil, nil, true), + suffix(1, "zork.example.com"), + ), + ), + ), + }, + + { + name: "new_section", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + ), + ), + ), + now: list( + section(2, 2, "ICANN DOMAINS", + suffixes(1, 1, info("aaa", nil, nil, nil, true), + suffix(1, "aaa"), + ), + ), + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + ), + ), + ), + changed: list( + section(2, 2, "ICANN DOMAINS", + suffixes(1, 1, info("aaa", nil, nil, nil, true), + suffix(1, "aaa"), + ), + ), + ), + }, + + { + name: "change_suffixes", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + wildcard(3, 3, "example.net", "exception"), + wildcard(4, 4, "example.edu", "faculty"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.gov"), // com -> gov + suffix(2, "example.org"), + wildcard(3, 3, "example.net", "exception2"), // exception -> exception2 + wildcard(4, 4, "w.example.edu", "faculty"), // example.edu -> w.example.edu + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.gov"), + wildcard(3, 3, "example.net", "exception2"), + wildcard(4, 4, "w.example.edu", "faculty"), + ), + ), + ), + }, + + { + name: "change_block_info", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, + info( + "Example LLC", + urls("https://example.com"), + nil, + nil, + true), + comment(1, "Example LLC: https://example.com"), + suffix(2, "example.com"), + suffix(3, "example.org"), + wildcard(4, 4, "example.net", "exception"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, + info( + "Example LLC", + urls("https://example.org"), // different URL + nil, + nil, + true), + comment(1, "Example LLC: https://example.org"), + suffix(2, "example.com"), + suffix(3, "example.org"), + wildcard(4, 4, "example.net", "exception"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, + info( + "Example LLC", + urls("https://example.org"), + nil, + nil, + true), + comment(1, "Example LLC: https://example.org"), + // Suffixes not changed + ), + ), + ), + }, + + { + name: "duplicates_in_new", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.com"), // dupe + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + suffix(4, "example.fr"), // dupe + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(3, "example.com"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(2, "example.fr"), + suffix(4, "example.fr"), + ), + ), + ), + }, + + { + name: "duplicates_in_old", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.com"), // dupe + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + now: list( + // no changes compared to before + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.com"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + // all dupes marked for recheck, despite no changes + suffix(1, "example.com"), + suffix(3, "example.com"), + ), + ), + ), + }, + + { + name: "deletions", + before: list( + section(1, 1, "SUFFIX DELETES", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.net"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + section(2, 2, "SUFFIX BLOCK DELETES", + suffixes(1, 1, info("aaa", nil, nil, nil, true), + suffix(1, "aaa"), + suffix(2, "bbb"), + ), + suffixes(1, 1, info("bbb", nil, nil, nil, true), + suffix(1, "ccc"), + ), + ), + section(3, 3, "SECTION TO DELETE", + suffixes(1, 1, info("delete me", nil, nil, nil, true), + suffix(1, "delete-me.zork"), + ), + ), + ), + now: list( + section(1, 1, "SUFFIX DELETES", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + // deleted example.org + suffix(3, "example.net"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + section(2, 2, "SUFFIX BLOCK DELETES", + // deleted aaa + suffixes(1, 1, info("bbb", nil, nil, nil, true), + suffix(1, "ccc"), + ), + ), + // deleted section to delete + ), + changed: list( + section(1, 1, "SUFFIX DELETES", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true)), + ), + section(2, 2, "SUFFIX BLOCK DELETES"), + ), + }, + + { + name: "expand", + before: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.net"), + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + now: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.biz"), // changed + ), + suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true), + suffix(1, "example.de"), + suffix(2, "example.fr"), + suffix(3, "example.nl"), + ), + ), + ), + expansive: true, + changed: list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, info("Example LLC", nil, nil, nil, true), + // all suffixes marked, not just the changed one + suffix(1, "example.com"), + suffix(2, "example.org"), + suffix(3, "example.biz"), + ), + ), + ), + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := tc.now + got.SetBaseVersion(tc.before, tc.expansive) + deleteUnchanged(got) + checkDiff(t, "MarkUnchanged output", got, tc.changed) + }) + } +} + +func deleteUnchanged(b Block) { + if !b.Changed() { + return + } + + switch v := b.(type) { + case *List: + v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool { + return !child.Changed() + }) + if len(v.Blocks) == 0 { + v.Blocks = nil + } + case *Section: + v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool { + return !child.Changed() + }) + if len(v.Blocks) == 0 { + v.Blocks = nil + } + case *Suffixes: + v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool { + return !child.Changed() + }) + if len(v.Blocks) == 0 { + v.Blocks = nil + } + case *Suffix, *Wildcard, *Comment: + default: + panic("unknown ast node") + } + + for _, child := range b.Children() { + deleteUnchanged(child) + } +} diff --git a/tools/internal/parser/file.go b/tools/internal/parser/file.go index e532863e3..4578a38a0 100644 --- a/tools/internal/parser/file.go +++ b/tools/internal/parser/file.go @@ -9,17 +9,6 @@ import ( "github.com/publicsuffix/list/tools/internal/domain" ) -// List is a parsed public suffix list. -type List struct { - SourceRange - - // Blocks are the top-level elements of the list, in the order - // they appear. - Blocks []Block -} - -func (l *List) Children() []Block { return l.Blocks } - // A Block is a parsed chunk of a PSL file. Each block is one of the // concrete types Comment, Section, Suffixes, Suffix, or Wildcard. type Block interface { @@ -27,6 +16,11 @@ type Block interface { SrcRange() SourceRange // Children returns the block's direct children, if any. Children() []Block + // Changed reports whether the tree rooted at block has changed + // since the base of comparison (see List.SetBaseVersion). + Changed() bool + + info() *blockInfo } // BlocksOfType recursively collects and returns all blocks of @@ -49,10 +43,45 @@ func blocksOfTypeRec[T Block](tree Block, out *[]T) { } } +// blockInfo is common information shared by all Block types. +type blockInfo struct { + SourceRange + + // isUnchanged records that a Block (including any children) is + // semantically unchanged from a past base point. The default base + // of comparison is a null List, meaning that Unchanged=false for + // all blocks. A different base of comparison can be set with + // List.Diff. + isUnchanged bool +} + +func (b blockInfo) SrcRange() SourceRange { + return b.SourceRange +} + +func (b blockInfo) Changed() bool { + return !b.isUnchanged +} + +func (b *blockInfo) info() *blockInfo { + return b +} + +// List is a parsed public suffix list. +type List struct { + blockInfo + + // Blocks are the top-level elements of the list, in the order + // they appear. + Blocks []Block +} + +func (l *List) Children() []Block { return l.Blocks } + // Comment is a comment block, consisting of one or more contiguous // lines of commented text. type Comment struct { - SourceRange + blockInfo // Text is the unprocessed content of the comment lines, with the // leading comment syntax removed. Text []string @@ -63,7 +92,7 @@ func (c *Comment) Children() []Block { return nil } // Section is a named part of a PSL file, containing suffixes which // behave similarly. type Section struct { - SourceRange + blockInfo // Name is he section name. In a normal well-formed PSL file, the // names are "ICANN DOMAINS" and "PRIVATE DOMAINS". @@ -82,7 +111,7 @@ func (s *Section) Children() []Block { return s.Blocks } // domain suffixes. The suffix list may contain additional // unstructured inline comments. type Suffixes struct { - SourceRange + blockInfo // Info is information about the authoritative maintainers for // this set of suffixes. @@ -186,7 +215,7 @@ func (m MaintainerInfo) HasInfo() bool { // Suffix is one public suffix, represented in the standard domain // name format. type Suffix struct { - SourceRange + blockInfo // Domain is the public suffix's domain name. Domain domain.Name @@ -197,7 +226,7 @@ func (s *Suffix) Children() []Block { return nil } // Wildcard is a wildcard public suffix, along with any exceptions to // that wildcard. type Wildcard struct { - SourceRange + blockInfo // Domain is the base of the wildcard public suffix, without the // leading "*" label. diff --git a/tools/internal/parser/parser.go b/tools/internal/parser/parser.go index 56cfb56c5..d1cfae419 100644 --- a/tools/internal/parser/parser.go +++ b/tools/internal/parser/parser.go @@ -234,8 +234,10 @@ func (p *parser) parseSection() *Section { // Initialize with the start-of-section marker's data. start := p.next().(tokenSectionStart) ret := &Section{ - SourceRange: start.SourceRange, - Name: start.Name, + blockInfo: blockInfo{ + SourceRange: start.SourceRange, + }, + Name: start.Name, } emit := blockEmitter(&ret.Blocks, &ret.SourceRange) @@ -349,8 +351,10 @@ func (p *parser) parseSuffix() Block { } return &Suffix{ - SourceRange: tok.SourceRange, - Domain: domain, + blockInfo: blockInfo{ + SourceRange: tok.SourceRange, + }, + Domain: domain, } } @@ -366,8 +370,10 @@ func (p *parser) parseWildcard() Block { } return &Wildcard{ - SourceRange: tok.SourceRange, - Domain: domain, + blockInfo: blockInfo{ + SourceRange: tok.SourceRange, + }, + Domain: domain, } } @@ -402,8 +408,10 @@ func (p *parser) parseException(previous []Block) { func (p *parser) parseComment() *Comment { tok := p.next().(tokenComment) ret := &Comment{ - SourceRange: tok.SourceRange, - Text: []string{tok.Text}, + blockInfo: blockInfo{ + SourceRange: tok.SourceRange, + }, + Text: []string{tok.Text}, } for { if tok, ok := p.peek().(tokenComment); ok { diff --git a/tools/internal/parser/parser_test.go b/tools/internal/parser/parser_test.go index 1d72925ac..7264d872b 100644 --- a/tools/internal/parser/parser_test.go +++ b/tools/internal/parser/parser_test.go @@ -357,24 +357,30 @@ func list(blocks ...Block) *List { func comment(start int, lines ...string) *Comment { return &Comment{ - SourceRange: mkSrc(start, start+len(lines)), - Text: lines, + blockInfo: blockInfo{ + SourceRange: mkSrc(start, start+len(lines)), + }, + Text: lines, } } func section(start, end int, name string, blocks ...Block) *Section { return &Section{ - SourceRange: mkSrc(start, end), - Name: name, - Blocks: blocks, + blockInfo: blockInfo{ + SourceRange: mkSrc(start, end), + }, + Name: name, + Blocks: blocks, } } func suffixes(start, end int, info MaintainerInfo, blocks ...Block) *Suffixes { return &Suffixes{ - SourceRange: mkSrc(start, end), - Info: info, - Blocks: blocks, + blockInfo: blockInfo{ + SourceRange: mkSrc(start, end), + }, + Info: info, + Blocks: blocks, } } @@ -396,8 +402,10 @@ func suffix(line int, domainStr string) *Suffix { panic(err) } return &Suffix{ - SourceRange: mkSrc(line, line+1), - Domain: domain, + blockInfo: blockInfo{ + SourceRange: mkSrc(line, line+1), + }, + Domain: domain, } } @@ -408,8 +416,10 @@ func wildcard(start, end int, base string, exceptions ...string) *Wildcard { } ret := &Wildcard{ - SourceRange: mkSrc(start, end), - Domain: dom, + blockInfo: blockInfo{ + SourceRange: mkSrc(start, end), + }, + Domain: dom, } for _, s := range exceptions { exc, err := domain.ParseLabel(s) diff --git a/tools/internal/parser/text.go b/tools/internal/parser/text.go index 3bf76f251..bc6dc3c45 100644 --- a/tools/internal/parser/text.go +++ b/tools/internal/parser/text.go @@ -51,12 +51,6 @@ func (s SourceRange) merge(other SourceRange) SourceRange { } } -// SrcRange returns the SourceRange. This looks a little strange, but -// it's to satisfy the Block interface. This allows other code to -// retrieve the SourceRange of any Block without having to typeswitch -// all the possible sub-types. -func (s SourceRange) SrcRange() SourceRange { return s } - const ( bomUTF8 = "\xEF\xBB\xBF" bomUTF16BE = "\xFE\xFF" diff --git a/tools/internal/parser/text_test.go b/tools/internal/parser/text_test.go index cc5eb570b..e65d81aaa 100644 --- a/tools/internal/parser/text_test.go +++ b/tools/internal/parser/text_test.go @@ -3,6 +3,7 @@ package parser import ( "bytes" "fmt" + "reflect" "testing" "github.com/google/go-cmp/cmp" @@ -224,7 +225,27 @@ func utf8WithBOM(s string) []byte { func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) { t.Helper() - if diff := cmp.Diff(got, want); diff != "" { + + // cmp.Diff refuses to examine unexported fields by default. Tell + // it that it's okay to look at unexported fields of blocks and + // blockInfo, since we own those fields and want to include their + // values in comparisons. + exportInfo := cmp.Exporter(func(t reflect.Type) bool { + if t.Kind() != reflect.Pointer { + t = reflect.PointerTo(t) + } + + if t.Elem() == reflect.TypeFor[blockInfo]() { + return true + } + + if t.Implements(reflect.TypeFor[Block]()) { + return true + } + + return false + }) + if diff := cmp.Diff(got, want, exportInfo); diff != "" { t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff) } }