From a181dd9c6443297acf619a6a8c2aba3e8cc30f7d Mon Sep 17 00:00:00 2001
From: Dave Anderson <dave@natulte.net>
Date: Thu, 25 Jul 2024 01:33:11 -0700
Subject: [PATCH] tools/internal/parser: add diff support (#2071)

* tool/internal/parser: add a change marker to blocks

As part of that, refactor slightly to hide fields that should not be
editable outside of the package, and provide read-only accessors instead.

* tool/internal/parser: add diff support

List.SetBaseVersion takes an older base List, and makes Block.Changed()
reflect whether or not each block has changed compared to that base.

* tools: remove toolchain directive from go.mod

Some stdlib features require Go 1.22, we don't need to mandate a specific
minimum version of 1.22.
---
 tools/go.mod                         |   2 -
 tools/internal/parser/diff.go        | 326 ++++++++++++++++++
 tools/internal/parser/diff_test.go   | 480 +++++++++++++++++++++++++++
 tools/internal/parser/file.go        |  61 +++-
 tools/internal/parser/parser.go      |  24 +-
 tools/internal/parser/parser_test.go |  34 +-
 tools/internal/parser/text.go        |   6 -
 tools/internal/parser/text_test.go   |  23 +-
 8 files changed, 911 insertions(+), 45 deletions(-)
 create mode 100644 tools/internal/parser/diff.go
 create mode 100644 tools/internal/parser/diff_test.go

diff --git a/tools/go.mod b/tools/go.mod
index 834cc24c7..75a047efc 100644
--- a/tools/go.mod
+++ b/tools/go.mod
@@ -2,8 +2,6 @@ module github.com/publicsuffix/list/tools
 
 go 1.22
 
-toolchain go1.22.5
-
 require (
 	github.com/google/go-cmp v0.6.0
 	golang.org/x/net v0.26.0
diff --git a/tools/internal/parser/diff.go b/tools/internal/parser/diff.go
new file mode 100644
index 000000000..69f5df335
--- /dev/null
+++ b/tools/internal/parser/diff.go
@@ -0,0 +1,326 @@
+package parser
+
+import (
+	"fmt"
+	"strings"
+)
+
+// SetBaseVersion sets the list's base of comparison to old, and
+// updates the changed/unchanged annotations on all Blocks to match.
+//
+// If wholeSuffixBlocks is true, any changed Suffix or Wildcard within
+// a Suffixes block marks all suffixes and wildcards in that block as
+// changed.
+//
+// Precise marking (wholeSuffixBlocks=false) is intended for
+// maintainer and machine edits, where change-aware validators should
+// exaine only the specific changed items.
+//
+// Expansive marking (wholeSuffixBlocks=true) is intended for external
+// PRs from suffix block owners, to opportunistically point out more
+// issues that they have the knowledge and authority to fix.
+func (l *List) SetBaseVersion(old *List, wholeSuffixBlocks bool) {
+	diff := differ{
+		oldCnt:    map[string]int{},
+		inCurrent: map[string][][]Block{},
+		keys:      map[Block]string{},
+
+		wholeSuffixBlocks: wholeSuffixBlocks,
+	}
+
+	// Tree diff is an open area of research, and it's possible to use
+	// extremely fancy (and slow) algorithms. Thankfully, the PSL has
+	// some additional domain-specific properties that let us take
+	// shortcuts and implement something O(n).
+	//
+	// First, academic tree_diff(OLD,NEW) produces an "edit script" as
+	// the output, which describes how to add, delete, move and mutate
+	// tree nodes to transform the OLD tree into the NEW tree. For the
+	// PSL, we don't care about the exact structural changes, we just
+	// need to know if we can skip validation checks. So we have to
+	// answer a simple question: is a given block in NEW also present
+	// in OLD?
+	//
+	// Second, all nodes in a well-formed list have a stable unique
+	// identity. We can use this to answer the previous question in
+	// constant time, instead of having to do complex tree analysis to
+	// locate equivalent nodes.
+	//
+	// Node identities may be duplicated in an ill-formed List, for
+	// example a suffix block that lists the same suffix twice. We
+	// deal with this using brute force, and mark all duplicate
+	// identities as changed. This means that a malformed PSL file
+	// might report more changes than the strict minimum, but in
+	// practice it's not much more, and in exchange we don't have to
+	// do anything complex to decide what to revalidate.
+	//
+	// Third, how do we propagate child changes to parents? This is
+	// where academic algorithms quickly go into O(n^3)
+	// territory. Once again, we avoid this with brute force: a
+	// changed tree node marks all its parents as changed as
+	// well. That means that if you fix a typo in one Suffix, we say
+	// that the Suffix changed, but also its parent Suffixes, Section,
+	// and List nodes.
+	//
+	// We could theoretically dirty fewer nodes in some cases, but
+	// that introduces a risk of false negatives (we forget to re-run
+	// a necessary validation), and it makes the diff harder to reason
+	// about when writing validators. In practice, this slightly
+	// pessimistic dirtying is cheap for the currently-planned
+	// validators, so we stick with the behavior that is easy to
+	// reason about and simple to implement.
+	//
+	// Finally, we need to do something about deleted nodes. We can
+	// handle that with a single additional pass through the OLD list,
+	// thanks to the node identity property. Again for simplicity, we
+	// treat deletions similar to edits: all the parents of a deleted
+	// node are marked dirty. Again we could be more precise here, but
+	// in practice it's currently cheap to be pessimistic, and makes
+	// the code and mental model simpler.
+	//
+	// There are various optimizations possible for this code. The
+	// biggest would be doing something more efficient to track block
+	// identities, which are currently expressed as big strings
+	// because that makes them convenient to compare and use as map
+	// keys. However, this algorithm as currently implemented takes
+	// <100ms to diff a full PSL file, so for now we err on the side
+	// of simplicity.
+
+	// Compile the identities of all the blocks in old.
+	diff.scanOld(old, "")
+	// Mark unchanged blocks. Thanks to the previous step, each tree
+	// node can be checked in O(1) time.
+	diff.scanCurrent(l, "", nil)
+	// Dirty the parents of deleted blocks.
+	diff.markDeletions(old, "")
+}
+
+type differ struct {
+	// wholeSuffixBlocks is whether Suffix/Wildcard changes propagate
+	// to all children of the parent Suffixes block.
+	wholeSuffixBlocks bool
+
+	// oldCnt counts the number of blocks in the old list with a given
+	// identity key.
+	oldCnt map[string]int
+
+	// inCurrent maps block identity keys to the tree paths in of the
+	// current list with that identity. Given a block with identity K,
+	// inCurrent[K] is a list of paths. In each path, path[0] is a
+	// block with identity K, and path[1..n] are its parents going
+	// back to the root of the tree.
+	//
+	// In a well-formed List, each cache entry has a single path, but
+	// we track duplicates in order to function correctly on malformed
+	// lists as well.
+	inCurrent map[string][][]Block
+
+	// keys caches identity keys by block pointer. There are several
+	// passes of traversal through trees, and when old and current are
+	// nearly identical (the common case) this can save significant
+	// CPU time.
+	keys map[Block]string
+}
+
+// scanOld records b and its children in d.oldCnt.
+func (d *differ) scanOld(b Block, parentKey string) {
+	k := d.getKey(b, parentKey)
+	d.oldCnt[k]++
+	for _, child := range b.Children() {
+		d.scanOld(child, k)
+	}
+}
+
+// scanCurrent adds b and all its children to b.inCurrent, and updates
+// their isUnchanged annotation based on the information in d.oldCnt.
+func (d *differ) scanCurrent(curBlock Block, parentKey string, parents []Block) {
+	k := d.getKey(curBlock, parentKey)
+
+	path := make([]Block, 0, len(parents)+1)
+	path = append(path, curBlock)
+	path = append(path, parents...)
+
+	// Assume we're unchanged to start with. The job of the remaining
+	// diff code is to falsify this claim and mark the node as changed
+	// if needed.
+	//
+	// Setting this early and unconditionally lets us optimize the
+	// logic in markChanged, by ensuring that each node transitions
+	// false->true only once, before any possible true->false
+	// transitions that affect it.
+	curBlock.info().isUnchanged = true
+
+	// Record the path to the current block, and if it's a
+	// doppelganger of some other Block, mark changed. Tracking diffs
+	// of duplicates requires solving some hard theoretical problems
+	// of tree diff, so we don't bother.
+	//
+	// Duplicate identities only happens on a malformed PSL, and we
+	// can save a lot of pain by just over-rechecking such PSLs
+	// slightly.
+	d.inCurrent[k] = append(d.inCurrent[k], path)
+	if l := len(d.inCurrent[k]); l == 2 {
+		// This is the first duplicate, previous path didn't know it
+		// wasn't unique. Mark both the current and earlier path as
+		// changed.
+		d.markChanged(d.inCurrent[k]...)
+	} else if l > 2 {
+		// Previous paths already marked, only curBlock's one needs
+		// updating.
+		d.markChanged(path)
+	}
+
+	// This covers both the case where a block is new (oldCnt of 0),
+	// and the case where this block isn't a dupe in current, but was
+	// a dupe in old. In that case, like above we avoid algorithmic
+	// headaches by just dirtying the block instead of trying to
+	// resolve which version of the old dupes we're looking at.
+	if d.oldCnt[k] != 1 {
+		d.markChanged(path)
+	}
+
+	// Scan through child subtrees. These subtrees may call
+	// markChanged and set Unchanged=false on us.
+	for _, child := range curBlock.Children() {
+		d.scanCurrent(child, k, path)
+	}
+
+	// If the caller requested, and we're changed anyway, see if we
+	// should propagate the change back downwards again.
+	if !curBlock.info().isUnchanged {
+		d.maybeMarkWholeSuffixBlock(path)
+	}
+}
+
+// markDeletions marks parents of deleted nodes as changed in current.
+//
+// For example, if the diff contains a suffix deletion, this will mark
+// the enclosing Suffixes block as changed.
+func (d *differ) markDeletions(oldBlock Block, parentKey string) bool {
+	k := d.getKey(oldBlock, parentKey)
+
+	pathsInCurrent, ok := d.inCurrent[k]
+	if !ok {
+		// oldBlock was deleted, report to caller.
+		return true
+	}
+
+	childDeleted := false
+	for _, child := range oldBlock.Children() {
+		if d.markDeletions(child, k) {
+			// Note, can't short-circuit here because there may be
+			// other paths under this block that also need to be
+			// updated. We're not only trying to update oldBlock, but
+			// also all of its children.
+			childDeleted = true
+		}
+	}
+
+	// Children were deleted, mark ourselves changed. This implicitly
+	// also marks the parent as changed, so no need to tell it that a
+	// change happened, it'll just do extra no-op work.
+	if childDeleted {
+		d.markChanged(pathsInCurrent...)
+	}
+
+	return false
+}
+
+// maybeMarkWholeSuffixBlock calls markSuffixAndWildcardChanged on all
+// Suffixes in path, if the caller of MarkUnchanged requested
+// expansive marking.
+func (d *differ) maybeMarkWholeSuffixBlock(path []Block) {
+	if !d.wholeSuffixBlocks {
+		return
+	}
+
+	switch path[0].(type) {
+	case *Suffixes, *Suffix, *Wildcard:
+		for i, parent := range path {
+			if _, ok := parent.(*Suffixes); ok {
+				d.markSuffixAndWildcardChanged(parent, path[i+1:])
+			}
+		}
+	}
+}
+
+// markSuffixAndWildcardChanged marks as changed all Suffix and
+// Wildcard blocks in the tree rooted at curBlock.
+func (d *differ) markSuffixAndWildcardChanged(curBlock Block, parents []Block) {
+	path := append([]Block{curBlock}, parents...)
+
+	switch curBlock.(type) {
+	case *Suffix, *Wildcard:
+		d.markChanged(path)
+	default:
+		for _, child := range curBlock.Children() {
+			d.markSuffixAndWildcardChanged(child, path)
+		}
+	}
+}
+
+// markChanged marks as changed all the blocks in paths.
+func (d *differ) markChanged(paths ...[]Block) {
+pathLoop:
+	for _, path := range paths {
+		for _, b := range path {
+			if b.info().isUnchanged == false {
+				// We never mark a node as changed in isolation, we
+				// always propagate the change to all its
+				// parents. Therefore, we can stop the upwards
+				// traversal in this path as soon as we find any node
+				// that's already in the correct state.
+				continue pathLoop
+			}
+			b.info().isUnchanged = false
+		}
+	}
+}
+
+// getKey returns the identity key for blk, which must be a direct
+// child of parentKey. getKey keeps a cache of all keys built in the
+// lifetime of this differ, to make future calls more efficient.
+func (d *differ) getKey(blk Block, parentKey string) string {
+	ret, ok := d.keys[blk]
+	if !ok {
+		ret = d.makeKey(blk, parentKey)
+		d.keys[blk] = ret
+	}
+	return ret
+}
+
+// makeKey builds the identity key of blk, which must be a child node
+// of parentKey.
+func (d *differ) makeKey(b Block, parentKey string) string {
+	switch v := b.(type) {
+	case *List:
+		return fmt.Sprintf("%s;List", parentKey)
+	case *Section:
+		return fmt.Sprintf("%s;Section,%q", parentKey, v.Name)
+	case *Suffixes:
+		// Note parsed suffix metadata isn't included in the identity,
+		// to avoid marking all suffixes in a block changed when
+		// someone adjusts their URL or email. Such edits will still
+		// indirectly dirty the block, because the metadata comment
+		// includes the entire comment text in its identity, and will
+		// dirty the parent Suffixes.
+		//
+		// Two temporary exceptions: TransIP and MetaCentrum both have
+		// two blocks each, with different contact emails. Until those
+		// are fixed, also include the maintainer email in the
+		// identity to avoid constant false positives.
+		ret := fmt.Sprintf("%s;Suffixes,%q", parentKey, v.Info.Name)
+		if strings.Contains(v.Info.Name, "MetaCentrum") || strings.Contains(v.Info.Name, "TransIP") {
+			ret += fmt.Sprintf(",%v", v.Info.Maintainers)
+		}
+		return ret
+	case *Suffix:
+		return fmt.Sprintf("%s;Suffix,%q", parentKey, v.Domain)
+	case *Wildcard:
+		return fmt.Sprintf("%s;Wildcard,%q,%#v", parentKey, v.Domain, v.Exceptions)
+	case *Comment:
+		return fmt.Sprintf("%s;Comment,%#v", parentKey, v.Text)
+	default:
+		panic("unknown ast node")
+	}
+}
diff --git a/tools/internal/parser/diff_test.go b/tools/internal/parser/diff_test.go
new file mode 100644
index 000000000..7596318c0
--- /dev/null
+++ b/tools/internal/parser/diff_test.go
@@ -0,0 +1,480 @@
+package parser
+
+import (
+	"slices"
+	"testing"
+)
+
+func TestDiff(t *testing.T) {
+	tests := []struct {
+		name      string
+		before    *List
+		now       *List
+		expansive bool
+		changed   *List
+	}{
+		{
+			name: "new_suffix",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+						suffix(2, "example.net"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(2, "example.net"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "new_wildcard",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+						wildcard(2, 2, "example.net"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						wildcard(2, 2, "example.net"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "new_exception",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+						wildcard(2, 2, "example.net", "exception1"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						suffix(1, "example.com"),
+						wildcard(2, 2, "example.net", "exception1", "exception2"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, noInfo,
+						wildcard(2, 2, "example.net", "exception1", "exception2"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "new_suffix_block",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+					),
+					suffixes(1, 1, info("Zork Ltd", nil, nil, nil, true),
+						suffix(1, "zork.example.com"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Zork Ltd", nil, nil, nil, true),
+						suffix(1, "zork.example.com"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "new_section",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+					),
+				),
+			),
+			now: list(
+				section(2, 2, "ICANN DOMAINS",
+					suffixes(1, 1, info("aaa", nil, nil, nil, true),
+						suffix(1, "aaa"),
+					),
+				),
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+					),
+				),
+			),
+			changed: list(
+				section(2, 2, "ICANN DOMAINS",
+					suffixes(1, 1, info("aaa", nil, nil, nil, true),
+						suffix(1, "aaa"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "change_suffixes",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						wildcard(3, 3, "example.net", "exception"),
+						wildcard(4, 4, "example.edu", "faculty"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.gov"), // com -> gov
+						suffix(2, "example.org"),
+						wildcard(3, 3, "example.net", "exception2"), // exception -> exception2
+						wildcard(4, 4, "w.example.edu", "faculty"),  // example.edu -> w.example.edu
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.gov"),
+						wildcard(3, 3, "example.net", "exception2"),
+						wildcard(4, 4, "w.example.edu", "faculty"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "change_block_info",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1,
+						info(
+							"Example LLC",
+							urls("https://example.com"),
+							nil,
+							nil,
+							true),
+						comment(1, "Example LLC: https://example.com"),
+						suffix(2, "example.com"),
+						suffix(3, "example.org"),
+						wildcard(4, 4, "example.net", "exception"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1,
+						info(
+							"Example LLC",
+							urls("https://example.org"), // different URL
+							nil,
+							nil,
+							true),
+						comment(1, "Example LLC: https://example.org"),
+						suffix(2, "example.com"),
+						suffix(3, "example.org"),
+						wildcard(4, 4, "example.net", "exception"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1,
+						info(
+							"Example LLC",
+							urls("https://example.org"),
+							nil,
+							nil,
+							true),
+						comment(1, "Example LLC: https://example.org"),
+						// Suffixes not changed
+					),
+				),
+			),
+		},
+
+		{
+			name: "duplicates_in_new",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.com"), // dupe
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+						suffix(4, "example.fr"), // dupe
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(3, "example.com"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(2, "example.fr"),
+						suffix(4, "example.fr"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "duplicates_in_old",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.com"), // dupe
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			now: list(
+				// no changes compared to before
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.com"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						// all dupes marked for recheck, despite no changes
+						suffix(1, "example.com"),
+						suffix(3, "example.com"),
+					),
+				),
+			),
+		},
+
+		{
+			name: "deletions",
+			before: list(
+				section(1, 1, "SUFFIX DELETES",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.net"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+				section(2, 2, "SUFFIX BLOCK DELETES",
+					suffixes(1, 1, info("aaa", nil, nil, nil, true),
+						suffix(1, "aaa"),
+						suffix(2, "bbb"),
+					),
+					suffixes(1, 1, info("bbb", nil, nil, nil, true),
+						suffix(1, "ccc"),
+					),
+				),
+				section(3, 3, "SECTION TO DELETE",
+					suffixes(1, 1, info("delete me", nil, nil, nil, true),
+						suffix(1, "delete-me.zork"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "SUFFIX DELETES",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						// deleted example.org
+						suffix(3, "example.net"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+				section(2, 2, "SUFFIX BLOCK DELETES",
+					// deleted aaa
+					suffixes(1, 1, info("bbb", nil, nil, nil, true),
+						suffix(1, "ccc"),
+					),
+				),
+				// deleted section to delete
+			),
+			changed: list(
+				section(1, 1, "SUFFIX DELETES",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true)),
+				),
+				section(2, 2, "SUFFIX BLOCK DELETES"),
+			),
+		},
+
+		{
+			name: "expand",
+			before: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.net"),
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			now: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.biz"), // changed
+					),
+					suffixes(2, 2, info("Unchanged GmbH", nil, nil, nil, true),
+						suffix(1, "example.de"),
+						suffix(2, "example.fr"),
+						suffix(3, "example.nl"),
+					),
+				),
+			),
+			expansive: true,
+			changed: list(
+				section(1, 1, "PRIVATE DOMAINS",
+					suffixes(1, 1, info("Example LLC", nil, nil, nil, true),
+						// all suffixes marked, not just the changed one
+						suffix(1, "example.com"),
+						suffix(2, "example.org"),
+						suffix(3, "example.biz"),
+					),
+				),
+			),
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.now
+			got.SetBaseVersion(tc.before, tc.expansive)
+			deleteUnchanged(got)
+			checkDiff(t, "MarkUnchanged output", got, tc.changed)
+		})
+	}
+}
+
+func deleteUnchanged(b Block) {
+	if !b.Changed() {
+		return
+	}
+
+	switch v := b.(type) {
+	case *List:
+		v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool {
+			return !child.Changed()
+		})
+		if len(v.Blocks) == 0 {
+			v.Blocks = nil
+		}
+	case *Section:
+		v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool {
+			return !child.Changed()
+		})
+		if len(v.Blocks) == 0 {
+			v.Blocks = nil
+		}
+	case *Suffixes:
+		v.Blocks = slices.DeleteFunc(v.Blocks, func(child Block) bool {
+			return !child.Changed()
+		})
+		if len(v.Blocks) == 0 {
+			v.Blocks = nil
+		}
+	case *Suffix, *Wildcard, *Comment:
+	default:
+		panic("unknown ast node")
+	}
+
+	for _, child := range b.Children() {
+		deleteUnchanged(child)
+	}
+}
diff --git a/tools/internal/parser/file.go b/tools/internal/parser/file.go
index e532863e3..4578a38a0 100644
--- a/tools/internal/parser/file.go
+++ b/tools/internal/parser/file.go
@@ -9,17 +9,6 @@ import (
 	"github.com/publicsuffix/list/tools/internal/domain"
 )
 
-// List is a parsed public suffix list.
-type List struct {
-	SourceRange
-
-	// Blocks are the top-level elements of the list, in the order
-	// they appear.
-	Blocks []Block
-}
-
-func (l *List) Children() []Block { return l.Blocks }
-
 // A Block is a parsed chunk of a PSL file. Each block is one of the
 // concrete types Comment, Section, Suffixes, Suffix, or Wildcard.
 type Block interface {
@@ -27,6 +16,11 @@ type Block interface {
 	SrcRange() SourceRange
 	// Children returns the block's direct children, if any.
 	Children() []Block
+	// Changed reports whether the tree rooted at block has changed
+	// since the base of comparison (see List.SetBaseVersion).
+	Changed() bool
+
+	info() *blockInfo
 }
 
 // BlocksOfType recursively collects and returns all blocks of
@@ -49,10 +43,45 @@ func blocksOfTypeRec[T Block](tree Block, out *[]T) {
 	}
 }
 
+// blockInfo is common information shared by all Block types.
+type blockInfo struct {
+	SourceRange
+
+	// isUnchanged records that a Block (including any children) is
+	// semantically unchanged from a past base point. The default base
+	// of comparison is a null List, meaning that Unchanged=false for
+	// all blocks. A different base of comparison can be set with
+	// List.Diff.
+	isUnchanged bool
+}
+
+func (b blockInfo) SrcRange() SourceRange {
+	return b.SourceRange
+}
+
+func (b blockInfo) Changed() bool {
+	return !b.isUnchanged
+}
+
+func (b *blockInfo) info() *blockInfo {
+	return b
+}
+
+// List is a parsed public suffix list.
+type List struct {
+	blockInfo
+
+	// Blocks are the top-level elements of the list, in the order
+	// they appear.
+	Blocks []Block
+}
+
+func (l *List) Children() []Block { return l.Blocks }
+
 // Comment is a comment block, consisting of one or more contiguous
 // lines of commented text.
 type Comment struct {
-	SourceRange
+	blockInfo
 	// Text is the unprocessed content of the comment lines, with the
 	// leading comment syntax removed.
 	Text []string
@@ -63,7 +92,7 @@ func (c *Comment) Children() []Block { return nil }
 // Section is a named part of a PSL file, containing suffixes which
 // behave similarly.
 type Section struct {
-	SourceRange
+	blockInfo
 
 	// Name is he section name. In a normal well-formed PSL file, the
 	// names are "ICANN DOMAINS" and "PRIVATE DOMAINS".
@@ -82,7 +111,7 @@ func (s *Section) Children() []Block { return s.Blocks }
 // domain suffixes. The suffix list may contain additional
 // unstructured inline comments.
 type Suffixes struct {
-	SourceRange
+	blockInfo
 
 	// Info is information about the authoritative maintainers for
 	// this set of suffixes.
@@ -186,7 +215,7 @@ func (m MaintainerInfo) HasInfo() bool {
 // Suffix is one public suffix, represented in the standard domain
 // name format.
 type Suffix struct {
-	SourceRange
+	blockInfo
 
 	// Domain is the public suffix's domain name.
 	Domain domain.Name
@@ -197,7 +226,7 @@ func (s *Suffix) Children() []Block { return nil }
 // Wildcard is a wildcard public suffix, along with any exceptions to
 // that wildcard.
 type Wildcard struct {
-	SourceRange
+	blockInfo
 
 	// Domain is the base of the wildcard public suffix, without the
 	// leading "*" label.
diff --git a/tools/internal/parser/parser.go b/tools/internal/parser/parser.go
index 56cfb56c5..d1cfae419 100644
--- a/tools/internal/parser/parser.go
+++ b/tools/internal/parser/parser.go
@@ -234,8 +234,10 @@ func (p *parser) parseSection() *Section {
 	// Initialize with the start-of-section marker's data.
 	start := p.next().(tokenSectionStart)
 	ret := &Section{
-		SourceRange: start.SourceRange,
-		Name:        start.Name,
+		blockInfo: blockInfo{
+			SourceRange: start.SourceRange,
+		},
+		Name: start.Name,
 	}
 	emit := blockEmitter(&ret.Blocks, &ret.SourceRange)
 
@@ -349,8 +351,10 @@ func (p *parser) parseSuffix() Block {
 	}
 
 	return &Suffix{
-		SourceRange: tok.SourceRange,
-		Domain:      domain,
+		blockInfo: blockInfo{
+			SourceRange: tok.SourceRange,
+		},
+		Domain: domain,
 	}
 }
 
@@ -366,8 +370,10 @@ func (p *parser) parseWildcard() Block {
 	}
 
 	return &Wildcard{
-		SourceRange: tok.SourceRange,
-		Domain:      domain,
+		blockInfo: blockInfo{
+			SourceRange: tok.SourceRange,
+		},
+		Domain: domain,
 	}
 }
 
@@ -402,8 +408,10 @@ func (p *parser) parseException(previous []Block) {
 func (p *parser) parseComment() *Comment {
 	tok := p.next().(tokenComment)
 	ret := &Comment{
-		SourceRange: tok.SourceRange,
-		Text:        []string{tok.Text},
+		blockInfo: blockInfo{
+			SourceRange: tok.SourceRange,
+		},
+		Text: []string{tok.Text},
 	}
 	for {
 		if tok, ok := p.peek().(tokenComment); ok {
diff --git a/tools/internal/parser/parser_test.go b/tools/internal/parser/parser_test.go
index 1d72925ac..7264d872b 100644
--- a/tools/internal/parser/parser_test.go
+++ b/tools/internal/parser/parser_test.go
@@ -357,24 +357,30 @@ func list(blocks ...Block) *List {
 
 func comment(start int, lines ...string) *Comment {
 	return &Comment{
-		SourceRange: mkSrc(start, start+len(lines)),
-		Text:        lines,
+		blockInfo: blockInfo{
+			SourceRange: mkSrc(start, start+len(lines)),
+		},
+		Text: lines,
 	}
 }
 
 func section(start, end int, name string, blocks ...Block) *Section {
 	return &Section{
-		SourceRange: mkSrc(start, end),
-		Name:        name,
-		Blocks:      blocks,
+		blockInfo: blockInfo{
+			SourceRange: mkSrc(start, end),
+		},
+		Name:   name,
+		Blocks: blocks,
 	}
 }
 
 func suffixes(start, end int, info MaintainerInfo, blocks ...Block) *Suffixes {
 	return &Suffixes{
-		SourceRange: mkSrc(start, end),
-		Info:        info,
-		Blocks:      blocks,
+		blockInfo: blockInfo{
+			SourceRange: mkSrc(start, end),
+		},
+		Info:   info,
+		Blocks: blocks,
 	}
 }
 
@@ -396,8 +402,10 @@ func suffix(line int, domainStr string) *Suffix {
 		panic(err)
 	}
 	return &Suffix{
-		SourceRange: mkSrc(line, line+1),
-		Domain:      domain,
+		blockInfo: blockInfo{
+			SourceRange: mkSrc(line, line+1),
+		},
+		Domain: domain,
 	}
 }
 
@@ -408,8 +416,10 @@ func wildcard(start, end int, base string, exceptions ...string) *Wildcard {
 	}
 
 	ret := &Wildcard{
-		SourceRange: mkSrc(start, end),
-		Domain:      dom,
+		blockInfo: blockInfo{
+			SourceRange: mkSrc(start, end),
+		},
+		Domain: dom,
 	}
 	for _, s := range exceptions {
 		exc, err := domain.ParseLabel(s)
diff --git a/tools/internal/parser/text.go b/tools/internal/parser/text.go
index 3bf76f251..bc6dc3c45 100644
--- a/tools/internal/parser/text.go
+++ b/tools/internal/parser/text.go
@@ -51,12 +51,6 @@ func (s SourceRange) merge(other SourceRange) SourceRange {
 	}
 }
 
-// SrcRange returns the SourceRange. This looks a little strange, but
-// it's to satisfy the Block interface. This allows other code to
-// retrieve the SourceRange of any Block without having to typeswitch
-// all the possible sub-types.
-func (s SourceRange) SrcRange() SourceRange { return s }
-
 const (
 	bomUTF8    = "\xEF\xBB\xBF"
 	bomUTF16BE = "\xFE\xFF"
diff --git a/tools/internal/parser/text_test.go b/tools/internal/parser/text_test.go
index cc5eb570b..e65d81aaa 100644
--- a/tools/internal/parser/text_test.go
+++ b/tools/internal/parser/text_test.go
@@ -3,6 +3,7 @@ package parser
 import (
 	"bytes"
 	"fmt"
+	"reflect"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -224,7 +225,27 @@ func utf8WithBOM(s string) []byte {
 
 func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) {
 	t.Helper()
-	if diff := cmp.Diff(got, want); diff != "" {
+
+	// cmp.Diff refuses to examine unexported fields by default. Tell
+	// it that it's okay to look at unexported fields of blocks and
+	// blockInfo, since we own those fields and want to include their
+	// values in comparisons.
+	exportInfo := cmp.Exporter(func(t reflect.Type) bool {
+		if t.Kind() != reflect.Pointer {
+			t = reflect.PointerTo(t)
+		}
+
+		if t.Elem() == reflect.TypeFor[blockInfo]() {
+			return true
+		}
+
+		if t.Implements(reflect.TypeFor[Block]()) {
+			return true
+		}
+
+		return false
+	})
+	if diff := cmp.Diff(got, want, exportInfo); diff != "" {
 		t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff)
 	}
 }