From a8f5771aeac5170fee8551ad5f0d714379ed82d2 Mon Sep 17 00:00:00 2001
From: Daniel McCarney
Date: Thu, 18 Feb 2021 06:57:24 -0500
Subject: [PATCH] tools: improved newgtlds.go, removed replace-between. (#1204)

== tools: improved newgtlds.go, removed replace-between.
- https://github.com/publicsuffix/list/commit/d996dadc0c00bdadbbebfa26e3a38fd1c3dc5a7e

Previously the `tools/newgtlds.go` utility was difficult to use with the new
GitHub action based pull-request workflow because it _always_ updated the
timestamp in the header comment of the gTLD section of `public_suffix_list.dat`
when it was run, even if no gTLD data changed. This meant there was a diff in
the workdir after the tool ran, which would cause the action to open a PR. See
https://github.com/publicsuffix/list/pull/1166 for more discussion/background.

In the old Travis version of the automation I used a crude shell pipeline to
exclude the header comment when deciding whether or not there was a diff. With
the new action it's not possible to change how the diff status is determined
(without re-inventing the pull request action in-repo). So instead we either
need to abandon the timestamp in the gTLD section header comment (that would
be sad), or we need the script to be idempotent when there's no true data
change.

To make the tooling smarter about when it makes changes I reworked
`newgtlds.go` so that it can consume the existing `public_suffix_list.dat` and
so that the data templating is split apart from the header comment templating.
Now the tool can exclude the header comment and compare against the existing
data to determine whether there is a real data change. If there's no change in
the data then the file is left untouched. If there is a change then the file
is updated, and only then is the header comment with the date stamp refreshed
along with the new data.

I've included unit tests that get 80% statement coverage of this new tooling
and have also tested it locally with success. Conveniently, this rework also
lets us remove the `tools/replace-between` Perl script, reducing the number of
languages in play for this process.

== CI: re-enable github tld-update workflow
- https://github.com/publicsuffix/list/commit/785683990adb2da19f335c5e33fa7a411be8cbc6

Now that the tooling won't produce PRs without meaningful data updates, we can
re-enable the `tld-update.yml` workflow.

== CI: run go unit tests in tld-update workflow
- https://github.com/publicsuffix/list/commit/f6749a5e9c729f9f36f126ee8920c57083c5921f

Before using the `tools/newgtlds.go` tooling to open a PR we should run the
unit tests to make sure the code works the way we expect. This will also help
catch any bitrot.
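For reference, the intended invocations after this change (a sketch based on
the flags added below; paths assume running from the repository root):

    # Print the updated dat file content to stdout, leaving the file as-is:
    go run tools/newgtlds.go -psl-dat-file=public_suffix_list.dat

    # Update public_suffix_list.dat in place (what tools/patchnewgtlds now runs):
    go run tools/newgtlds.go -overwrite -psl-dat-file=public_suffix_list.dat

    # Run the tooling unit tests (as the tld-update.yml workflow now does):
    go test tools/*.go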
--- .../{tld-update.yml.hold => tld-update.yml} | 3 + .gitignore | 2 +- .travis.yml | 8 +- tools/newgtlds.go | 363 +++++++++++-- tools/newgtlds_test.go | 494 +++++++++++++++++- tools/patchnewgtlds | 8 +- tools/replace-between | 48 -- 7 files changed, 825 insertions(+), 101 deletions(-) rename .github/workflows/{tld-update.yml.hold => tld-update.yml} (96%) delete mode 100755 tools/replace-between diff --git a/.github/workflows/tld-update.yml.hold b/.github/workflows/tld-update.yml similarity index 96% rename from .github/workflows/tld-update.yml.hold rename to .github/workflows/tld-update.yml index 2248924061..7950067241 100644 --- a/.github/workflows/tld-update.yml.hold +++ b/.github/workflows/tld-update.yml @@ -19,6 +19,9 @@ jobs: with: go-version: ^1.15 + - name: Run unit tests + run: go test tools/*.go + - name: Set current date id: get-date run: echo "::set-output name=now::$(date +'%Y-%m-%dT%H:%M:%S %Z')" diff --git a/.gitignore b/.gitignore index 75ec50d1f5..86af741708 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ linter/log libpsl - +coverage.out diff --git a/.travis.yml b/.travis.yml index 3434bacae4..fd1be9d3be 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,12 @@ language: c compiler: gcc -script: make +script: + - make + - go test -v -coverprofile=coverage.out tools/*.go + +go: + - "1.15.x" addons: apt: @@ -18,4 +23,3 @@ addons: - libicu-dev - libunistring0 - libunistring-dev - diff --git a/tools/newgtlds.go b/tools/newgtlds.go index 33cf8e452b..750a661d7e 100644 --- a/tools/newgtlds.go +++ b/tools/newgtlds.go @@ -6,6 +6,7 @@ import ( "bytes" "encoding/json" "errors" + "flag" "fmt" "io" "io/ioutil" @@ -16,10 +17,18 @@ import ( "time" ) -// ICANN_GTLD_JSON_URL is the URL for the ICANN gTLD JSON registry (version -// 2). See https://www.icann.org/resources/pages/registries/registries-en for -// more information. -const ICANN_GTLD_JSON_URL = "https://www.icann.org/resources/registries/gtlds/v2/gtlds.json" +const ( + // ICANN_GTLD_JSON_URL is the URL for the ICANN gTLD JSON registry (version + // 2). See https://www.icann.org/resources/pages/registries/registries-en for + // more information. + ICANN_GTLD_JSON_URL = "https://www.icann.org/resources/registries/gtlds/v2/gtlds.json" + // PSL_GTLDS_SECTION_HEADER marks the start of the newGTLDs section of the + // overall public suffix dat file. + PSL_GTLDS_SECTION_HEADER = "// newGTLDs" + // PSL_GTLDS_SECTION_FOOTER marks the end of the newGTLDs section of the + // overall public suffix dat file. + PSL_GTLDS_SECTION_FOOTER = "// ===END ICANN DOMAINS===" +) var ( // legacyGTLDs are gTLDs that predate ICANN's new gTLD program. These legacy @@ -48,22 +57,31 @@ var ( "xxx": true, } + // pslHeaderTemplate is a parsed text/template instance for rendering the header + // before the data rendered with the pslTemplate. We use two separate templates + // so that we can avoid having a variable date stamp in the pslTemplate, allowing + // us to easily check that the data in the current .dat file is unchanged from + // what we render when there are no updates to add. + // + // Expected template data: + // URL - the string URL that the data was fetched from. + // Date - the time.Date that the data was fetched. + // DateFormat - the format string to use with the date. 
+	pslHeaderTemplate = template.Must(template.New("public-suffix-list-gtlds-header").Parse(`
+// List of new gTLDs imported from {{ .URL }} on {{ .Date.Format .DateFormat }}
+// This list is auto-generated, don't edit it manually.`))
+
 	// pslTemplate is a parsed text/template instance for rendering a list of pslEntry
 	// objects in the format used by the public suffix list.
 	//
 	// It expects the following template data:
-	//   URL - the string URL that the data was fetched from.
-	//   Date - the time.Date that the data was fetched.
 	//   Entries - a list of pslEntry objects.
-	pslTemplate = template.Must(template.New("public-suffix-list-gtlds").Parse(`
-// List of new gTLDs imported from {{ .URL }} on {{ .Date.Format "2006-01-02T15:04:05Z07:00" }}
-// This list is auto-generated, don't edit it manually.
-
+	pslTemplate = template.Must(
+		template.New("public-suffix-list-gtlds").Parse(`
 {{- range .Entries }}
-{{ .Comment }}
-{{ printf "%s\n" .ULabel}}
-{{- end }}
-`))
+{{- .Comment }}
+{{ printf "%s\n" .ULabel }}
+{{ end }}`))
 )
 
 // pslEntry is a struct matching a subset of the gTLD data fields present in
@@ -131,6 +149,176 @@ func (e pslEntry) Comment() string {
 	return strings.Join(parts, " ")
 }
 
+// gTLDDatSpan represents the span between the PSL_GTLDS_SECTION_HEADER and
+// the PSL_GTLDS_SECTION_FOOTER in the PSL dat file.
+type gTLDDatSpan struct {
+	startIndex int
+	endIndex   int
+}
+
+var (
+	errNoHeader = fmt.Errorf("did not find expected header line %q",
+		PSL_GTLDS_SECTION_HEADER)
+	errMultipleHeaders = fmt.Errorf("found expected header line %q more than once",
+		PSL_GTLDS_SECTION_HEADER)
+	errNoFooter = fmt.Errorf("did not find expected footer line %q",
+		PSL_GTLDS_SECTION_FOOTER)
+)
+
+type errInvertedSpan struct {
+	span gTLDDatSpan
+}
+
+func (e errInvertedSpan) Error() string {
+	return fmt.Sprintf(
+		"found footer line %q before header line %q (index %d vs %d)",
+		PSL_GTLDS_SECTION_FOOTER, PSL_GTLDS_SECTION_HEADER,
+		e.span.endIndex, e.span.startIndex)
+}
+
+// validate checks that a given gTLDDatSpan is sensible. It returns an error if
+// the start or end index hasn't been set to > 0, or if the end index is <= the
+// start index.
+func (s gTLDDatSpan) validate() error {
+	if s.startIndex <= 0 {
+		return errNoHeader
+	}
+	if s.endIndex <= 0 {
+		return errNoFooter
+	}
+	if s.endIndex <= s.startIndex {
+		return errInvertedSpan{span: s}
+	}
+	return nil
+}
+
+// datFile holds the individual lines read from the public suffix list dat file and
+// the span that holds the gTLD specific data section. It supports reading the
+// gTLD specific data, and replacing it.
+type datFile struct {
+	// lines holds the dat file contents split by "\n"
+	lines []string
+	// gTLDSpan holds the indexes where the gTLD data can be found in lines.
+	gTLDSpan gTLDDatSpan
+}
+
+type errSpanOutOfBounds struct {
+	span     gTLDDatSpan
+	numLines int
+}
+
+func (e errSpanOutOfBounds) Error() string {
+	return fmt.Sprintf(
+		"span out of bounds: start index %d, end index %d, number of lines %d",
+		e.span.startIndex, e.span.endIndex, e.numLines)
+}
+
+// validate validates the state of the datFile. It returns an error if the
+// gTLD span's validate() returns an error, or if the gTLD span endIndex is >=
+// the number of lines in the file.
+func (d datFile) validate() error { + if err := d.gTLDSpan.validate(); err != nil { + return err + } + if d.gTLDSpan.endIndex >= len(d.lines) { + return errSpanOutOfBounds{span: d.gTLDSpan, numLines: len(d.lines)} + } + return nil +} + +// getGTLDLines returns the lines from the dat file within the gTLD data span, +// or an error if the span isn't valid for the dat file. +func (d datFile) getGTLDLines() ([]string, error) { + if err := d.validate(); err != nil { + return nil, err + } + return d.lines[d.gTLDSpan.startIndex:d.gTLDSpan.endIndex], nil +} + +// ReplaceGTLDContent updates the dat file's lines to replace the gTLD data span +// with new content. +func (d *datFile) ReplaceGTLDContent(content string) error { + if err := d.validate(); err != nil { + return err + } + + contentLines := strings.Split(content, "\n") + beforeLines := d.lines[0:d.gTLDSpan.startIndex] + afterLines := d.lines[d.gTLDSpan.endIndex:] + newLines := append(beforeLines, append(contentLines, afterLines...)...) + + // Update the span based on the new content length + d.gTLDSpan.endIndex = len(beforeLines) + len(contentLines) + // and update the data file lines + d.lines = newLines + return nil +} + +// String returns the dat file's lines joined together. +func (d datFile) String() string { + return strings.Join(d.lines, "\n") +} + +// readDatFile reads the contents of the PSL dat file from the provided path +// and returns a representation holding all of the lines and the span where the gTLD +// data is found within the dat file. An error is returned if the file can't be read +// or if the gTLD data span can't be found or is invalid. +func readDatFile(datFilePath string) (*datFile, error) { + pslDatBytes, err := ioutil.ReadFile(datFilePath) + if err != nil { + return nil, err + } + return readDatFileContent(string(pslDatBytes)) +} + +func readDatFileContent(pslData string) (*datFile, error) { + pslDatLines := strings.Split(pslData, "\n") + + headerIndex, footerIndex := 0, 0 + for i := 0; i < len(pslDatLines); i++ { + line := pslDatLines[i] + + if line == PSL_GTLDS_SECTION_HEADER && headerIndex == 0 { + // If the line matches the header and we haven't seen the header yet, capture + // the index + headerIndex = i + } else if line == PSL_GTLDS_SECTION_HEADER && headerIndex != 0 { + // If the line matches the header and we've already seen the header return + // an error. This is unexpected. + return nil, errMultipleHeaders + } else if line == PSL_GTLDS_SECTION_FOOTER && footerIndex == 0 { + // If the line matches the footer, capture the index. We don't need + // to consider the case where we've already seen a footer because we break + // below when we have both a header and footer index. + footerIndex = i + } + + // Break when we have found one header and one footer. + if headerIndex != 0 && footerIndex != 0 { + break + } + } + + if headerIndex == 0 { + return nil, errNoHeader + } else if footerIndex == 0 { + return nil, errNoFooter + } + + datFile := &datFile{ + lines: pslDatLines, + gTLDSpan: gTLDDatSpan{ + startIndex: headerIndex + 1, + endIndex: footerIndex, + }, + } + if err := datFile.validate(); err != nil { + return nil, err + } + + return datFile, nil +} + // getData performs a HTTP GET request to the given URL and returns the // response body bytes or returns an error. An HTTP response code other than // http.StatusOK (200) is considered to be an error. 
@@ -215,34 +403,121 @@ func getPSLEntries(url string) ([]*pslEntry, error) {
 	return filtered, nil
 }
 
+// renderTemplate renders the given template to the provided writer, using the
+// templateData, or returns an error.
+func renderTemplate(writer io.Writer, template *template.Template, templateData interface{}) error {
+	var buf bytes.Buffer
+	if err := template.Execute(&buf, templateData); err != nil {
+		return err
+	}
+
+	_, err := writer.Write(buf.Bytes())
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// clock is a small interface that lets us mock time in unit tests.
+type clock interface {
+	Now() time.Time
+}
+
+// realClock is an implementation of clock that uses time.Now() natively.
+type realClock struct{}
+
+// Now returns the current time.Time using the system clock.
+func (c realClock) Now() time.Time {
+	return time.Now()
+}
+
+// renderHeader renders the pslHeaderTemplate to the writer or returns an error.
+// The provided clock instance is used for the header's last update timestamp.
+// If no clk instance is provided, realClock is used.
+func renderHeader(writer io.Writer, clk clock) error {
+	if clk == nil {
+		clk = &realClock{}
+	}
+	templateData := struct {
+		URL        string
+		Date       time.Time
+		DateFormat string
+	}{
+		URL:        ICANN_GTLD_JSON_URL,
+		Date:       clk.Now().UTC(),
+		DateFormat: time.RFC3339,
+	}
+
+	return renderTemplate(writer, pslHeaderTemplate, templateData)
+}
+
 // renderData renders the given list of pslEntry objects using the pslTemplate.
-// The rendered template data is written to the provided writer.
-func renderData(entries []*pslEntry, writer io.Writer) error {
+// The rendered template data is written to the provided writer or an error is
+// returned.
+func renderData(writer io.Writer, entries []*pslEntry) error {
 	templateData := struct {
-		URL     string
-		Date    time.Time
 		Entries []*pslEntry
 	}{
-		URL:     ICANN_GTLD_JSON_URL,
-		Date:    time.Now(),
 		Entries: entries,
 	}
 
-	var buf bytes.Buffer
-	if err := pslTemplate.Execute(&buf, templateData); err != nil {
-		return err
+	return renderTemplate(writer, pslTemplate, templateData)
+}
+
+// process handles updating a datFile with new gTLD content. If there are no
+// gTLD updates, the existing dat file's contents will be returned. If there are
+// updates, the new updates will be spliced into place and the updated file
+// contents returned.
+func process(datFile *datFile, dataURL string, clk clock) (string, error) {
+	// Get the lines for the gTLD data span - this includes both the header with the
+	// date and the actual gTLD entries.
+	spanLines, err := datFile.getGTLDLines()
+	if err != nil {
+		return "", err
 	}
 
-	_, err := writer.Write(buf.Bytes())
+	// Render a new header for the gTLD data.
+	var newHeaderBuf strings.Builder
+	if err := renderHeader(&newHeaderBuf, clk); err != nil {
+		return "", err
+	}
+
+	// Figure out how many lines long the header with the dynamic date is.
+	newHeaderLines := strings.Split(newHeaderBuf.String(), "\n")
+	headerLen := len(newHeaderLines)
+
+	// We should have at least that many lines in the existing span data.
+	if len(spanLines) <= headerLen {
+		return "", errors.New("gtld span data was too small, missing header?")
+	}
+
+	// The gTLD data can be found by skipping the header lines.
+	existingData := strings.Join(spanLines[headerLen:], "\n")
+
+	// Fetch new PSL entries.
+	entries, err := getPSLEntries(dataURL)
 	if err != nil {
-		return err
+		return "", err
 	}
-	return nil
+
+	// Render the new gTLD PSL section with the new entries.
+	var newDataBuf strings.Builder
+	if err := renderData(&newDataBuf, entries); err != nil {
+		return "", err
+	}
+
+	// If the newly rendered data doesn't match the existing data then we want to
+	// update the dat file content by replacing the old span with the new content.
+	if newDataBuf.String() != existingData {
+		newContent := newHeaderBuf.String() + "\n" + newDataBuf.String()
+		if err := datFile.ReplaceGTLDContent(newContent); err != nil {
+			return "", err
+		}
+	}
+
+	return datFile.String(), nil
 }
 
-// main will fetch the PSL entires from the ICANN gTLD JSON registry, parse
-// them, normalize them, remove legacy and terminated gTLDs, and finally render
-// them with the pslTemplate, printing the results to standard out.
 func main() {
 	ifErrQuit := func(err error) {
 		if err != nil {
@@ -251,9 +526,35 @@ func main() {
 		}
 	}
 
-	entries, err := getPSLEntries(ICANN_GTLD_JSON_URL)
+	pslDatFile := flag.String(
+		"psl-dat-file",
+		"public_suffix_list.dat",
+		"file path to the public_suffix_list.dat data file to be updated with new gTLDs")
+
+	overwrite := flag.Bool(
+		"overwrite",
+		false,
+		"overwrite -psl-dat-file with the new data instead of printing to stdout")
+
+	// Parse CLI flags.
+	flag.Parse()
+
+	// Read the existing file content and find the span that contains the gTLD data.
+	datFile, err := readDatFile(*pslDatFile)
 	ifErrQuit(err)
 
-	err = renderData(entries, os.Stdout)
+	// Process the dat file.
+	content, err := process(datFile, ICANN_GTLD_JSON_URL, nil)
+	ifErrQuit(err)
+
+	// If we're not overwriting the file, print the content to stdout.
+	if !*overwrite {
+		fmt.Println(content)
+		os.Exit(0)
+	}
+
+	// Otherwise print nothing to stdout and write the content over the existing
+	// pslDatFile path we read earlier.
+	err = ioutil.WriteFile(*pslDatFile, []byte(content), 0644)
 	ifErrQuit(err)
 }
diff --git a/tools/newgtlds_test.go b/tools/newgtlds_test.go
index b73ffb8d0a..cc2e9b005f 100644
--- a/tools/newgtlds_test.go
+++ b/tools/newgtlds_test.go
@@ -5,11 +5,14 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
+	"os"
 	"reflect"
 	"strings"
 	"testing"
+	"time"
 )
 
 func TestEntryNormalize(t *testing.T) {
@@ -124,7 +127,7 @@ type badStatusHandler struct{}
 
 func (h *badStatusHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
 	w.WriteHeader(http.StatusUnavailableForLegalReasons)
-	w.Write([]byte("sorry"))
+	_, _ = w.Write([]byte("sorry"))
 }
 
 func TestGetData(t *testing.T) {
@@ -151,7 +154,7 @@ type mockHandler struct {
 }
 
 func (h *mockHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
-	w.Write(h.respData)
+	_, _ = w.Write(h.respData)
 }
 
 func TestGetPSLEntries(t *testing.T) {
@@ -320,23 +323,486 @@ ceepeeyou
 `
 
 	var buf bytes.Buffer
-	if err := renderData(entries, io.Writer(&buf)); err != nil {
+	if err := renderData(io.Writer(&buf), entries); err != nil {
 		t.Fatalf("unexpected error from renderData: %v", err)
 	}
 
-	rendered := buf.String()
+	if rendered := buf.String(); rendered != expectedList {
+		t.Errorf("expected rendered list content %q, got %q",
+			expectedList, rendered)
+	}
+}
 
-	lines := strings.Split(rendered, "\n")
-	if len(lines) < 3 {
-		t.Fatalf("expected at least two header lines in rendered data. 
"+ - "Found only %d lines", len(lines)) +func TestErrInvertedSpan(t *testing.T) { + err := errInvertedSpan{gTLDDatSpan{startIndex: 50, endIndex: 10}} + expected := `found footer line "// ===END ICANN DOMAINS===" ` + + `before header line "// newGTLDs" (index 10 vs 50)` + if actual := err.Error(); actual != expected { + t.Errorf("expected %#v Error() to return %q got %q", err, expected, actual) } +} - listContent := strings.Join(lines[3:], "\n") - fmt.Printf("Got: \n%s\n", listContent) - fmt.Printf("Expected: \n%s\n", expectedList) - if listContent != expectedList { - t.Errorf("expected rendered list content %q, got %q", - expectedList, listContent) +func TestGTLDDatSpanValidate(t *testing.T) { + testCases := []struct { + name string + span gTLDDatSpan + expected error + }{ + { + name: "no header", + span: gTLDDatSpan{}, + expected: errNoHeader, + }, + { + name: "no footer", + span: gTLDDatSpan{startIndex: 10}, + expected: errNoFooter, + }, + { + name: "inverted", + span: gTLDDatSpan{startIndex: 50, endIndex: 10}, + expected: errInvertedSpan{gTLDDatSpan{startIndex: 50, endIndex: 10}}, + }, + { + name: "valid", + span: gTLDDatSpan{startIndex: 10, endIndex: 20}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if actual := tc.span.validate(); actual != tc.expected { + t.Errorf("expected span %v validate to return %v got %v", + tc.span, tc.expected, actual) + } + }) + } +} + +func TestErrSpanOutOfBounds(t *testing.T) { + err := errSpanOutOfBounds{ + span: gTLDDatSpan{startIndex: 5, endIndex: 50}, + numLines: 20, + } + expected := `span out of bounds: start index 5, end index 50, number of lines 20` + if actual := err.Error(); actual != expected { + t.Errorf("expected %#v Error() to return %q got %q", err, expected, actual) + } +} + +func TestDatFileValidate(t *testing.T) { + testCases := []struct { + name string + file datFile + expected error + }{ + { + name: "bad gTLD span", + file: datFile{gTLDSpan: gTLDDatSpan{}}, + expected: errNoHeader, + }, + { + name: "out of bounds span", + file: datFile{ + lines: []string{"one line"}, + gTLDSpan: gTLDDatSpan{startIndex: 5, endIndex: 10}, + }, + expected: errSpanOutOfBounds{ + span: gTLDDatSpan{startIndex: 5, endIndex: 10}, + numLines: 1, + }, + }, + { + name: "valid", + file: datFile{ + lines: []string{"one line", "two line", "three line", "four"}, + gTLDSpan: gTLDDatSpan{startIndex: 2, endIndex: 3}}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if actual := tc.file.validate(); actual != tc.expected { + t.Errorf("expected dat file %v validate to return %v got %v", + tc.file, tc.expected, actual) + } + }) + } +} + +func TestGetGTLDLines(t *testing.T) { + lines := []string{ + "some junk", // Index 0 + PSL_GTLDS_SECTION_HEADER, // Index 1 + "here be gTLDs", // Index 2 + "so many gTLDs", // Index 3 + PSL_GTLDS_SECTION_FOOTER, // Index 4 + "more junk", // Index 5 + } + file := datFile{ + lines: lines, + gTLDSpan: gTLDDatSpan{startIndex: 2, endIndex: 4}, + } + + expectedLines := []string{ + lines[2], lines[3], + } + + if actual, err := file.getGTLDLines(); err != nil { + t.Errorf("unexpected err: %v", err) + } else if !reflect.DeepEqual(actual, expectedLines) { + t.Errorf("expected %v got %v", expectedLines, actual) + } + + // Now update the gTLDSpan to be invalid and try again + file.gTLDSpan.endIndex = 99 + expectedErr := errSpanOutOfBounds{ + numLines: len(lines), + span: gTLDDatSpan{startIndex: 2, endIndex: 99}, + } + if _, err := file.getGTLDLines(); err != expectedErr { + 
t.Errorf("expected err %v got %v", expectedErr, err) + } +} + +func TestReplaceGTLDContent(t *testing.T) { + origLines := []string{ + "some junk", // Index 0 + PSL_GTLDS_SECTION_HEADER, // Index 1 + "here be gTLDs", // Index 2 + "so many gTLDs", // Index 3 + PSL_GTLDS_SECTION_FOOTER, // Index 4 + "more junk", // Index 5 + } + file := datFile{ + lines: origLines, + gTLDSpan: gTLDDatSpan{startIndex: 2, endIndex: 4}, + } + newLines := []string{ + "new gTLD A", // Index 0 + "new gTLD B", // Index 1 + "new gTLD C", // Index 2 + } + + newContent := strings.Join(newLines, "\n") + if err := file.ReplaceGTLDContent(newContent); err != nil { + t.Errorf("unexpected err %v", err) + } + + expectedLines := []string{ + origLines[0], + origLines[1], + newLines[0], + newLines[1], + newLines[2], + origLines[4], + origLines[5], + } + if !reflect.DeepEqual(file.lines, expectedLines) { + t.Errorf("expected lines to be updated to %v was %v", expectedLines, file.lines) + } + if file.gTLDSpan.endIndex != 5 { + t.Errorf("expected file to have gTLDSpan end updated to 5, was %d", + file.gTLDSpan.endIndex) + } + + // Now update the gTLDSpan to be invalid and try again + file.gTLDSpan.endIndex = 99 + expectedErr := errSpanOutOfBounds{ + numLines: len(expectedLines), + span: gTLDDatSpan{startIndex: 2, endIndex: 99}, + } + if err := file.ReplaceGTLDContent("ignored content"); err != expectedErr { + t.Errorf("expected err %v got %v", expectedErr, err) + } else if !reflect.DeepEqual(file.lines, expectedLines) { + t.Errorf("expected lines to still be %v was changed to %v", + expectedLines, file.lines) + } +} + +func TestDatFileString(t *testing.T) { + file := datFile{ + lines: []string{"hello", "world"}, + } + expected := "hello\nworld" + if actual := file.String(); actual != expected { + t.Errorf("expected file %v String() to be %q was %q", file, expected, actual) + } +} + +func TestReadDatFile(t *testing.T) { + mustWriteTemp := func(t *testing.T, content string) string { + tmpfile, err := ioutil.TempFile("", "dat") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + if _, err := tmpfile.Write([]byte(content)); err != nil { + t.Fatalf("Failed to write temp file: %v", err) + } + if err := tmpfile.Close(); err != nil { + t.Fatalf("Failed to close temp file: %v", err) + } + return tmpfile.Name() + } + + noHeaderContent := strings.Join([]string{ + "foo", + "bar", + }, "\n") + noHeaderFile := mustWriteTemp(t, noHeaderContent) + defer os.Remove(noHeaderFile) + + noFooterContent := strings.Join([]string{ + "foo", + PSL_GTLDS_SECTION_HEADER, + "bar", + }, "\n") + noFooterFile := mustWriteTemp(t, noFooterContent) + defer os.Remove(noFooterFile) + + multiHeaderContent := strings.Join([]string{ + "foo", + PSL_GTLDS_SECTION_HEADER, + "test", + PSL_GTLDS_SECTION_HEADER, + "test", + PSL_GTLDS_SECTION_FOOTER, + "bar", + }, "\n") + multiHeaderFile := mustWriteTemp(t, multiHeaderContent) + defer os.Remove(multiHeaderFile) + + invertedContent := strings.Join([]string{ + "foo", + PSL_GTLDS_SECTION_FOOTER, + "test", + PSL_GTLDS_SECTION_HEADER, + "bar", + }, "\n") + invertedFile := mustWriteTemp(t, invertedContent) + defer os.Remove(invertedFile) + + validContent := strings.Join([]string{ + "foo", // Index 0 + PSL_GTLDS_SECTION_HEADER, // Index 1 + "test", // Index 2 + PSL_GTLDS_SECTION_FOOTER, // Index 3 + "bar", // Index 4 + }, "\n") + validFile := mustWriteTemp(t, validContent) + defer os.Remove(validFile) + + testCases := []struct { + name string + path string + expectedErrMsg string + expectedDatFile *datFile + }{ + 
{ + name: "no such file", + path: "", + expectedErrMsg: "open : no such file or directory", + }, + { + name: "no header", + path: noHeaderFile, + expectedErrMsg: errNoHeader.Error(), + }, + { + name: "no footer", + path: noFooterFile, + expectedErrMsg: errNoFooter.Error(), + }, + { + name: "multiple headers", + path: multiHeaderFile, + expectedErrMsg: errMultipleHeaders.Error(), + }, + { + name: "inverted header/footer", + path: invertedFile, + expectedErrMsg: (errInvertedSpan{gTLDDatSpan{startIndex: 4, endIndex: 1}}).Error(), + }, + { + name: "valid", + path: validFile, + expectedDatFile: &datFile{ + lines: strings.Split(validContent, "\n"), + gTLDSpan: gTLDDatSpan{ + startIndex: 2, + endIndex: 3, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual, err := readDatFile(tc.path) + if err != nil && tc.expectedErrMsg == "" { + t.Errorf("unexpected err: %v", err) + } else if err != nil && err.Error() != tc.expectedErrMsg { + t.Errorf("expected err: %q, got: %q", tc.expectedErrMsg, err.Error()) + } else if err == nil && tc.expectedErrMsg != "" { + t.Errorf("expected err: %q, got: nil", tc.expectedErrMsg) + } else if !reflect.DeepEqual(actual, tc.expectedDatFile) { + t.Errorf("expected dat file: %q, got %q", tc.expectedDatFile, actual) + } + }) + } +} + +type mockClock struct { + fakeUnixTime int64 +} + +func (m mockClock) Now() time.Time { + return time.Unix(m.fakeUnixTime, 0) +} + +func TestProcess(t *testing.T) { + mockHandler := func(content string) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, content) + } + } + + existingData := ` +... + +// newGTLDs + +// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2021-02-07T13:25:56-05:00 +// This list is auto-generated, don't edit it manually. +// aaa : 2015-02-26 American Automobile Association, Inc. +aaa + + +// ===END ICANN DOMAINS=== + +... +` + existingJSON := ` +{ + "gTLDs": [ + { + "contractTerminated": false, + "dateOfContractSignature": "2015-02-26", + "gTLD": "aaa", + "registryOperator": "American Automobile Association, Inc.", + "removalDate": null, + "uLabel": null + } + ] +} +` + + newJSON := ` +{ + "gTLDs": [ + { + "contractTerminated": false, + "dateOfContractSignature": "2015-02-26", + "gTLD": "aaa", + "registryOperator": "American Automobile Association, Inc.", + "removalDate": null, + "uLabel": null + }, + { + "contractTerminated": false, + "dateOfContractSignature": "2014-03-20", + "gTLD": "accountants", + "registryOperator": "Binky Moon, LLC", + "removalDate": null, + "uLabel": null + } + ] +} +` + + fakeClock := mockClock{ + fakeUnixTime: 1612916654, + } + newData := ` +... + +// newGTLDs + +// List of new gTLDs imported from https://www.icann.org/resources/registries/gtlds/v2/gtlds.json on 2021-02-10T00:24:14Z +// This list is auto-generated, don't edit it manually. +// aaa : 2015-02-26 American Automobile Association, Inc. +aaa + +// accountants : 2014-03-20 Binky Moon, LLC +accountants + + +// ===END ICANN DOMAINS=== + +... 
+`
+
+	mustReadDatFile := func(t *testing.T, content string) *datFile {
+		datFile, err := readDatFileContent(content)
+		if err != nil {
+			t.Fatalf("failed to readDatFileContent %q: %v", content, err)
+		}
+		return datFile
+	}
+
+	testCases := []struct {
+		name            string
+		file            *datFile
+		pslJSON         string
+		expectedErrMsg  string
+		expectedContent string
+	}{
+		{
+			name:           "bad span",
+			file:           &datFile{},
+			expectedErrMsg: errNoHeader.Error(),
+		},
+		{
+			name: "span too small",
+			file: &datFile{
+				lines:    []string{"a", "b", "c"},
+				gTLDSpan: gTLDDatSpan{startIndex: 1, endIndex: 2},
+			},
+			expectedErrMsg: "gtld span data was too small, missing header?",
+		},
+		{
+			name:            "no change in data",
+			file:            mustReadDatFile(t, existingData),
+			pslJSON:         existingJSON,
+			expectedContent: existingData,
+		},
+		{
+			name:            "change in data",
+			file:            mustReadDatFile(t, existingData),
+			pslJSON:         newJSON,
+			expectedContent: newData,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			s := httptest.NewServer(mockHandler(tc.pslJSON))
+			defer s.Close()
+
+			content, err := process(tc.file, s.URL, fakeClock)
+			if err != nil && tc.expectedErrMsg == "" {
+				t.Errorf("unexpected err: %v", err)
+			} else if err != nil && err.Error() != tc.expectedErrMsg {
+				t.Errorf("expected err: %q, got: %q", tc.expectedErrMsg, err.Error())
+			} else if err == nil && tc.expectedErrMsg != "" {
+				t.Errorf("expected err: %q, got: nil", tc.expectedErrMsg)
+			} else if content != tc.expectedContent {
+				fmt.Printf("got content:\n%s", content)
+				fmt.Printf("expected content:\n%s", tc.expectedContent)
+				t.Errorf("expected content: %q, got %q", tc.expectedContent, content)
+			}
+		})
+	}
+}
diff --git a/tools/patchnewgtlds b/tools/patchnewgtlds
index baecf2cbcd..ddc93a9007 100755
--- a/tools/patchnewgtlds
+++ b/tools/patchnewgtlds
@@ -11,8 +11,6 @@ fi
 
 BASEDIR=$(dirname "$0")
 
-go run "$BASEDIR/newgtlds.go" | \
-	"$BASEDIR/replace-between" \
-	"$BASEDIR/../public_suffix_list.dat" \
-	"// newGTLDs" \
-	"// ===END ICANN DOMAINS"
+go run "$BASEDIR/newgtlds.go" \
+	-overwrite \
+	-psl-dat-file="$BASEDIR/../public_suffix_list.dat"
diff --git a/tools/replace-between b/tools/replace-between
deleted file mode 100755
index d6ea40ad5f..0000000000
--- a/tools/replace-between
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/perl -w
-#
-# This script takes a target file, a start marker and an end marker, and
-# replaces the text in that file between those two markers with some
-# alternative text from another file or from STDIN.
-
-binmode STDIN, ':utf8';
-
-usage() if (!$ARGV[2]);
-
-my ($base_filename, $start_marker, $end_marker, $insert_filename) = @ARGV;
-
-my $base = read_file_utf8($base_filename);
-
-my $new;
-if ($insert_filename) {
-	$new = read_file_utf8($insert_filename);
-}
-else {
-	$new = do { local $/; <STDIN> };
-}
-
-$base =~ s/\Q$start_marker\E.*\Q$end_marker\E/$start_marker\n$new\n$end_marker/s;
-
-write_file_utf8($base_filename, $base);
-
-sub usage {
-	print "Usage: replace-between <BASE_FILENAME> START_MARKER END_MARKER <INSERT_FILENAME>\n";
-	print "Or, give data to insert on STDIN.\n";
-	exit(1);
-}
-
-sub read_file_utf8 {
-	my $name = shift;
-	open my $fh, '<:encoding(UTF-8)', $name
-		or die "Couldn't open '$name': $!";
-	local $/;
-	my $data = <$fh>;
-	return $data;
-};
-
-sub write_file_utf8 {
-	my $name = shift;
-	open my $fh, '>:encoding(UTF-8)', $name
-		or die "Couldn't create '$name': $!";
-	local $/;
-	print {$fh} $_ for @_;
-};