From 3ea913a12426d1be5fb07903d30892920d175095 Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Mon, 14 Oct 2024 14:25:15 -0700 Subject: [PATCH] tools/internal/parser: check TXT records (#2213) Supports checking a single PR, as well as re-verifying the entire PSL. --- tools/go.mod | 5 +- tools/go.sum | 2 + tools/internal/parser/errors.go | 51 ++ tools/internal/parser/exceptions.go | 814 +++++++++++++++++++++++++++- tools/internal/parser/validate.go | 335 ++++++++++++ tools/psltool/psltool.go | 21 +- 6 files changed, 1220 insertions(+), 8 deletions(-) diff --git a/tools/go.mod b/tools/go.mod index 97cc4aa32..1660b4cbf 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -16,4 +16,7 @@ require ( github.com/natefinch/atomic v1.0.1 ) -require github.com/google/go-querystring v1.1.0 // indirect +require ( + github.com/creachadair/taskgroup v0.9.0 // indirect + github.com/google/go-querystring v1.1.0 // indirect +) diff --git a/tools/go.sum b/tools/go.sum index ea37ffd4f..09822230a 100644 --- a/tools/go.sum +++ b/tools/go.sum @@ -4,6 +4,8 @@ github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad h1:Fv6FRWgCJTHssl github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad/go.mod h1:K8bFvn8hMdAljQkaKNc7I3os5Wk36JxkyCkfdZ7S8d4= github.com/creachadair/mds v0.15.2 h1:es1qGKgRGSaztpvrSQcZ0B9I6NsHYJ1Sa9naD/3OfCM= github.com/creachadair/mds v0.15.2/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo= +github.com/creachadair/taskgroup v0.9.0 h1:kzXSea5C7R5DtnKFBOTEW3hvmCkiVnRkODMVDMgSS6k= +github.com/creachadair/taskgroup v0.9.0/go.mod h1:+1hJc8zL1rQkxcMVqEYJ0UPGtwl6Iz1+fd4zcOLtt+A= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= diff --git a/tools/internal/parser/errors.go b/tools/internal/parser/errors.go index ee3871d84..fcbebf726 100644 --- a/tools/internal/parser/errors.go +++ b/tools/internal/parser/errors.go @@ -181,3 +181,54 @@ type ErrConflictingSuffixAndException struct { func (e ErrConflictingSuffixAndException) Error() string { return fmt.Sprintf("%s: suffix %s conflicts with exception in wildcard at %s", e.LocationString(), e.Domain, e.Wildcard.LocationString()) } + +type ErrMissingTXTRecord struct { + Block +} + +func (e ErrMissingTXTRecord) Error() string { + var name string + switch v := e.Block.(type) { + case *Suffix: + name = v.Domain.String() + case *Wildcard: + name = v.Domain.String() + default: + panic(fmt.Sprintf("unexpected block type %T in ErrInvalidTXTRecord", e.Block)) + } + return fmt.Sprintf("%s: suffix %s has no TXT record", e.SrcRange().LocationString(), name) +} + +type ErrTXTRecordMismatch struct { + Block + PR int +} + +func (e ErrTXTRecordMismatch) Error() string { + switch v := e.Block.(type) { + case *Suffix: + return fmt.Sprintf("%s: suffix %s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this suffix", e.SrcRange().LocationString(), v.Domain, e.PR) + case *Wildcard: + return fmt.Sprintf("%s: wildcard *.%s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this wildcard", e.SrcRange().LocationString(), v.Domain, e.PR) + default: + panic(fmt.Sprintf("unexpected block type %T in ErrTXTRecordMismatch", e.Block)) + } +} + +type ErrTXTCheckFailure struct { + Block + Err error +} + +func (e ErrTXTCheckFailure) Error() string { + var name string + switch v := e.Block.(type) { + case *Suffix: + name = v.Domain.String() + case *Wildcard: + name = v.Domain.String() + default: + panic(fmt.Sprintf("unexpected block type %T in ErrInvalidTXTRecord", e.Block)) + } + return fmt.Sprintf("%s: error checking suffix %s: %v", e.SrcRange().LocationString(), name, e.Err) +} diff --git a/tools/internal/parser/exceptions.go b/tools/internal/parser/exceptions.go index b1d8d564f..b7db258cd 100644 --- a/tools/internal/parser/exceptions.go +++ b/tools/internal/parser/exceptions.go @@ -1,6 +1,12 @@ package parser -import "slices" +import ( + "slices" + "strings" + + "github.com/creachadair/mds/mapset" + "github.com/publicsuffix/list/tools/internal/domain" +) // Exceptions are parts of the PSL that would fail current validation // and stylistic requirements, but are exempted due to predating those @@ -21,6 +27,40 @@ func exemptFromSorting(entity string) bool { return slices.Contains(incorrectSort, entity) } +// exemptFromTXT reports whether the given domain name is exempt from +// the requirement to have a _psl TXT record. +func exemptFromTXT(domain domain.Name) bool { + s := domain.String() + if missingTXT.Has(domain.String()) { + return true + } + for _, suffix := range missingTXTAmazonSuffixes { + if strings.HasSuffix(s, suffix) { + return true + } + } + return false +} + +// adjustTXTPR returns a PR number to use instead of prNum for +// checking, or prNum unchanged if no adjustment is needed. +func adjustTXTPR(prNum int) int { + if ret, ok := txtReplacePRs[prNum]; ok { + return ret + } + return prNum +} + +// acceptPRForDomain reports whether the given prNum should be +// accepted as verification for the given domain, without further +// checking of the Github PR. +func acceptPRForDomain(domain domain.Name, prNum int) bool { + if exc, ok := txtAcceptPRs[domain.String()]; ok && exc == prNum { + return true + } + return false +} + // missingEmail are source code blocks in the private domains section // that are allowed to lack email contact information. var missingEmail = []string{ @@ -121,3 +161,775 @@ var incorrectSort = []string{ "VeryPositive SIA", "V.UA Domain Administrator", } + +// missingTXT are the domains that are exempt from the _psl TXT record +// requirement. +var missingTXT = mapset.New( + "001www.com", + "0emm.com", + "4u.com", + "accesscam.org", + "ac.ru", + "adimo.co.uk", + "ae.org", + "africa.com", + "akadns.net", + "akamaiedge.net", + "akamaiedge-staging.net", + "akamaihd.net", + "akamaihd-staging.net", + "akamai.net", + "akamaiorigin.net", + "akamaiorigin-staging.net", + "akamai-staging.net", + "akamaized.net", + "akamaized-staging.net", + "al.eu.org", + "api.stdlib.com", + "ap.ngrok.io", + "app.render.com", + "appspot.com", + "art.pl", + "asso.eu.org", + "at-band-camp.net", + "at.eu.org", + "ath.cx", + "au.eu.org", + "au.ngrok.io", + "aus.basketball", + "azure-api.net", + "azureedge.net", + "azurefd.net", + "azure-mobile.net", + "azurewebsites.net", + "barrell-of-knowledge.info", + "barrel-of-knowledge.info", + "bci.dnstrace.pro", + "beagleboard.io", + "be.eu.org", + "beta.bounty-full.com", + "betainabox.com", + "better-than.tv", + "bg.eu.org", + "biz.at", + "biz.ua", + "blob.core.windows.net", + "blogdns.com", + "blogdns.net", + "blogdns.org", + "blogsite.org", + "blogspot.ae", + "blogspot.al", + "blogspot.am", + "blogspot.ba", + "blogspot.be", + "blogspot.bg", + "blogspot.bj", + "blogspot.ca", + "blogspot.cf", + "blogspot.ch", + "blogspot.cl", + "blogspot.co.at", + "blogspot.co.id", + "blogspot.co.il", + "blogspot.co.ke", + "blogspot.com", + "blogspot.com.ar", + "blogspot.com.au", + "blogspot.com.br", + "blogspot.com.by", + "blogspot.com.co", + "blogspot.com.cy", + "blogspot.com.ee", + "blogspot.com.eg", + "blogspot.com.es", + "blogspot.com.mt", + "blogspot.com.ng", + "blogspot.com.tr", + "blogspot.com.uy", + "blogspot.co.nz", + "blogspot.co.uk", + "blogspot.co.za", + "blogspot.cv", + "blogspot.cz", + "blogspot.de", + "blogspot.dk", + "blogspot.fi", + "blogspot.fr", + "blogspot.gr", + "blogspot.hk", + "blogspot.hr", + "blogspot.hu", + "blogspot.ie", + "blogspot.in", + "blogspot.is", + "blogspot.it", + "blogspot.jp", + "blogspot.kr", + "blogspot.li", + "blogspot.lt", + "blogspot.lu", + "blogspot.md", + "blogspot.mk", + "blogspot.mr", + "blogspot.mx", + "blogspot.my", + "blogspot.nl", + "blogspot.no", + "blogspot.pe", + "blogspot.pt", + "blogspot.qa", + "blogspot.re", + "blogspot.ro", + "blogspot.rs", + "blogspot.ru", + "blogspot.se", + "blogspot.sg", + "blogspot.si", + "blogspot.sk", + "blogspot.sn", + "blogspot.tw", + "blogspot.ug", + "blogspot.vn", + "bloxcms.com", + "boldlygoingnowhere.org", + "bookonline.app", + "br.com", + "broke-it.net", + "buyshouses.net", + "ca.eu.org", + "camdvr.org", + "casacam.net", + "cd.eu.org", + "cechire.com", + "certmgr.org", + "ch.eu.org", + "cloudapp.net", + "cloudfront.net", + "cloudfunctions.net", + "cloudns.biz", + "cloudns.in", + "cloudns.info", + "cloudns.us", + "cn.com", + "cn.eu.org", + "co.business", + "co.ca", + "co.com", + "co.cz", + "codeberg.page", + "codespot.com", + "co.education", + "co.events", + "co.financial", + "co.krd", + "com.de", + "com.se", + "co.network", + "co.nl", + "co.no", + "co.pl", + "co.place", + "co.technology", + "co.ua", + "cryptonomic.net", + "csx.cc", + "curv.dev", + "cy.eu.org", + "cyon.link", + "cyon.site", + "cz.eu.org", + "daplie.me", + "ddnsfree.com", + "ddnsgeek.com", + "ddnss.de", + "de.com", + "de.eu.org", + "demo.datadetect.com", + "diskstation.me", + "dk.eu.org", + "dnsalias.com", + "dnsalias.net", + "dnsalias.org", + "dnsdojo.com", + "dnsdojo.net", + "dnsdojo.org", + "dnshome.de", + "does-it.net", + "doesntexist.com", + "doesntexist.org", + "dontexist.com", + "dontexist.net", + "dontexist.org", + "doomdns.com", + "doomdns.org", + "dreamhosters.com", + "drud.us", + "dscloud.biz", + "dscloud.me", + "dscloud.mobi", + "dsmynas.com", + "dsmynas.net", + "dsmynas.org", + "duckdns.org", + "dvrdns.org", + "dynalias.com", + "dynalias.net", + "dynalias.org", + "dynathome.net", + "dyndns-at-home.com", + "dyndns-at-work.com", + "dyndns.biz", + "dyndns-blog.com", + "dyndns.dappnode.io", + "dyndns-free.com", + "dyndns-home.com", + "dyndns.info", + "dyndns-ip.com", + "dyndns-mail.com", + "dyndns-office.com", + "dyndns.org", + "dyndns-pics.com", + "dyndns-remote.com", + "dyndns-server.com", + "dyndns.tv", + "dyndns-web.com", + "dyndns-wiki.com", + "dyndns-work.com", + "dyndns.ws", + "dyn-o-saur.com", + "dynu.net", + "dynv6.net", + "e4.cz", + "edgekey.net", + "edgekey-staging.net", + "edgesuite.net", + "edgesuite-staging.net", + "edu.eu.org", + "edu.krd", + "edu.ru", + "ee.eu.org", + "endofinternet.net", + "endofinternet.org", + "endoftheinternet.org", + "es.eu.org", + "est-a-la-maison.com", + "est-a-la-masion.com", + "est-le-patron.com", + "est-mon-blogueur.com", + "eu.com", + "eu.ngrok.io", + "eu.org", + "familyds.com", + "familyds.net", + "familyds.org", + "fi.eu.org", + "firebaseapp.com", + "fly.dev", + "for-better.biz", + "forgot.her.name", + "forgot.his.name", + "for-more.biz", + "for-our.info", + "for-some.biz", + "for-the.biz", + "freeddns.org", + "free.hr", + "fr.eu.org", + "from-ak.com", + "from-al.com", + "from-ar.com", + "from-az.net", + "from-ca.com", + "from-co.net", + "from-ct.com", + "from-dc.com", + "from-de.com", + "from-fl.com", + "from-ga.com", + "from-hi.com", + "from-ia.com", + "from-id.com", + "from-il.com", + "from-in.com", + "from-ks.com", + "from-ky.com", + "from-la.net", + "from-ma.com", + "from-md.com", + "from-me.org", + "from-mi.com", + "from-mn.com", + "from-mo.com", + "from-ms.com", + "from-mt.com", + "from-nc.com", + "from-nd.com", + "from-ne.com", + "from-nh.com", + "from-nj.com", + "from-nm.com", + "from-nv.com", + "from-ny.net", + "from-oh.com", + "from-ok.com", + "from-or.com", + "from-pa.com", + "from-pr.com", + "from-ri.com", + "from-sc.com", + "from-sd.com", + "from-tn.com", + "from-tx.com", + "from-ut.com", + "from-va.com", + "from-vt.com", + "from-wa.com", + "from-wi.com", + "from-wv.com", + "from-wy.com", + "ftpaccess.cc", + "fuettertdasnetz.de", + "game-host.org", + "game-server.cc", + "gb.net", + "gdansk.pl", + "gda.pl", + "gdynia.pl", + "getmyip.com", + "gets-it.net", + "giize.com", + "github.io", + "githubusercontent.com", + "gleeze.com", + "gliwice.pl", + "go.dyndns.org", + "googleapis.com", + "googlecode.com", + "gotdns.com", + "gotdns.org", + "gotpantheon.com", + "gov.ru", + "gr.com", + "gr.eu.org", + "groks-the.info", + "groks-this.info", + "gsj.bz", + "günstigbestellen.de", + "günstigliefern.de", + "ham-radio-op.net", + "hashbang.sh", + "hepforge.org", + "here-for-more.info", + "herokuapp.com", + "herokussl.com", + "hk.com", + "hk.org", + "hobby-site.com", + "hobby-site.org", + "homedns.org", + "home.dyndns.org", + "homeftp.net", + "homeftp.org", + "homeip.net", + "homelink.one", + "homelinux.com", + "homelinux.net", + "homelinux.org", + "homeunix.com", + "homeunix.net", + "homeunix.org", + "hosting.ovh.net", + "hr.eu.org", + "hu.eu.org", + "hu.net", + "hzc.io", + "i234.me", + "iamallama.com", + "id.firewalledreplit.co", + "id.forgerock.io", + "ie.eu.org", + "iki.fi", + "il.eu.org", + "inc.hk", + "ind.mom", + "in.eu.org", + "info.at", + "in.net", + "in.ngrok.io", + "int.eu.org", + "in-the-band.net", + "int.ru", + "is-a-anarchist.com", + "is-a-blogger.com", + "is-a-bookkeeper.com", + "is-a-bruinsfan.org", + "is-a-bulls-fan.com", + "is-a-candidate.org", + "is-a-caterer.com", + "is-a-celticsfan.org", + "is-a-chef.com", + "is-a-chef.net", + "is-a-chef.org", + "is-a-conservative.com", + "is-a-cpa.com", + "is-a-cubicle-slave.com", + "is-a-democrat.com", + "is-a-designer.com", + "is-a-doctor.com", + "is-a-financialadvisor.com", + "is-a-geek.com", + "isa-geek.com", + "is-a-geek.net", + "isa-geek.net", + "is-a-geek.org", + "isa-geek.org", + "is-a-green.com", + "is-a-guru.com", + "is-a-hard-worker.com", + "isa-hockeynut.com", + "is-a-hunter.com", + "is-a-knight.org", + "is-a-landscaper.com", + "is-a-lawyer.com", + "is-a-liberal.com", + "is-a-libertarian.com", + "is-a-linux-user.org", + "is-a-llama.com", + "is-a-musician.com", + "is-an-accountant.com", + "is-an-actor.com", + "is-an-actress.com", + "is-an-anarchist.com", + "is-an-artist.com", + "is-a-nascarfan.com", + "is-an-engineer.com", + "is-an-entertainer.com", + "is-a-nurse.com", + "is-a-painter.com", + "is-a-patsfan.org", + "is-a-personaltrainer.com", + "is-a-photographer.com", + "is-a-player.com", + "is-a-republican.com", + "is-a-rockstar.com", + "is-a-socialist.com", + "is-a-soxfan.org", + "is-a-student.com", + "is-a-teacher.com", + "is-a-techie.com", + "is-a-therapist.com", + "is-by.us", + "is-certified.com", + "is.eu.org", + "is-found.org", + "is-gone.com", + "is-into-anime.com", + "is-into-cars.com", + "is-into-cartoons.com", + "is-into-games.com", + "is-leet.com", + "is-lost.org", + "is-not-certified.com", + "is-saved.org", + "is-slick.com", + "issmarterthanyou.com", + "isteingeek.de", + "istmein.de", + "is-uberleet.com", + "is-very-bad.org", + "is-very-evil.org", + "is-very-good.org", + "is-very-nice.org", + "is-very-sweet.org", + "is-with-theband.com", + "it.eu.org", + "jp.eu.org", + "jpn.com", + "jp.net", + "jp.ngrok.io", + "js.wpenginepowered.com", + "keymachine.de", + "kicks-ass.net", + "kicks-ass.org", + "knowsitall.info", + "kozow.com", + "krakow.pl", + "kr.eu.org", + "land-4-sale.us", + "lebtimnetz.de", + "leitungsen.de", + "lib.de.us", + "likescandy.com", + "likes-pie.com", + "lk3.ru", + "localhost.daplie.me", + "loseyourip.com", + "ltd.hk", + "lt.eu.org", + "lu.eu.org", + "lv.eu.org", + "mazeplay.com", + "med.pl", + "me.eu.org", + "memset.net", + "merseine.nu", + "mex.com", + "mil.ru", + "mine.nu", + "miniserver.com", + "misconfused.org", + "mk.eu.org", + "moonscale.net", + "mt.eu.org", + "myasustor.com", + "mydatto.com", + "myddns.rocks", + "mydrobo.com", + "myds.me", + "my.eu.org", + "mypep.link", + "mypets.ws", + "myphotos.cc", + "mywire.org", + "neat-url.com", + "net.eu.org", + "netfy.app", + "nfshost.com", + "ng.eu.org", + "nl.eu.org", + "no.eu.org", + "now-dns.top", + "nz.basketball", + "nz.eu.org", + "office-on-the.net", + "onfabrica.com", + "on-rancher.cloud", + "onred.one", + "onrender.com", + "on-rio.io", + "on-the-web.tv", + "ooguy.com", + "operaunite.com", + "outsystemscloud.com", + "ownprovider.com", + "ox.rs", + "pagespeedmobilizer.com", + "paris.eu.org", + "platter-app.com", + "pl.eu.org", + "podzone.net", + "podzone.org", + "poznan.pl", + "pp.ua", + "protonet.io", + "pt.eu.org", + "qa2.com", + "q-a.eu.org", + "qbuser.com", + "rackmaze.com", + "rackmaze.net", + "readmyblog.org", + "realm.cz", + "repl.run", + "rhcloud.com", + "ric.jelastic.vps-host.net", + "ro.eu.org", + "routingthecloud.net", + "ru.com", + "ru.eu.org", + "sa.com", + "sandcats.io", + "sa.ngrok.io", + "saves-the-whales.com", + "scrapper-site.net", + "scrapping.cc", + "sdscloud.pl", + "secaas.hk", + "se.eu.org", + "selfip.biz", + "selfip.com", + "selfip.info", + "selfip.net", + "selfip.org", + "sells-for-less.com", + "sells-for-u.com", + "sells-it.net", + "sellsyourhome.org", + "se.net", + "senseering.net", + "servebbs.com", + "servebbs.net", + "servebbs.org", + "serveftp.net", + "serveftp.org", + "servegame.org", + "servicebus.windows.net", + "service.gov.uk", + "shacknet.nu", + "shoparena.pl", + "shopware.store", + "si.eu.org", + "simple-url.com", + "sk.eu.org", + "sn.mynetname.net", + "sopot.pl", + "spacekit.io", + "space-to-rent.com", + "staging.onred.one", + "static.observableusercontent.com", + "stg.dev", + "stuff-4-sale.org", + "stuff-4-sale.us", + "synology.me", + "teaches-yoga.com", + "test.ru", + "theworkpc.com", + "thruhere.net", + "tickets.io", + "toolforge.org", + "townnews-staging.com", + "traeumtgerade.de", + "trafficmanager.net", + "translate.goog", + "tr.eu.org", + "uk.com", + "uk.eu.org", + "uk.net", + "us.com", + "user.party.eus", + "us.eu.org", + "us.ngrok.io", + "us.org", + "vercel.app", + "vercel.dev", + "virtual-user.de", + "vpnplus.to", + "webhop.biz", + "webhop.info", + "webhop.net", + "webhop.org", + "webpaas.ovh.net", + "webredirect.org", + "withgoogle.com", + "withyoutube.com", + "wmcloud.org", + "wmflabs.org", + "worse-than.tv", + "wpenginepowered.com", + "wpmucdn.com", + "writesthisblog.com", + "wroc.pl", + "xen.prgmr.com", + "yolasite.com", + "za.bz", + "za.com", + "zakopane.pl", + "za.net", + "za.org", + + // Amazon suffixes, managed by a different bulk process. + "amplifyapp.com", +) + +// missingTXTAmazonSuffixes are Amazon domains that are exempt from +// the _psl TXT requirement, due to managing their submissions through +// a separate high-volume process. +var missingTXTAmazonSuffixes = []string{ + ".amazonaws.com", + ".amazonaws.com.cn", + ".amazoncognito.com", +} + +// txtReplacePRs substitutes some TXT PR numbers for +// replacements. This is to paper over some early PSL submissions that +// used the _psl process, where the domain owners complied with all +// PSL policies, but for various reasons their own PR was closed and a +// PSL maintainer sent and merged their own PR with the same change. +// +// While the _psl record is technically not quite right, since it +// points to a PR that was never merged, the owners of the suffixes in +// question did everything right, so it wouldn't be fair to punish +// them for the break in the chain of custody. +var txtReplacePRs = map[int]int{ + // Lukanet Ltd domains. + 596: 652, + // CloudAccess.net + 372: 466, + // .fr update + 1621: 1823, + // bounty-full.com + 104: 230, + // drayddns.com + 451: 475, + // dy.fi + 95: 229, + // activetrail.biz + 1541: 1655, + // freeboxos + 129: 232, + // rs.ba + 1289: 1367, + // Hoplix + 1282: 1405, + 1364: 1405, +} + +// txtAcceptPRs maps suffix strings to a PR number that can be +// accepted for that suffix without further checking. +// +// This is to work around situations similar to txtReplacePRs, where +// the suffix owners followed all the rules, but the change was either +// merged separately from the PR, or Github's API has bad data and +// returns bogus information that prevents us from tracing the change. +var txtAcceptPRs = map[string]int{ + // Change pushed to master by maintainer rather than merging PR. + "js.org": 264, + + // https://github.com/publicsuffix/list/pull/174 . API returns + // bogus merge SHA. + "xenapponazure.com": 174, + "biz.dk": 174, + "co.dk": 174, + "firm.dk": 174, + "reg.dk": 174, + "store.dk": 174, + + // https://github.com/publicsuffix/list/pull/155 . API returns + // bogus merge SHA. + "pantheonsite.io": 155, + + // https://github.com/publicsuffix/list/pull/10 . API returns + // bogus merge SHA. + "bmoattachments.org": 10, + + // https://github.com/publicsuffix/list/pull/15 . API returns + // bogus merge SHA. + "c.cdn77.org": 15, + "cdn77-ssl.net": 15, + "r.cdn77.net": 15, + "rsc.cdn77.org": 15, + "ssl.origin.cdn77-secure.org": 15, + + // PR sent and then closed due to unresponsiveness, but the suffix + // owner updated their _psl record. + "hb.cldmail.ru": 1871, + + // https://github.com/publicsuffix/list/pull/1473 shuffled around + // some other domains owned by the same entities, but they updated + // all their TXT records not just the changed ones. + "ngo.ng": 1473, + "orx.biz": 1473, + "biz.gl": 1473, + "ltd.ng": 1473, + "col.ng": 1473, + "gen.ng": 1473, + + // https://github.com/publicsuffix/list/pull/1851 proposed moving + // priv.at to the ICANN section and set up a _psl record to prove + // ownership, but the change was rejected. That means the PR is + // technically invalid for this TXT record, but it does + // demonstrate ownership by the PR's author so is useful to keep + // around as a "valid" entry with an exception. + "priv.at": 1851, +} diff --git a/tools/internal/parser/validate.go b/tools/internal/parser/validate.go index 4c851ded8..8661128a8 100644 --- a/tools/internal/parser/validate.go +++ b/tools/internal/parser/validate.go @@ -1,7 +1,17 @@ package parser import ( + "context" + "errors" + "log" + "net" + "strconv" + "strings" + "github.com/creachadair/mds/mapset" + "github.com/creachadair/taskgroup" + "github.com/publicsuffix/list/tools/internal/domain" + "github.com/publicsuffix/list/tools/internal/github" ) // ValidateOffline runs offline validations on a parsed PSL. @@ -109,3 +119,328 @@ func validateSuffixUniqueness(block Block) (errs []error) { return errs } + +// ValidateOnline runs online validations on a parsed PSL. Online +// validations are slower than offline validation, especially when +// checking the entire PSL. All online validations respect +// cancellation on the given context. +func ValidateOnline(ctx context.Context, l *List) (errs []error) { + for _, section := range BlocksOfType[*Section](l) { + if section.Name == "PRIVATE DOMAINS" { + errs = append(errs, validateTXTRecords(ctx, section)...) + break + } + } + return errs +} + +const ( + // concurrentDNSRequests is the maximum number of in-flight TXT + // lookups. This is primarily limited by the ability of the local + // stub resolver and LAN resolver to absorb the traffic without + // dropping packets. This is set very conservatively to avoid + // false positives. + concurrentDNSRequests = 10 + // concurrentGithubRequests is the maximum number of in-flight + // Github API requests. This is limited to avoid tripping Github + // DoS protections. Github seems to have made their rate limits + // more aggressive recently, so we stick to something very slow. + concurrentGithubRequests = 10 + + // txtRecordPrefix is the prefix of valid _psl TXT records. + txtRecordPrefix = "https://github.com/publicsuffix/list/pull/" +) + +// prExpected is the set of changed suffixes we expect to see in a +// particular PR, given TXT record information. +type prExpected struct { + // suffixes maps a domain string (Suffix.Domain.String()) to the + // Suffix entry in the PSL that's being validated. + suffixes map[string]*Suffix + // wildcards maps a wildcard domain (Wildcard.Domain.String()) to + // the Wildcard entry in the PSL that's being validated. + wildcards map[string]*Wildcard +} + +// txtRecordChecker is the in-flight state for validateTXTRecords. +type txtRecordChecker struct { + // ctx is the parent context for the validation. In-flight + // validation should abort if this context gets canceled + // (e.g. user hit ctrl+c, or timeout was reached) + ctx context.Context + + // gh is the Github API client used to look up PRs and commits. + gh github.Client + // resolver is the DNS resolver used to do TXT lookups. + resolver net.Resolver + + // errs accumulates the TXT validation errors discovered during + // the checking process. + errs []error + + // prExpected records what changes we expect to see in PRs, based + // on TXT lookups. For example, if a TXT lookup for _psl.foo.com + // points to PR 123, this map will have an entry for 123 -> + // {suffixes: [foo.com]}, meaning if we look at PR 123 on github, + // we want to see a change for foo.com. + prExpected map[int]*prExpected +} + +// validateTXTRecords checks the TXT records of all Suffix and +// Wildcard blocks found under b. +func validateTXTRecords(ctx context.Context, b Block) (errs []error) { + checker := txtRecordChecker{ + ctx: ctx, + prExpected: map[int]*prExpected{}, + } + + // TXT checking happens in two phases: first, look up all TXT + // records and construct a map of suffix<>github PR map (and + // record errors for missing/malformed records, of course). + // + // Then, check the Github PRs and verify that they are changing + // the expected domains. We do this so that someone cannot point + // their TXT record to some random PR and pass our validation + // check, the PR must be changing the correct suffix(es). + // + // Both TXT lookups and PR verification are parallelized. This + // complicates the code below slightly due to the use of + // taskgroup, but it results in a 10-100x speedup compared to + // serialized verification. + + // TXT record checking. The checkTXT function (below) is the + // parallel part of the process, processLookupResult collects the + // results/errors. + collect := taskgroup.NewCollector(checker.processLookupResult) + group, start := taskgroup.New(nil).Limit(concurrentDNSRequests) + + for _, suf := range BlocksOfType[*Suffix](b) { + if !suf.Changed() || exemptFromTXT(suf.Domain) { + continue + } + start(collect.NoError(func() txtResult { return checker.checkTXT(suf, suf.Domain) })) + } + for _, wild := range BlocksOfType[*Wildcard](b) { + if !wild.Changed() || exemptFromTXT(wild.Domain) { + continue + } + start(collect.NoError(func() txtResult { return checker.checkTXT(wild, wild.Domain) })) + } + + group.Wait() + + // PR verification. Now that TXT lookups are complete, + // checker.prExpected has a list of PRs, and the list of changes + // we want to see in each PR. The checkPR function is the parallel + // part and does all the work, the collector just concats all the + // validation errors together. + collectPR := taskgroup.NewCollector(func(errs []error) { + checker.errs = append(checker.errs, errs...) + }) + group, start = taskgroup.New(nil).Limit(concurrentGithubRequests) + + for prNum, info := range checker.prExpected { + start(collectPR.NoError(func() []error { return checker.checkPR(prNum, info) })) + } + group.Wait() + + return checker.errs +} + +// txtResult is the result of one DNS TXT lookup. +type txtResult struct { + // Block is the Suffix or Wildcard that this lookup is for. We + // track it here because we need it to provide context in + // validation errors that could happen in future. + Block + // prs is the list of PR numbers that were found in TXT records. + prs []int + // err is the DNS lookup error we got, or nil if the query + // succeeded. + err error +} + +// checkTXT checks the _psl TXT record for one domain. +func (c *txtRecordChecker) checkTXT(block Block, domain domain.Name) txtResult { + log.Printf("Checking TXT for _psl.%s", domain.String()) + // The trailing dot is to prevent search path expansion. + res, err := c.resolver.LookupTXT(c.ctx, "_psl."+domain.ASCIIString()+".") + if err != nil { + return txtResult{ + Block: block, + err: err, + } + } + return txtResult{ + Block: block, + prs: extractPSLRecords(res), + } +} + +// extractPSLRecords extracts github PR numbers from raw TXT +// records. TXT records which do not match the required PSL record +// format (e.g. SPF records, DKIM records, other unrelated +// verification records) are ignored. +func extractPSLRecords(txts []string) []int { + // You might think that since we put PSL records under + // _psl., we would only see well formed PSL + // records. However, a large number of domains return SPF and + // other "well known" TXT records types as well, probably because + // their DNS server is hardcoded to return those for all TXT + // query. So, we have to gracefully ignore "malformed" TXT records + // here. + var ret []int + for _, txt := range txts { + if !strings.HasPrefix(txt, txtRecordPrefix) { + continue + } + for _, f := range strings.Fields(strings.TrimSpace(txt)) { + prStr, ok := strings.CutPrefix(f, txtRecordPrefix) + if !ok { + continue + } + prNum, err := strconv.Atoi(prStr) + if err != nil { + continue + } + + if prNum == 0 { + // At least one _psl record points to a bogus PR + // number 0. Presumably this was a placeholder that + // was never updated. Treat it identically to other + // missing/malformed records. + continue + } + + // Apply special cases where the PR listed in DNS is not + // quite right, but due to procedural issues on the PSL + // side rather than suffix owner error. See exceptions.go + // for more explanation. + prNum = adjustTXTPR(prNum) + + ret = append(ret, prNum) + } + } + return ret +} + +// processLookupResult updates the record checker's state with the +// information in the given txtResult. +func (c *txtRecordChecker) processLookupResult(res txtResult) { + var dnsErr *net.DNSError + if errors.As(res.err, &dnsErr) { + if dnsErr.IsNotFound { + // DNS server returned NXDOMAIN. Use a specific error for + // that. + c.errs = append(c.errs, ErrMissingTXTRecord{res.Block}) + } else { + // Other DNS errors, e.g. SERVFAIL or REFUSED. + c.errs = append(c.errs, ErrTXTCheckFailure{res.Block, res.err}) + } + return + } else if res.err != nil { + // Other non-DNS errors, e.g. timeout or "no route to host". + c.errs = append(c.errs, ErrTXTCheckFailure{res.Block, res.err}) + return + } + if len(res.prs) == 0 { + // _psl. has TXT records, but none of them look like + // valid PSL records. Behave the same as NXDOMAIN. + c.errs = append(c.errs, ErrMissingTXTRecord{res.Block}) + return + } + for _, prNum := range res.prs { + // Found some PRs for this suffix, record them in preparation + // for PR validation. + inf := c.prInfo(prNum) + switch v := res.Block.(type) { + case *Suffix: + inf.suffixes[v.Domain.String()] = v + case *Wildcard: + inf.wildcards[v.Domain.String()] = v + default: + panic("unexpected AST node") + } + } +} + +// checkPRs looks up the given Github PR, and verifies that it changes +// all the suffixes and wildcards provided in info. +func (c *txtRecordChecker) checkPR(prNum int, info *prExpected) []error { + log.Printf("Checking PR %d", prNum) + + // Some PRs have broken state in Github, the API returns nonsense + // information. These cases were verified manually and recorded in + // exceptions.go, so we skip those checks to avoid spurious + // errors. + for _, suf := range info.suffixes { + if acceptPRForDomain(suf.Domain, prNum) { + delete(info.suffixes, suf.Domain.String()) + } + } + if len(info.suffixes) == 0 && len(info.wildcards) == 0 { + return nil + } + + var ret []error + + beforeBs, afterBs, err := c.gh.PSLForPullRequest(c.ctx, prNum) + if err != nil { + for _, suf := range info.suffixes { + ret = append(ret, ErrTXTCheckFailure{suf, err}) + } + for _, wild := range info.wildcards { + ret = append(ret, ErrTXTCheckFailure{wild, err}) + } + return ret + } + + before, _ := Parse(beforeBs) + after, _ := Parse(afterBs) + after.SetBaseVersion(before, true) + + // Look at the changed suffixes/wildcards in the PR, and remove + // those from info. At the end of this, info is either empty (no + // issues, we found all the suffixes we wanted in the correct PR), + // or has leftover suffixes (validation error: a TXT record + // references an unrelated PR). + for _, suf := range BlocksOfType[*Suffix](after) { + if !suf.Changed() { + continue + } + + // delete is a no-op if the key doesn't exist, so we can do + // the deletion unconditionally. + delete(info.suffixes, suf.Domain.String()) + } + + for _, wild := range BlocksOfType[*Wildcard](after) { + if !wild.Changed() { + continue + } + + delete(info.wildcards, wild.Domain.String()) + } + + // Anything left over now is a validation error. + for _, suf := range info.suffixes { + ret = append(ret, ErrTXTRecordMismatch{suf, prNum}) + } + for _, wild := range info.wildcards { + ret = append(ret, ErrTXTRecordMismatch{wild, prNum}) + } + return ret +} + +// prInfo is a helper to get-or-create c.prExpected[pr]. +func (c *txtRecordChecker) prInfo(pr int) *prExpected { + if ret, ok := c.prExpected[pr]; ok { + return ret + } + ret := &prExpected{ + suffixes: map[string]*Suffix{}, + wildcards: map[string]*Wildcard{}, + } + c.prExpected[pr] = ret + return ret +} diff --git a/tools/psltool/psltool.go b/tools/psltool/psltool.go index 981eba4b5..987072795 100644 --- a/tools/psltool/psltool.go +++ b/tools/psltool/psltool.go @@ -14,6 +14,7 @@ import ( "strconv" "strings" "syscall" + "time" "unicode" "github.com/creachadair/command" @@ -166,7 +167,13 @@ func runValidate(env *command.Env, pathOrHash string) error { errs = append(errs, psl.Clean()...) errs = append(errs, parser.ValidateOffline(psl)...) if validateArgs.Online { - // TODO: no online validations implemented yet. + if os.Getenv("PSLTOOL_ALLOW_BROKEN") == "" { + errs = append(errs, fmt.Errorf("refusing to run online validation on the entire PSL, it's currently broken and gets you rate-limited by github. For development, export PSLTOOL_ALLOW_BROKEN=1.")) + } else { + ctx, cancel := context.WithTimeout(env.Context(), 1200*time.Second) + defer cancel() + errs = append(errs, parser.ValidateOnline(ctx, psl)...) + } } clean := psl.MarshalPSL() @@ -214,8 +221,10 @@ func runCheckPR(env *command.Env, prStr string) error { after.SetBaseVersion(before, true) errs = append(errs, after.Clean()...) errs = append(errs, parser.ValidateOffline(after)...) - if validateArgs.Online { - // TODO: no online validations implemented yet. + if checkPRArgs.Online { + ctx, cancel := context.WithTimeout(env.Context(), 300*time.Second) + defer cancel() + errs = append(errs, parser.ValidateOnline(ctx, after)...) } clean := after.MarshalPSL() @@ -249,12 +258,12 @@ func runCheckPR(env *command.Env, prStr string) error { } if l := len(errs); l == 0 { - fmt.Fprintln(env, "PSL file is valid") + fmt.Fprintln(env, "PSL change is valid") return nil } else if l == 1 { - return errors.New("file has 1 error") + return errors.New("change has 1 error") } else { - return fmt.Errorf("file has %d errors", l) + return fmt.Errorf("change has %d errors", l) } }