
Commit

Merge pull request #46 from GSA-TTS/jadudm/filter-cleanup
Jadudm/filter cleanup
jadudm authored Dec 27, 2024
2 parents 0726a43 + c10c87f commit c90ebdb
Showing 40 changed files with 30,243 additions and 596 deletions.
13 changes: 13 additions & 0 deletions cmd/fetch/work.go
@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"math/rand/v2"
"net/url"
"regexp"
"strconv"
"strings"
@@ -15,6 +16,7 @@ import (

"github.com/GSA-TTS/jemison/config"
common "github.com/GSA-TTS/jemison/internal/common"
filter "github.com/GSA-TTS/jemison/internal/filtering"
kv "github.com/GSA-TTS/jemison/internal/kv"
"github.com/GSA-TTS/jemison/internal/postgres/work_db"
"github.com/GSA-TTS/jemison/internal/queueing"
@@ -69,6 +71,17 @@ func stripHostToAscii(host string) string {

func (w *FetchWorker) Work(ctx context.Context, job *river.Job[common.FetchArgs]) error {

u := url.URL{
Scheme: job.Args.Scheme,
Host: job.Args.Host,
Path: job.Args.Path,
}

err := filter.IsReject(&u)
if err != nil {
return nil
}

// Have we seen them before?
if Gateway.HostExists(job.Args.Host) {
// If we have, and it is too soon, send them to their queue.
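
The guard above builds a `url.URL` from the job arguments and consults the new filtering package before any network work happens. Because River treats a nil return from `Work` as a completed job, a rejected URL is simply dropped rather than retried. The sketch below pulls that guard out into a standalone helper; the helper name and the `main` wrapper are hypothetical, and only `filter.IsReject` and the `FetchArgs` fields come from the diff.

```go
// Hypothetical sketch, as if it sat alongside cmd/fetch/work.go.
package main

import (
	"fmt"
	"net/url"

	common "github.com/GSA-TTS/jemison/internal/common"
	filter "github.com/GSA-TTS/jemison/internal/filtering"
)

// shouldSkip isolates the early-reject guard added to Work: build the URL
// from the job arguments and ask the filtering package whether to drop it.
func shouldSkip(args common.FetchArgs) bool {
	u := url.URL{
		Scheme: args.Scheme,
		Host:   args.Host,
		Path:   args.Path,
	}
	// IsReject returns a non-nil error when the URL should not be fetched.
	return filter.IsReject(&u) != nil
}

func main() {
	fmt.Println(shouldSkip(common.FetchArgs{Scheme: "https", Host: "example.gov", Path: "/"}))
}
```
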
41 changes: 39 additions & 2 deletions cmd/migrate/search_db/make_inheritence_tables.py
@@ -105,6 +105,41 @@ def migrate_up(tlds, jd64, start_int, end_int):
fqdn, rfqdn, rdomain, tld_nibbles, domain_nibbles, subdomain_nibbles
)

def indexes(tlds, jd64, start_int, end_int):
fp = get_fp()
fp.close()
for tld in tlds:
d64tofqdn = jd64[tld]["Domain64ToFQDN"]
for d64, fqdn in d64tofqdn.items():
d64_int = int(d64, 16)
if d64_int >= start_int and d64_int <= end_int:
tld_nibbles = d64[0:2]
domain_nibbles = d64[2:8]
subdomain_nibbles = d64[8:14]
tld = list(reversed(fqdn.split(".")))[0]
rdomain = "_".join(list(reversed(fqdn.split(".")))[0:2])
rfqdn = "_".join(list(reversed(fqdn.split("."))))
clean_rfqdn = "".join(filter(safe, rfqdn.lower()))

fp = get_fp()
indexes = f"""
-------------------------
{clean_rfqdn} indexes
-------------------------
create index if not exists {clean_rfqdn}_domain64_idx on {clean_rfqdn} (domain64);
create index if not exists {clean_rfqdn}_tag_idx on {clean_rfqdn} (tag);
create index if not exists {clean_rfqdn}_gin_paths_idx on {clean_rfqdn}
using gin (to_tsvector('english', path));
create index if not exists {clean_rfqdn}_gist_paths_idx on {clean_rfqdn}
using gist (to_tsvector('english', path));
create index if not exists {clean_rfqdn}_gin_bodies_idx on {clean_rfqdn}
using gin (to_tsvector('english', content));
-- This uses a new FTS vector column. Pre-compute for speed.
create index if not exists {clean_rfqdn}_fts_idx on {clean_rfqdn} using gin (fts);
"""
fp.write(indexes)
fp.write("\n")
fp.close()
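
The new `indexes` helper slices each Domain64 key with the same boundaries used elsewhere in the script: two hex nibbles for the TLD, six for the domain, and six for the subdomain, and it derives the per-domain table name by reversing the FQDN's labels and joining them with underscores. A small Go rendering of that decomposition follows; the function names and the sample Domain64 value are illustrative, and the `safe`-character filtering step from the Python is omitted because its definition is not part of this diff.

```go
package main

import (
	"fmt"
	"strings"
)

// splitDomain64 mirrors the Python slices d64[0:2], d64[2:8], d64[8:14]:
// 2 TLD nibbles, 6 domain nibbles, 6 subdomain nibbles.
func splitDomain64(d64 string) (tld, domain, subdomain string) {
	return d64[0:2], d64[2:8], d64[8:14]
}

// reversedTableName reverses the FQDN labels and joins them with
// underscores, matching the rfqdn construction above
// (for example, "search.gov" becomes "gov_search").
func reversedTableName(fqdn string) string {
	labels := strings.Split(fqdn, ".")
	for i, j := 0, len(labels)-1; i < j; i, j = i+1, j-1 {
		labels[i], labels[j] = labels[j], labels[i]
	}
	return strings.Join(labels, "_")
}

func main() {
	tld, dom, sub := splitDomain64("0100002a00000000") // hypothetical Domain64 key
	fmt.Println(tld, dom, sub)                         // 01 00002a 000000
	fmt.Println(reversedTableName("search.gov"))       // gov_search
}
```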

def migrate_down(tlds, jd64, start_int, end_int):
fp = get_fp()
@@ -177,8 +212,9 @@ def trigger_function(tlds, jd64, start_int, end_int):
fp.write(
f" {cond} (new.domain64 >= x'{d64}'::bigint and new.domain64 < x'{d64_plus_one}00'::bigint)\n"
)
fp.write(f" then insert into {clean_rfqdn} values (new.*);\n")
fp.write(f" else insert into {tld} values (new.*);\n")
fp.write(f" then insert into {clean_rfqdn} (domain64, path, tag, content) values (new.domain64, new.path, new.tag, new.content);\n")
# fp.write(f" else insert into {tld} values (new.*);\n")
fp.write(f" else insert into {tld} (domain64, path, tag, content) values (new.domain64, new.path, new.tag, new.content);\n")
fp.write(" end if;\n")
fp.write(" return null;\n")
fp.write(" end;\n")
@@ -213,6 +249,7 @@ def main(path, migration_name, start, end):
pass

migrate_up(tlds, jd64, start_int, end_int)
indexes(tlds, jd64, start_int, end_int)
trigger_function(tlds, jd64, start_int, end_int)
migrate_down(tlds, jd64, start_int, end_int)

81 changes: 0 additions & 81 deletions cmd/walk/bug.txt

This file was deleted.

83 changes: 8 additions & 75 deletions cmd/walk/work.go
@@ -1,7 +1,6 @@
package main

import (
"bytes"
"context"
"errors"
"fmt"
@@ -10,6 +9,7 @@ import (
"strings"

common "github.com/GSA-TTS/jemison/internal/common"
filter "github.com/GSA-TTS/jemison/internal/filtering"
"github.com/GSA-TTS/jemison/internal/kv"
"github.com/GSA-TTS/jemison/internal/queueing"
"github.com/GSA-TTS/jemison/internal/util"
@@ -145,67 +145,21 @@ func walk_html(s3json *kv.S3JSON) {
// A set of functions applied that, one at a time, decide if a link should
// be crawled.

func tooManyRepeats(s string, repeatLength int, threshold int) bool {
end := len(s) - repeatLength
chunks := make(map[string]bool)
repeats := make(map[string]int)
for ndx := 0; ndx < end; ndx++ {
piece := s[ndx : ndx+repeatLength]
if _, ok := chunks[piece]; ok {
repeats[piece] = repeats[piece] + 1
} else {
chunks[piece] = true
repeats[piece] = 0
}
}

total := 0
for _, v := range repeats {
total += v
}

return total >= threshold
}

func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
base := url.URL{
Scheme: s3json.GetString("scheme"),
Host: s3json.GetString("host"),
Path: s3json.GetString("path"),
}

// zap.L().Debug("considering the url",
// zap.String("url", link))

// Is the URL at least length 1?
if len(link) < 1 {
return "", errors.New("crawler: URL is too short to crawl")
}

skippable_prefixes := []string{"#", "mailto"}
for _, sp := range skippable_prefixes {
// Skip anything that starts with a #
if strings.HasPrefix(link, sp) {
return "", fmt.Errorf("skipping %s: %s", sp, link)
}
}

// FIXME: These need to become config parameters.
// Does it have a large number of repeats?
// If so, we might be in an infinite loop.
if tooManyRepeats(link, 8, 50) {
return "", fmt.Errorf("too many repeats: %s", link)
}

for _, ext := range []string{"jpg", "jpeg", "png", "tiff", "tif", "gif", "svg", "raw", "psd", "mp3", "mov", "webp", "bmp", "acc", "ogg"} {
if strings.HasSuffix(link, ext) {
return "", fmt.Errorf("ignoring extension: %s", ext)
}
lu, err := url.Parse(link)
if err != nil {
return "", fmt.Errorf("crawler: link does not parse: %s", link)
}

// Does it have a mailto: ? Skip it.
if strings.Contains(link, "mailto:") {
return "", fmt.Errorf("looks like a mailto link: %s", link)
err = filter.IsReject(lu)
if err != nil {
return "", err
}

// Does it reference the root? Resolve it.
@@ -229,16 +183,6 @@ func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
return base.String(), nil
}

lu, err := url.Parse(link)
if err != nil {
return "", fmt.Errorf("crawler: link does not parse: %s", link)
}

// Does it end in .gov?
// if bytes.HasSuffix([]byte(lu.Host), []byte("gov")) {
// return "", errors.New("crawler: URL does not end in .gov")
// }

pieces := strings.Split(base.Host, ".")
if len(pieces) < 2 {
return "", errors.New("crawler: link host has too few pieces")
@@ -251,18 +195,7 @@ func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
}
}

// FIXME: There seem to be whitespace URLs coming through. I don't know why.
// This could be revisited, as it is expensive.
// Do we still have garbage?
if !bytes.HasPrefix([]byte(lu.String()), []byte("https")) ||
!bytes.HasPrefix([]byte(lu.String()), []byte("http")) {
return "", errors.New("crawler: link does not start with http(s)")
}
// Is it pure whitespace?
if len(strings.Replace(lu.String(), " ", "", -1)) < 5 {
return "", errors.New("crawler: link too short")
}
return lu.String(), nil
return "", fmt.Errorf("could not decide: %s", link)
}
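
The checks deleted above — empty links, `#` and `mailto` prefixes, media-file extensions, repeated-substring loops, and non-`http(s)` schemes — are consolidated behind `filter.IsReject` in `internal/filtering`, which this diff does not show. The sketch below is one way such a rule chain could be structured; apart from the `IsReject(*url.URL) error` signature, which the calling code above establishes, the rule set and helper names are illustrative assumptions, not the package's actual contents.

```go
// Hypothetical sketch of a rule-chain filter; not the repo's implementation.
package filtering

import (
	"fmt"
	"net/url"
	"strings"
)

// A rule returns a non-nil error when the URL should be rejected.
type rule func(u *url.URL) error

// Illustrative rules loosely reconstructing the checks removed from
// cmd/walk/work.go; the real rules live in internal/filtering and may differ.
var rules = []rule{
	func(u *url.URL) error {
		if u.Scheme != "http" && u.Scheme != "https" {
			return fmt.Errorf("filtering: not http(s): %s", u.String())
		}
		return nil
	},
	func(u *url.URL) error {
		for _, ext := range []string{".jpg", ".png", ".gif", ".svg", ".mp3", ".mov"} {
			if strings.HasSuffix(strings.ToLower(u.Path), ext) {
				return fmt.Errorf("filtering: skipped extension: %s", ext)
			}
		}
		return nil
	},
}

// IsReject applies each rule in turn; the first rejection wins, and a nil
// return means the URL may be fetched or crawled.
func IsReject(u *url.URL) error {
	for _, r := range rules {
		if err := r(u); err != nil {
			return err
		}
	}
	return nil
}
```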

func trimSuffix(s, suffix string) string {
78 changes: 0 additions & 78 deletions config/_allowed_hosts.jsonnet

This file was deleted.
