
Commit

Merge pull request #46 from GSA-TTS/jadudm/filter-cleanup
Jadudm/filter cleanup
jadudm authored Dec 27, 2024
2 parents 0726a43 + c10c87f commit c90ebdb
Showing 40 changed files with 30,243 additions and 596 deletions.
13 changes: 13 additions & 0 deletions cmd/fetch/work.go
@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"math/rand/v2"
"net/url"
"regexp"
"strconv"
"strings"
@@ -15,6 +16,7 @@ import (

"github.com/GSA-TTS/jemison/config"
common "github.com/GSA-TTS/jemison/internal/common"
filter "github.com/GSA-TTS/jemison/internal/filtering"
kv "github.com/GSA-TTS/jemison/internal/kv"
"github.com/GSA-TTS/jemison/internal/postgres/work_db"
"github.com/GSA-TTS/jemison/internal/queueing"
@@ -69,6 +71,17 @@ func stripHostToAscii(host string) string {

func (w *FetchWorker) Work(ctx context.Context, job *river.Job[common.FetchArgs]) error {

u := url.URL{
Scheme: job.Args.Scheme,
Host: job.Args.Host,
Path: job.Args.Path,
}

err := filter.IsReject(&u)
if err != nil {
return nil
}

// Have we seen them before?
if Gateway.HostExists(job.Args.Host) {
// If we have, and it is too soon, send them to their queue.
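
The guard above builds a `url.URL` from the job arguments and consults the new filtering package before any network work happens. Because River treats a nil return from `Work` as a completed job, a rejected URL is simply dropped rather than retried. The sketch below pulls that guard out into a standalone helper; the helper name and the `main` wrapper are hypothetical, and only `filter.IsReject` and the `FetchArgs` fields come from the diff.

```go
// Hypothetical sketch, as if it sat alongside cmd/fetch/work.go.
package main

import (
	"fmt"
	"net/url"

	common "github.com/GSA-TTS/jemison/internal/common"
	filter "github.com/GSA-TTS/jemison/internal/filtering"
)

// shouldSkip isolates the early-reject guard added to Work: build the URL
// from the job arguments and ask the filtering package whether to drop it.
func shouldSkip(args common.FetchArgs) bool {
	u := url.URL{
		Scheme: args.Scheme,
		Host:   args.Host,
		Path:   args.Path,
	}
	// IsReject returns a non-nil error when the URL should not be fetched.
	return filter.IsReject(&u) != nil
}

func main() {
	fmt.Println(shouldSkip(common.FetchArgs{Scheme: "https", Host: "example.gov", Path: "/"}))
}
```
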
41 changes: 39 additions & 2 deletions cmd/migrate/search_db/make_inheritence_tables.py
@@ -105,6 +105,41 @@ def migrate_up(tlds, jd64, start_int, end_int):
fqdn, rfqdn, rdomain, tld_nibbles, domain_nibbles, subdomain_nibbles
)

def indexes(tlds, jd64, start_int, end_int):
fp = get_fp()
fp.close()
for tld in tlds:
d64tofqdn = jd64[tld]["Domain64ToFQDN"]
for d64, fqdn in d64tofqdn.items():
d64_int = int(d64, 16)
if d64_int >= start_int and d64_int <= end_int:
tld_nibbles = d64[0:2]
domain_nibbles = d64[2:8]
subdomain_nibbles = d64[8:14]
tld = list(reversed(fqdn.split(".")))[0]
rdomain = "_".join(list(reversed(fqdn.split(".")))[0:2])
rfqdn = "_".join(list(reversed(fqdn.split("."))))
clean_rfqdn = "".join(filter(safe, rfqdn.lower()))

fp = get_fp()
indexes = f"""
-------------------------
{clean_rfqdn} indexes
-------------------------
create index if not exists {clean_rfqdn}_domain64_idx on {clean_rfqdn} (domain64);
create index if not exists {clean_rfqdn}_tag_idx on {clean_rfqdn} (tag);
create index if not exists {clean_rfqdn}_gin_paths_idx on {clean_rfqdn}
using gin (to_tsvector('english', path));
create index if not exists {clean_rfqdn}_gist_paths_idx on {clean_rfqdn}
using gist (to_tsvector('english', path));
create index if not exists {clean_rfqdn}_gin_bodies_idx on {clean_rfqdn}
using gin (to_tsvector('english', content));
-- This uses a new FTS vector column. Pre-compute for speed.
create index if not exists {clean_rfqdn}_fts_idx on {clean_rfqdn} using gin (fts);
"""
fp.write(indexes)
fp.write("\n")
fp.close()
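
The new `indexes` helper slices each Domain64 key with the same boundaries used elsewhere in the script: two hex nibbles for the TLD, six for the domain, and six for the subdomain, and it derives the per-domain table name by reversing the FQDN's labels and joining them with underscores. A small Go rendering of that decomposition follows; the function names and the sample Domain64 value are illustrative, and the `safe`-character filtering step from the Python is omitted because its definition is not part of this diff.

```go
package main

import (
	"fmt"
	"strings"
)

// splitDomain64 mirrors the Python slices d64[0:2], d64[2:8], d64[8:14]:
// 2 TLD nibbles, 6 domain nibbles, 6 subdomain nibbles.
func splitDomain64(d64 string) (tld, domain, subdomain string) {
	return d64[0:2], d64[2:8], d64[8:14]
}

// reversedTableName reverses the FQDN labels and joins them with
// underscores, matching the rfqdn construction above
// (for example, "search.gov" becomes "gov_search").
func reversedTableName(fqdn string) string {
	labels := strings.Split(fqdn, ".")
	for i, j := 0, len(labels)-1; i < j; i, j = i+1, j-1 {
		labels[i], labels[j] = labels[j], labels[i]
	}
	return strings.Join(labels, "_")
}

func main() {
	tld, dom, sub := splitDomain64("0100002a00000000") // hypothetical Domain64 key
	fmt.Println(tld, dom, sub)                         // 01 00002a 000000
	fmt.Println(reversedTableName("search.gov"))       // gov_search
}
```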

def migrate_down(tlds, jd64, start_int, end_int):
fp = get_fp()
@@ -177,8 +212,9 @@ def trigger_function(tlds, jd64, start_int, end_int):
fp.write(
f" {cond} (new.domain64 >= x'{d64}'::bigint and new.domain64 < x'{d64_plus_one}00'::bigint)\n"
)
fp.write(f" then insert into {clean_rfqdn} values (new.*);\n")
fp.write(f" else insert into {tld} values (new.*);\n")
fp.write(f" then insert into {clean_rfqdn} (domain64, path, tag, content) values (new.domain64, new.path, new.tag, new.content);\n")
# fp.write(f" else insert into {tld} values (new.*);\n")
fp.write(f" else insert into {tld} (domain64, path, tag, content) values (new.domain64, new.path, new.tag, new.content);\n")
fp.write(" end if;\n")
fp.write(" return null;\n")
fp.write(" end;\n")
@@ -213,6 +249,7 @@ def main(path, migration_name, start, end):
pass

migrate_up(tlds, jd64, start_int, end_int)
indexes(tlds, jd64, start_int, end_int)
trigger_function(tlds, jd64, start_int, end_int)
migrate_down(tlds, jd64, start_int, end_int)

81 changes: 0 additions & 81 deletions cmd/walk/bug.txt

This file was deleted.

83 changes: 8 additions & 75 deletions cmd/walk/work.go
@@ -1,7 +1,6 @@
package main

import (
"bytes"
"context"
"errors"
"fmt"
@@ -10,6 +9,7 @@ import (
"strings"

common "github.com/GSA-TTS/jemison/internal/common"
filter "github.com/GSA-TTS/jemison/internal/filtering"
"github.com/GSA-TTS/jemison/internal/kv"
"github.com/GSA-TTS/jemison/internal/queueing"
"github.com/GSA-TTS/jemison/internal/util"
@@ -145,67 +145,21 @@ func walk_html(s3json *kv.S3JSON) {
// A set of functions applied that, one at a time, decide if a link should
// be crawled.

func tooManyRepeats(s string, repeatLength int, threshold int) bool {
end := len(s) - repeatLength
chunks := make(map[string]bool)
repeats := make(map[string]int)
for ndx := 0; ndx < end; ndx++ {
piece := s[ndx : ndx+repeatLength]
if _, ok := chunks[piece]; ok {
repeats[piece] = repeats[piece] + 1
} else {
chunks[piece] = true
repeats[piece] = 0
}
}

total := 0
for _, v := range repeats {
total += v
}

return total >= threshold
}

func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
base := url.URL{
Scheme: s3json.GetString("scheme"),
Host: s3json.GetString("host"),
Path: s3json.GetString("path"),
}

// zap.L().Debug("considering the url",
// zap.String("url", link))

// Is the URL at least length 1?
if len(link) < 1 {
return "", errors.New("crawler: URL is too short to crawl")
}

skippable_prefixes := []string{"#", "mailto"}
for _, sp := range skippable_prefixes {
// Skip anything that starts with a #
if strings.HasPrefix(link, sp) {
return "", fmt.Errorf("skipping %s: %s", sp, link)
}
}

// FIXME: These need to become config parameters.
// Does it have a large number of repeats?
// If so, we might be in an infinite loop.
if tooManyRepeats(link, 8, 50) {
return "", fmt.Errorf("too many repeats: %s", link)
}

for _, ext := range []string{"jpg", "jpeg", "png", "tiff", "tif", "gif", "svg", "raw", "psd", "mp3", "mov", "webp", "bmp", "acc", "ogg"} {
if strings.HasSuffix(link, ext) {
return "", fmt.Errorf("ignoring extension: %s", ext)
}
lu, err := url.Parse(link)
if err != nil {
return "", fmt.Errorf("crawler: link does not parse: %s", link)
}

// Does it have a mailto: ? Skip it.
if strings.Contains(link, "mailto:") {
return "", fmt.Errorf("looks like a mailto link: %s", link)
err = filter.IsReject(lu)
if err != nil {
return "", err
}

// Does it reference the root? Resolve it.
@@ -229,16 +183,6 @@ func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
return base.String(), nil
}

lu, err := url.Parse(link)
if err != nil {
return "", fmt.Errorf("crawler: link does not parse: %s", link)
}

// Does it end in .gov?
// if bytes.HasSuffix([]byte(lu.Host), []byte("gov")) {
// return "", errors.New("crawler: URL does not end in .gov")
// }

pieces := strings.Split(base.Host, ".")
if len(pieces) < 2 {
return "", errors.New("crawler: link host has too few pieces")
@@ -251,18 +195,7 @@ func is_crawlable(s3json *kv.S3JSON, link string) (string, error) {
}
}

// FIXME: There seem to be whitespace URLs coming through. I don't know why.
// This could be revisited, as it is expensive.
// Do we still have garbage?
if !bytes.HasPrefix([]byte(lu.String()), []byte("https")) ||
!bytes.HasPrefix([]byte(lu.String()), []byte("http")) {
return "", errors.New("crawler: link does not start with http(s)")
}
// Is it pure whitespace?
if len(strings.Replace(lu.String(), " ", "", -1)) < 5 {
return "", errors.New("crawler: link too short")
}
return lu.String(), nil
return "", fmt.Errorf("could not decide: %s", link)
}
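
The checks deleted above — empty links, `#` and `mailto` prefixes, media-file extensions, repeated-substring loops, and non-`http(s)` schemes — are consolidated behind `filter.IsReject` in `internal/filtering`, which this diff does not show. The sketch below is one way such a rule chain could be structured; apart from the `IsReject(*url.URL) error` signature, which the calling code above establishes, the rule set and helper names are illustrative assumptions, not the package's actual contents.

```go
// Hypothetical sketch of a rule-chain filter; not the repo's implementation.
package filtering

import (
	"fmt"
	"net/url"
	"strings"
)

// A rule returns a non-nil error when the URL should be rejected.
type rule func(u *url.URL) error

// Illustrative rules loosely reconstructing the checks removed from
// cmd/walk/work.go; the real rules live in internal/filtering and may differ.
var rules = []rule{
	func(u *url.URL) error {
		if u.Scheme != "http" && u.Scheme != "https" {
			return fmt.Errorf("filtering: not http(s): %s", u.String())
		}
		return nil
	},
	func(u *url.URL) error {
		for _, ext := range []string{".jpg", ".png", ".gif", ".svg", ".mp3", ".mov"} {
			if strings.HasSuffix(strings.ToLower(u.Path), ext) {
				return fmt.Errorf("filtering: skipped extension: %s", ext)
			}
		}
		return nil
	},
}

// IsReject applies each rule in turn; the first rejection wins, and a nil
// return means the URL may be fetched or crawled.
func IsReject(u *url.URL) error {
	for _, r := range rules {
		if err := r(u); err != nil {
			return err
		}
	}
	return nil
}
```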

func trimSuffix(s, suffix string) string {
78 changes: 0 additions & 78 deletions config/_allowed_hosts.jsonnet

This file was deleted.
