Skip to content

Commit

Permalink
Do not remove "term" parameter from urbandictionary.com urls
Browse files Browse the repository at this point in the history
  • Loading branch information
emwalker committed May 30, 2020
1 parent 035c2d5 commit e8bb81a
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 47 deletions.
107 changes: 63 additions & 44 deletions cmd/frontend/services/pageinfo/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ type URL struct {
Sha1 string
}

// urlSpec describes site-specific URL normalization: which query
// parameters, if any, survive canonicalization for a given domain.
type urlSpec struct {
	// suffix is matched against the end of a URL's host, so
	// "youtube.com" also covers subdomains like "www.youtube.com".
	suffix string
	// keepParams lists the query parameters to retain; every other
	// parameter is stripped.  An empty list drops the whole query.
	keepParams []string
}

const normalizationFlags = pl.FlagRemoveDefaultPort |
pl.FlagDecodeDWORDHost |
pl.FlagDecodeOctalHost |
Expand All @@ -30,24 +35,6 @@ const normalizationFlags = pl.FlagRemoveDefaultPort |
pl.FlagSortQuery

var (
omitQuerySites = []string{
"amazon.com",
"theatlantic.com",
"businessinsider.com",
"dictionary.com",
"independent.co.uk",
"motherjones.com",
"newyorker.com",
"nymag.com",
"nytimes.com",
"reuters.com",
"scientificamerican.com",
"thedailybeast.com",
"theguardian.com",
"thehill.com",
"twitter.com",
}

omitFields = []string{
"fbclid",
"mbid",
Expand All @@ -63,8 +50,50 @@ var (
"https",
"ssh",
}

urlSpecs = []urlSpec{
urlSpec{suffix: "youtube.com", keepParams: []string{"v"}},
urlSpec{suffix: "urbandictionary.com", keepParams: []string{"term"}},
urlSpec{suffix: "amazon.com"},
urlSpec{suffix: "businessinsider.com"},
urlSpec{suffix: "dictionary.com"},
urlSpec{suffix: "independent.co.uk"},
urlSpec{suffix: "motherjones.com"},
urlSpec{suffix: "newyorker.com"},
urlSpec{suffix: "nymag.com"},
urlSpec{suffix: "nytimes.com"},
urlSpec{suffix: "reuters.com"},
urlSpec{suffix: "scientificamerican.com"},
urlSpec{suffix: "theatlantic.com"},
urlSpec{suffix: "thedailybeast.com"},
urlSpec{suffix: "theguardian.com"},
urlSpec{suffix: "thehill.com"},
urlSpec{suffix: "twitter.com"},
}
)

// matchesHost reports whether host ends with this spec's domain suffix.
func (s *urlSpec) matchesHost(host string) bool {
	offset := len(host) - len(s.suffix)
	if offset < 0 {
		return false
	}
	return host[offset:] == s.suffix
}

// normalizeUrl renders parsed as a string with every query parameter
// not listed in s.keepParams removed.  The caller's URL is left
// untouched: the rewritten query is applied to a shallow copy, which
// resolves the earlier FIXME about mutating the input argument.
func (s *urlSpec) normalizeUrl(parsed *url.URL) string {
	query := parsed.Query()

Loop:
	for queryParam := range query {
		for _, keepParam := range s.keepParams {
			if queryParam == keepParam {
				continue Loop
			}
		}
		// Deleting during range is safe for Go maps.
		query.Del(queryParam)
	}

	// Shallow copy so parsed itself is not modified; we only change
	// RawQuery, so sharing the User pointer is harmless.
	copied := *parsed
	copied.RawQuery = query.Encode()
	return copied.String()
}

// IsURL returns true if a string parses as a URL and false otherwise.
func IsURL(str string) bool {
parsed, err := url.ParseRequestURI(str)
Expand All @@ -79,15 +108,6 @@ func IsURL(str string) bool {
return false
}

// removeQueryAndAnchor reports whether the entire query string should be
// dropped for the parsed URL's host.  It returns true when the host
// matches an entry in omitQuerySites.  NOTE(review): despite the name,
// this function only decides; the actual removal happens at the caller —
// confirm before relying on "anchor" semantics here.
func removeQueryAndAnchor(parsed *url.URL) bool {
	for _, host := range omitQuerySites {
		// Suffix match so subdomains such as "www." are covered too.
		if strings.HasSuffix(parsed.Host, host) {
			return true
		}
	}
	return false
}

func stripFragment(parsed *url.URL) bool {
return !strings.HasSuffix(parsed.Host, "mail.google.com")
}
Expand All @@ -106,6 +126,15 @@ func removeQueryParam(field string) bool {
return false
}

// urlSpecFor returns the spec whose domain suffix matches host, or nil
// when no site-specific normalization applies.
func urlSpecFor(host string) *urlSpec {
	// Index the slice directly instead of taking the address of the
	// range loop's iteration variable: pre-Go 1.22 that variable is
	// reused across iterations, and &spec would also force a heap copy.
	// Returning &urlSpecs[i] points at the canonical entry itself.
	for i := range urlSpecs {
		if urlSpecs[i].matchesHost(host) {
			return &urlSpecs[i]
		}
	}
	return nil
}

// NewURL returns a URL with a canonicalized form and a SHA1.
func NewURL(providedURL string) (*URL, error) {
value, err := NormalizeURL(providedURL)
Expand All @@ -123,26 +152,16 @@ func NewURL(providedURL string) (*URL, error) {

// NormalizeURL normalizes a url before it is stored in the database.
func NormalizeURL(rawURL string) (*URL, error) {
parsed, err := url.Parse(rawURL)
copiedURL := rawURL
parsed, err := url.Parse(copiedURL)
if err != nil {
return nil, err
}

if removeQueryAndAnchor(parsed) {
parsed.RawQuery = ""
rawURL = parsed.String()
} else if strings.HasSuffix(parsed.Host, "youtube.com") {
query := parsed.Query()
spec := urlSpecFor(parsed.Host)

for key := range query {
if key == "v" {
continue
}
query.Del(key)
}

parsed.RawQuery = query.Encode()
rawURL = parsed.String()
if spec != nil {
copiedURL = spec.normalizeUrl(parsed)
} else {
query := parsed.Query()

Expand All @@ -153,15 +172,15 @@ func NormalizeURL(rawURL string) (*URL, error) {
}

parsed.RawQuery = query.Encode()
rawURL = parsed.String()
copiedURL = parsed.String()
}

flags := normalizationFlags
if stripFragment(parsed) {
flags |= pl.FlagRemoveFragment
}

canonical, err := pl.NormalizeURLString(rawURL, flags)
canonical, err := pl.NormalizeURLString(copiedURL, flags)
if err != nil {
return nil, err
}
Expand Down
5 changes: 5 additions & 0 deletions cmd/frontend/services/pageinfo/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ func TestNormalizeURL(t *testing.T) {
inputURL: "https://www.thedailybeast.com/how-natural-news-became-a-conspiracy-hub-rivaling-infowars?someparam",
canonicalURL: "https://www.thedailybeast.com/how-natural-news-became-a-conspiracy-hub-rivaling-infowars",
},
{
name: "An Urban Dictionary definition",
inputURL: "https://www.urbandictionary.com/define.php?term=Vote%20from%20the%20rooftops",
canonicalURL: "https://www.urbandictionary.com/define.php?term=Vote+from+the+rooftops",
},
}

for _, testCase := range testCases {
Expand Down
4 changes: 2 additions & 2 deletions k8s/cluster/frontend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ spec:
envFrom:
- secretRef:
name: digraph-secrets
image: emwalker/digraph-api:v222
image: emwalker/digraph-api:v223
imagePullPolicy: Always
ports:
- containerPort: 8080
Expand All @@ -65,7 +65,7 @@ spec:
envFrom:
- secretRef:
name: digraph-secrets
image: emwalker/digraph-node:v222
image: emwalker/digraph-node:v223
imagePullPolicy: Always
ports:
- containerPort: 3001
Expand Down
2 changes: 1 addition & 1 deletion k8s/release
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v222
v223

0 comments on commit e8bb81a

Please sign in to comment.