Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize models URLToString #226

Merged
merged 4 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion pkg/models/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"net/url"
"strings"
"sync"

"github.com/CorentinB/warc/pkg/spooledtempfile"
"github.com/PuerkitoBio/goquery"
Expand All @@ -24,6 +25,9 @@ type URL struct {
mimetype *mimetype.MIME
Hops int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page
Redirects int

stringCache string
once sync.Once
}

func (u *URL) Parse() (err error) {
Expand Down Expand Up @@ -108,7 +112,10 @@ func (u *URL) GetHops() int {
}

// String returns the string form of the parsed URL as produced by
// URLToString. The conversion runs at most once per URL value: the first call
// stores the result in stringCache and every later call returns that stored
// string, so changes made to u.parsed after the first call are not reflected.
func (u *URL) String() string {
	// sync.Once ensures the conversion is executed a single time even when
	// String is invoked from several goroutines simultaneously.
	u.once.Do(func() {
		u.stringCache = URLToString(u.parsed)
	})
	return u.stringCache
}

// URLToString exists to apply some custom stuff, as opposed to simply
Expand Down Expand Up @@ -155,6 +162,7 @@ func encodeQuery(v url.Values) string {
}

var buf strings.Builder

first := true

for k, vs := range v {
Expand Down
62 changes: 49 additions & 13 deletions internal/pkg/utils/url_test.go → pkg/models/url_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
package utils
package models

import (
"sync"
"testing"

"github.com/internetarchive/Zeno/pkg/models"
)

func TestURLToStringPunycode(t *testing.T) {
u := &models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"}
u := &URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"}
err := u.Parse()
if err != nil {
t.Fatalf("error parsing URL: %v", err)
Expand All @@ -16,12 +15,12 @@ func TestURLToStringPunycode(t *testing.T) {
expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"
actual := u.String()
if actual != expected {
t.Fatalf("expected %s, got %s", expected, actual)
t.Fatalf("Expected %s, got %s", expected, actual)
}
}

func TestURLToStringPunycodeWithPort(t *testing.T) {
u := &models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
u := &URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -35,7 +34,7 @@ func TestURLToStringPunycodeWithPort(t *testing.T) {
}

func TestURLToStringUnicodetoIDNA(t *testing.T) {
u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
u := &URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -49,7 +48,7 @@ func TestURLToStringUnicodetoIDNA(t *testing.T) {
}

func TestURLToStringWithPath(t *testing.T) {
u := &models.URL{Raw: "http://παράδειγμα.δοκιμή/Αρχική_σελίδα"}
u := &URL{Raw: "http://παράδειγμα.δοκιμή/Αρχική_σελίδα"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -63,7 +62,7 @@ func TestURLToStringWithPath(t *testing.T) {
}

func TestURLToStringUnicodetoIDNAWithPort(t *testing.T) {
u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
u := &URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -77,7 +76,7 @@ func TestURLToStringUnicodetoIDNAWithPort(t *testing.T) {
}

func TestURLwithIPv6(t *testing.T) {
u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test"}
u := &URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -91,7 +90,7 @@ func TestURLwithIPv6(t *testing.T) {
}

func TestURLwithIPv6WithPort(t *testing.T) {
u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test"}
u := &URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -105,7 +104,7 @@ func TestURLwithIPv6WithPort(t *testing.T) {
}

func TestURLwithSpacesandUnicode(t *testing.T) {
u := &models.URL{Raw: "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中"}
u := &URL{Raw: "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -120,7 +119,7 @@ func TestURLwithSpacesandUnicode(t *testing.T) {

// For technical reasons we are not encoding Reddit URLs.
func TestURLwithRedditOverride(t *testing.T) {
u := &models.URL{Raw: "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"}
u := &URL{Raw: "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"}
err := u.Parse()
if err != nil {
t.Fatalf("Error parsing URL: %v", err)
Expand All @@ -132,3 +131,40 @@ func TestURLwithRedditOverride(t *testing.T) {
t.Fatalf("Expected %s, got %s", expected, actual)
}
}

// TestURLConcurrentAccess calls String on one parsed URL from many goroutines
// at once and verifies that every call returns "https://example.com".
func TestURLConcurrentAccess(t *testing.T) {
	const workers = 100

	target := &URL{Raw: "https://example.com"}
	if err := target.Parse(); err != nil {
		t.Fatalf("Error parsing URL: %v", err)
	}

	// Buffered to the worker count so that no sender ever blocks.
	out := make(chan string, workers)

	var pending sync.WaitGroup
	for n := workers; n > 0; n-- {
		pending.Add(1)
		go func() {
			defer pending.Done()
			out <- target.String()
		}()
	}

	// All senders have finished once Wait returns, so closing is safe and
	// lets the range below terminate.
	pending.Wait()
	close(out)

	collected := make([]string, 0, workers)
	for s := range out {
		collected = append(collected, s)
	}

	if got := len(collected); got != workers {
		t.Fatalf("Expected %d results, got %d", workers, got)
	}

	for i := range collected {
		if collected[i] != "https://example.com" {
			t.Fatalf("Expected https://example.com, got %s", collected[i])
		}
	}
}