From 786e1c035ecdccf4c781480c53c07e991263b2d0 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Fri, 14 Feb 2025 17:01:36 +0100 Subject: [PATCH 1/3] init commit to start the PR --- pkg/models/url.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/models/url.go b/pkg/models/url.go index ee65a2a6..774948eb 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -155,6 +155,7 @@ func encodeQuery(v url.Values) string { } var buf strings.Builder + first := true for k, vs := range v { From 9a6e6605b6e4732ca91d7663145fd8a5cc45bf1f Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 17 Feb 2025 10:56:25 +0100 Subject: [PATCH 2/3] models.url: moved tests from utils package to models package and added a concurrency test for upcoming changes --- .../pkg/utils => pkg/models}/url_test.go | 62 +++++++++++++++---- 1 file changed, 49 insertions(+), 13 deletions(-) rename {internal/pkg/utils => pkg/models}/url_test.go (65%) diff --git a/internal/pkg/utils/url_test.go b/pkg/models/url_test.go similarity index 65% rename from internal/pkg/utils/url_test.go rename to pkg/models/url_test.go index cd777c4e..5ce25162 100644 --- a/internal/pkg/utils/url_test.go +++ b/pkg/models/url_test.go @@ -1,13 +1,12 @@ -package utils +package models import ( + "sync" "testing" - - "github.com/internetarchive/Zeno/pkg/models" ) func TestURLToStringPunycode(t *testing.T) { - u := &models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"} + u := &URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf"} err := u.Parse() if err != nil { t.Fatalf("error parsing URL: %v", err) @@ -16,12 +15,12 @@ func TestURLToStringPunycode(t *testing.T) { expected := "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia/pic/file/map_of_sarlat.pdf" actual := u.String() if actual != expected { - t.Fatalf("expected %s, got %s", expected, actual) + t.Fatalf("Expected %s, got %s", expected, actual) } } func TestURLToStringPunycodeWithPort(t *testing.T) { - u := 
&models.URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + u := &URL{Raw: "https://xn----8sbddjhbicfsohgbg1aeo.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -35,7 +34,7 @@ func TestURLToStringPunycodeWithPort(t *testing.T) { } func TestURLToStringUnicodetoIDNA(t *testing.T) { - u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + u := &URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -49,7 +48,7 @@ func TestURLToStringUnicodetoIDNA(t *testing.T) { } func TestURLToStringWithPath(t *testing.T) { - u := &models.URL{Raw: "http://παράδειγμα.δοκιμή/Αρχική_σελίδα"} + u := &URL{Raw: "http://παράδειγμα.δοκιμή/Αρχική_σελίδα"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -63,7 +62,7 @@ func TestURLToStringWithPath(t *testing.T) { } func TestURLToStringUnicodetoIDNAWithPort(t *testing.T) { - u := &models.URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} + u := &URL{Raw: "https://о-змладйвеклблнозеж.xn--p1ia:8080/pic/file/map_of_sarlat.pdf"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -77,7 +76,7 @@ func TestURLToStringUnicodetoIDNAWithPort(t *testing.T) { } func TestURLwithIPv6(t *testing.T) { - u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test"} + u := &URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]/test"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -91,7 +90,7 @@ func TestURLwithIPv6(t *testing.T) { } func TestURLwithIPv6WithPort(t *testing.T) { - u := &models.URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test"} + u := &URL{Raw: "https://[2600:4040:23c7:a620:3642:ebaa:ab23:735e]:8080/test"} err := u.Parse() if err != nil { t.Fatalf("Error 
parsing URL: %v", err) @@ -105,7 +104,7 @@ func TestURLwithIPv6WithPort(t *testing.T) { } func TestURLwithSpacesandUnicode(t *testing.T) { - u := &models.URL{Raw: "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中"} + u := &URL{Raw: "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ/にじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -120,7 +119,7 @@ func TestURLwithSpacesandUnicode(t *testing.T) { // For technical reasons we are not encoding Reddit URLs. func TestURLwithRedditOverride(t *testing.T) { - u := &models.URL{Raw: "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"} + u := &URL{Raw: "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"} err := u.Parse() if err != nil { t.Fatalf("Error parsing URL: %v", err) @@ -132,3 +131,40 @@ func TestURLwithRedditOverride(t *testing.T) { t.Fatalf("Expected %s, got %s", expected, actual) } } + +func TestURLConcurrentAccess(t *testing.T) { + concurrency := 100 + + u := &URL{Raw: "https://example.com"} + err := u.Parse() + if err != nil { + t.Fatalf("Error parsing URL: %v", err) + } + + resCh := make(chan string, concurrency) + var wg sync.WaitGroup + wg.Add(concurrency) + for i := 0; i < concurrency; i++ { + go func() { + defer wg.Done() + resCh <- u.String() + }() + } + wg.Wait() + close(resCh) + + var res []string + for r := range resCh { + res = append(res, r) + } + + if len(res) != concurrency { + t.Fatalf("Expected %d results, got %d", concurrency, len(res)) + } + + for _, r := range res { + if r != "https://example.com" { + t.Fatalf("Expected 
https://example.com, got %s", r) + } + } +} From 071622679c716709911b35d2779f3f955db34362 Mon Sep 17 00:00:00 2001 From: Thomas FOUBERT Date: Mon, 17 Feb 2025 10:57:37 +0100 Subject: [PATCH 3/3] models.url: implemented @yzqzss's idea to cache the result of URLToString to reduce the number of calls --- pkg/models/url.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/models/url.go b/pkg/models/url.go index 5c25943e..bdbcb0c7 100644 --- a/pkg/models/url.go +++ b/pkg/models/url.go @@ -7,6 +7,7 @@ import ( "net/http" "net/url" "strings" + "sync" "github.com/CorentinB/warc/pkg/spooledtempfile" "github.com/PuerkitoBio/goquery" @@ -24,6 +25,9 @@ type URL struct { mimetype *mimetype.MIME Hops int // This determines the number of hops this item is the result of, a hop is a "jump" from 1 page to another page Redirects int + + stringCache string + once sync.Once } func (u *URL) Parse() (err error) { @@ -108,7 +112,10 @@ func (u *URL) GetHops() int { } func (u *URL) String() string { - return URLToString(u.parsed) + u.once.Do(func() { + u.stringCache = URLToString(u.parsed) + }) + return u.stringCache } // URLToString exists to apply some custom stuff, in opposition of simply