From b430806dccaacd32ad887bc99a9723db918be9fb Mon Sep 17 00:00:00 2001
From: Paul Haedrich
Date: Tue, 6 Aug 2024 10:15:09 +0200
Subject: [PATCH] Fixed restrict crawl

---
 indexbot/spiders/indexbot.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/indexbot/spiders/indexbot.py b/indexbot/spiders/indexbot.py
index 21341eb..97919a1 100644
--- a/indexbot/spiders/indexbot.py
+++ b/indexbot/spiders/indexbot.py
@@ -26,7 +26,9 @@ class IndexbotSpider(CrawlSpider):
 
     # Restrict crawling to the start URLs if RESTRICT_CRAWL is set
     if os.getenv("RESTRICT_CRAWL"):
-        allowed_domains = start_urls
+        allowed_domains = []
+        for url in start_urls:
+            allowed_domains.append(url.replace("https://", "").replace("http://", "").split("/")[0])
 
     #allowed_domains = ["producthunt.com"] # Replace with the target domain(s)
     #start_urls = ["http://producthunt.com"] # Replace with the initial URL(s)