Skip to content

Commit

Permalink
Added seed URL and crawl restriction options
Browse files Browse the repository at this point in the history
  • Loading branch information
berrysauce committed Aug 5, 2024
1 parent 05616ec commit 63aba92
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions indexbot/spiders/indexbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,32 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

import logging
import os
import time
from urllib.parse import urlparse

import requests

from indexbot.items import IndexbotItem
from indexbot.utils.schema_parser import parse_schema
from indexbot.utils.text_processing import extract_keywords, generate_summary
#from indexbot.utils.text_processing import extract_keywords, generate_summary

class IndexbotSpider(CrawlSpider):
    """Broad crawler that seeds its start URLs from a remote seed list.

    The seed-list URL is taken from the ``SEED_URL`` environment variable
    when set; otherwise a default hosted seed file is fetched.  Setting
    ``RESTRICT_CRAWL`` limits crawling to the hosts of the seed URLs.
    """

    name = "indexbot"

    # Load start URLs from the seed list (SEED_URL env var overrides the default).
    start_urls = []
    seed_url = os.getenv("SEED_URL")
    try:
        if not seed_url:
            # No override configured; fall through to the default seed list.
            raise ValueError("SEED_URL is not set")
        res = requests.get(seed_url, timeout=30)
        # Fall back on HTTP errors (4xx/5xx) too, not only connection
        # failures — otherwise a 404 error page would be parsed as seeds.
        res.raise_for_status()
    except (ValueError, requests.RequestException):
        # Narrow except (instead of a bare `except:`) so SystemExit and
        # KeyboardInterrupt are not silently swallowed here.
        res = requests.get(
            "https://cdn.data.openwebindex.org/seeds/default/seed.txt",
            timeout=30,
        )
    for line in res.text.split("\n"):
        # Skip blank lines and '#' comment lines in the seed file.
        if line.strip() and not line.startswith("#"):
            start_urls.append(line.strip())

    # Restrict crawling to the seed hosts if RESTRICT_CRAWL is set.
    # Scrapy's allowed_domains expects bare domain names, not full URLs,
    # so extract the host part of each seed URL (deduplicated, order kept).
    if os.getenv("RESTRICT_CRAWL"):
        allowed_domains = list(dict.fromkeys(
            urlparse(url).netloc for url in start_urls
        ))

Expand Down

0 comments on commit 63aba92

Please sign in to comment.