diff --git a/Dockerfile b/Dockerfile
index da229eb..e4bc7eb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,9 @@ RUN pip install -r requirements.txt
 # Copy the rest of the application
 COPY . /app
 
+# Create the output directory
+RUN mkdir -p /app/output
+
 WORKDIR /app/indexbot
 
 # Set the entry point for the container
diff --git a/indexbot/items.py b/indexbot/items.py
index 52c5676..64e8b4a 100644
--- a/indexbot/items.py
+++ b/indexbot/items.py
@@ -11,11 +11,12 @@ class IndexbotItem(scrapy.Item):
     canonical_url = scrapy.Field()
     language = scrapy.Field()
     title = scrapy.Field()
+    content = scrapy.Field()
     meta = scrapy.Field()
     opengraph = scrapy.Field()
     publishing = scrapy.Field()
     headers = scrapy.Field()
-    gen = scrapy.Field()
+    #gen = scrapy.Field()
     metrics = scrapy.Field()
     schema = scrapy.Field()
     status_code = scrapy.Field()
diff --git a/indexbot/pipelines.py b/indexbot/pipelines.py
index 2ff2533..cb72f87 100644
--- a/indexbot/pipelines.py
+++ b/indexbot/pipelines.py
@@ -7,34 +7,36 @@
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
 
+import json
+
 class IndexbotPipeline:
     def process_item(self, item, spider):
         return item
 
-import json
-
 class JsonWriterPipeline:
 
     def open_spider(self, spider):
-        self.file = open('crawled_data.jl', 'w')
+        self.file = open("../output/crawled_data.jl", "w")
 
     def close_spider(self, spider):
         self.file.close()
 
     def process_item(self, item, spider):
-        line = json.dumps(item) + "\n"
+        item_dict = ItemAdapter(item).asdict()
+        line = json.dumps(item_dict) + "\n"
         self.file.write(line)
         return item
 
 class TxtWriterPipeline:
 
     def open_spider(self, spider):
-        self.file = open('crawled_sites.txt', 'w')
+        self.file = open("../output/crawled_sites.txt", "w")
 
     def close_spider(self, spider):
         self.file.close()
 
     def process_item(self, item, spider):
-        line = item["url"] + "\n"
+        item_dict = ItemAdapter(item).asdict()
+        line = item_dict["url"] + "\n"
         self.file.write(line)
         return item
diff --git a/indexbot/settings.py b/indexbot/settings.py
index 1468f01..2ba858b 100644
--- a/indexbot/settings.py
+++ b/indexbot/settings.py
@@ -19,7 +19,7 @@
 ROBOTSTXT_OBEY = True
 
 # Configure job directory for data persistence
-JOBDIR = "crawls/indexbot"
+#JOBDIR = "crawls/indexbot"
 
 # Configure logging
 LOG_LEVEL = "INFO"  # Set the logging level to WARNING or ERROR to reduce output
diff --git a/indexbot/spiders/indexbot.py b/indexbot/spiders/indexbot.py
index f4da303..1666517 100644
--- a/indexbot/spiders/indexbot.py
+++ b/indexbot/spiders/indexbot.py
@@ -1,26 +1,26 @@
 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
-from items import IndexbotItem
-from utils.schema_parser import parse_schema
-from utils.text_processing import extract_keywords, generate_summary
-import json
 import time
 import logging
 import requests
 
+from indexbot.items import IndexbotItem
+from indexbot.utils.schema_parser import parse_schema
+from indexbot.utils.text_processing import extract_keywords, generate_summary
+
 
 class IndexbotSpider(CrawlSpider):
     name = "indexbot"
 
     # Load start URLs from sources.txt
-    logging.INFO("Requesting sources.txt")
+    #logging.log(logging.INFO, "Requesting sources.txt")
     start_urls = []
     res = requests.get("https://data.openwebindex.org/indexes/metaindex-alpha/sources.txt")
     for line in res.text.split("\n"):
         if line.strip() and not line.startswith("#"):
             start_urls.append(line.strip())
-    logging.INFO(f"Loaded {len(start_urls)} URLs from sources.txt")
+    #logging.log(logging.INFO, f"Loaded {len(start_urls)} start URLs from sources.txt")
 
     #allowed_domains = ["producthunt.com"]  # Replace with the target domain(s)
     #start_urls = ["http://producthunt.com"]  # Replace with the initial URL(s)
@@ -31,12 +31,13 @@ class IndexbotSpider(CrawlSpider):
 
     def parse_item(self, response):
         # Extract all paragraph content
-        paragraphs = response.xpath('//p//text()').getall()
-        content = ' '.join(paragraphs)
+        paragraphs = response.xpath("//p//text()").getall()
+        content = ' '.join(paragraphs).strip()
+        content = content[:2000] + "..."  # limit content to 2000 characters
 
-        # Generate RAKE keywords and Gensim summary
-        keywords = extract_keywords(content)
-        summary = generate_summary(content, word_count=100)
+        # Generate RAKE keywords and SUMY summary
+        #keywords = extract_keywords(content)
+        #summary = generate_summary(content, sentences=3)
 
         # Parse schema data
         schema_data = parse_schema(response)
@@ -46,6 +47,7 @@ def parse_item(self, response):
             canonical_url = response.xpath("//link[@rel='canonical']/@href").get(),  # Canonical URL
             language = response.xpath("//html/@lang").get(),  # Language of the page
             title = response.xpath("//title/text()").get(),  # Page title
+            content = content,  # Page content
             meta = {
                 "description": response.xpath("//meta[@name='description']/@content").get(),  # Meta description
                 "keywords": response.xpath("//meta[@name='keywords']/@content").get(),  # Meta keywords
@@ -65,10 +67,10 @@ def parse_item(self, response):
                 "content_length": response.headers.get("Content-Length", b"").decode("utf-8"),  # Content-Length header
                 "server": response.headers.get("Server", b"").decode("utf-8"),  # Server header
             },
-            gen = {
-                "keywords": keywords,  # RAKE keywords
-                "summary": summary,  # Gensim summary
-            },
+            #gen = {
+            #    "keywords": keywords,  # RAKE keywords
+            #    "summary": summary,  # SUMY summary
+            #},
             metrics={
                 "content_length": len(response.text),  # Length of the page content
                 "internal_links": len(response.xpath("//a[starts-with(@href, '/')]/@href").getall()),  # Number of internal links
diff --git a/indexbot/utils/text_processing.py b/indexbot/utils/text_processing.py
index 5a342cc..2fc58db 100644
--- a/indexbot/utils/text_processing.py
+++ b/indexbot/utils/text_processing.py
@@ -1,5 +1,10 @@
+import nltk
 from rake_nltk import Rake
-from gensim.summarization import summarize
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lsa import LsaSummarizer
+
+nltk.download("stopwords")
 
 def extract_keywords(content):
     rake = Rake()
@@ -9,8 +14,12 @@
     except:
         return []
 
-def generate_summary(content, word_count=100):
+def generate_summary(content, sentences=3):
     try:
-        return summarize(content, word_count=word_count)
-    except ValueError:
-        return "Summary could not be generated."
+        # Parse the text, run the LSA summarizer, and join the selected sentences
+        parser = PlaintextParser.from_string(content, Tokenizer("english"))
+        summarizer = LsaSummarizer()
+        summary = summarizer(parser.document, sentences)
+        return " ".join(str(sentence) for sentence in summary)
+    except:
+        return None
diff --git a/requirements.txt b/requirements.txt
index b5763bb..48a77e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 scrapy
+nltk
 rake_nltk
-gensim
\ No newline at end of file
+sumy
\ No newline at end of file
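
The pipeline changes serialize items through ItemAdapter before writing them out. A minimal standalone sketch of why that step is needed, using a hypothetical ExampleItem rather than the project's IndexbotItem: json.dumps rejects a scrapy.Item directly, while ItemAdapter(item).asdict() yields a plain dict it can handle, which is what JsonWriterPipeline.process_item now relies on.

import json

import scrapy
from itemadapter import ItemAdapter

class ExampleItem(scrapy.Item):
    # Hypothetical item with the two fields the pipelines touch
    url = scrapy.Field()
    title = scrapy.Field()

item = ExampleItem(url="https://example.com", title="Example")

# json.dumps(item) would raise TypeError: ExampleItem is not JSON serializable.
# ItemAdapter exposes the item as a plain mapping that json can serialize.
item_dict = ItemAdapter(item).asdict()
print(json.dumps(item_dict))  # e.g. {"url": "https://example.com", "title": "Example"}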
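
The patch does not show how JsonWriterPipeline and TxtWriterPipeline are enabled. If they are wired up the usual Scrapy way, settings.py would carry an ITEM_PIPELINES entry along the lines of the sketch below; the class paths and priority numbers are assumptions based on the repository layout, not taken from the diff.

# Assumed wiring in indexbot/settings.py (not part of this diff); lower numbers run first.
ITEM_PIPELINES = {
    "indexbot.pipelines.JsonWriterPipeline": 300,
    "indexbot.pipelines.TxtWriterPipeline": 400,
}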
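
The text_processing.py rewrite swaps gensim's extractive summarize for sumy's LSA summarizer. Below is a standalone sketch of the same call pattern, assuming sumy's PlaintextParser/Tokenizer/LsaSummarizer API and assuming NLTK's punkt sentence-tokenizer data is available in addition to the stopwords download added in the patch; the summarizer returns sumy Sentence objects, which are joined back into a single string.

import nltk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Assumption: sumy's English tokenizer needs punkt alongside the stopwords corpus.
nltk.download("punkt")
nltk.download("stopwords")

def summarize_text(content, sentences=3):
    # Parse plain text, run LSA, and join the selected sentences into one string.
    parser = PlaintextParser.from_string(content, Tokenizer("english"))
    summarizer = LsaSummarizer()
    return " ".join(str(sentence) for sentence in summarizer(parser.document, sentences))

sample = ("Scrapy crawls pages and hands each response to parse_item. "
          "The spider extracts paragraph text, metadata, and schema data. "
          "Pipelines then write the items to JSON Lines and plain-text files.")
print(summarize_text(sample, sentences=1))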