Commit 9ab84f8

Fixed errors and disabled gen for now

berrysauce committed Aug 5, 2024
1 parent d0633c4 commit 9ab84f8

Showing 7 changed files with 46 additions and 29 deletions.
3 changes: 3 additions & 0 deletions Dockerfile

@@ -15,6 +15,9 @@ RUN pip install -r requirements.txt
# Copy the rest of the application
COPY . /app

+# Create the output directory
+RUN mkdir -p /app/output
+
WORKDIR /app/indexbot

# Set the entry point for the container
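
The new directory lines up with the relative output paths introduced in indexbot/pipelines.py below: with the working directory set to /app/indexbot, a file opened at "../output/crawled_data.jl" resolves into the /app/output directory created here. A minimal standalone sketch of that path arithmetic (the paths are taken from this commit, the script itself is illustrative):

from pathlib import Path

# With the container's working directory at /app/indexbot, the relative
# path used by the JSON pipeline resolves into the directory created above.
workdir = Path("/app/indexbot")
target = (workdir / "../output/crawled_data.jl").resolve()
print(target)  # /app/output/crawled_data.jl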
3 changes: 2 additions & 1 deletion indexbot/items.py

@@ -11,11 +11,12 @@ class IndexbotItem(scrapy.Item):
    canonical_url = scrapy.Field()
    language = scrapy.Field()
    title = scrapy.Field()
+    content = scrapy.Field()
    meta = scrapy.Field()
    opengraph = scrapy.Field()
    publishing = scrapy.Field()
    headers = scrapy.Field()
-    gen = scrapy.Field()
+    #gen = scrapy.Field()
    metrics = scrapy.Field()
    schema = scrapy.Field()
    status_code = scrapy.Field()
14 changes: 8 additions & 6 deletions indexbot/pipelines.py

@@ -7,34 +7,36 @@
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

+import json


class IndexbotPipeline:
    def process_item(self, item, spider):
        return item

-import json

class JsonWriterPipeline:
    def open_spider(self, spider):
-        self.file = open('crawled_data.jl', 'w')
+        self.file = open("../output/crawled_data.jl", "w")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
-        line = json.dumps(item) + "\n"
+        item_dict = ItemAdapter(item).asdict()
+        line = json.dumps(item_dict) + "\n"
        self.file.write(line)
        return item


class TxtWriterPipeline:
    def open_spider(self, spider):
-        self.file = open('crawled_sites.txt', 'w')
+        self.file = open("../output/crawled_sites.txt", "w")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
-        line = item["url"] + "\n"
+        item_dict = ItemAdapter(item).asdict()
+        line = item_dict["url"] + "\n"
        self.file.write(line)
        return item
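
The ItemAdapter(item).asdict() conversion is what makes the JSON pipeline work: a scrapy.Item is a mapping type but not a plain dict, so json.dumps() rejects it, while the asdict() form serializes cleanly. A minimal standalone sketch (the two fields here are illustrative, not the full IndexbotItem):

import json

import scrapy
from itemadapter import ItemAdapter

class ExampleItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()

item = ExampleItem(url="https://example.org", title="Example")

# json.dumps(item) would raise a TypeError because the Item is not a plain
# dict; converting through ItemAdapter first gives json something it accepts.
line = json.dumps(ItemAdapter(item).asdict()) + "\n"
print(line)

The same conversion in TxtWriterPipeline is mostly defensive: item_dict["url"] behaves like item["url"], but it keeps both pipelines working for any item type the adapter supports.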
2 changes: 1 addition & 1 deletion indexbot/settings.py

@@ -19,7 +19,7 @@
ROBOTSTXT_OBEY = True

# Configure job directory for data persistence
-JOBDIR = "crawls/indexbot"
+#JOBDIR = "crawls/indexbot"

# Configure logging
LOG_LEVEL = "INFO" # Set the logging level to WARNING or ERROR to reduce output
32 changes: 17 additions & 15 deletions indexbot/spiders/indexbot.py

@@ -1,26 +1,26 @@
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
-from items import IndexbotItem
-from utils.schema_parser import parse_schema
-from utils.text_processing import extract_keywords, generate_summary

import json
import time
import logging
import requests

+from indexbot.items import IndexbotItem
+from indexbot.utils.schema_parser import parse_schema
+from indexbot.utils.text_processing import extract_keywords, generate_summary

class IndexbotSpider(CrawlSpider):
    name = "indexbot"

    # Load start URLs from sources.txt
-    logging.INFO("Requesting sources.txt")
+    #logging.log(logging.INFO, "Requesting sources.txt")
    start_urls = []
    res = requests.get("https://data.openwebindex.org/indexes/metaindex-alpha/sources.txt")
    for line in res.text.split("\n"):
        if line.strip() and not line.startswith("#"):
            start_urls.append(line.strip())
-    logging.INFO(f"Loaded {len(start_urls)} URLs from sources.txt")
+    #logging.log(logging.INFO, f"Loaded {len(start_urls)} start URLs from sources.txt")

    #allowed_domains = ["producthunt.com"] # Replace with the target domain(s)
    #start_urls = ["http://producthunt.com"] # Replace with the initial URL(s)

@@ -31,12 +31,13 @@ class IndexbotSpider(CrawlSpider):

    def parse_item(self, response):
        # Extract all paragraph content
-        paragraphs = response.xpath('//p//text()').getall()
-        content = ' '.join(paragraphs)
+        paragraphs = response.xpath("//p//text()").getall()
+        content = ' '.join(paragraphs).strip()
+        content = content[:2000] + "..." # limit content to 1000 characters

-        # Generate RAKE keywords and Gensim summary
-        keywords = extract_keywords(content)
-        summary = generate_summary(content, word_count=100)
+        # Generate RAKE keywords and SUMY summary
+        #keywords = extract_keywords(content)
+        #summary = generate_summary(content, sentences=3)

        # Parse schema data
        schema_data = parse_schema(response)

@@ -46,6 +47,7 @@ def parse_item(self, response):
            canonical_url = response.xpath("//link[@rel='canonical']/@href").get(), # Canonical URL
            language = response.xpath("//html/@lang").get(), # Language of the page
            title = response.xpath("//title/text()").get(), # Page title
+            content = content, # Page content
            meta = {
                "description": response.xpath("//meta[@name='description']/@content").get(), # Meta description
                "keywords": response.xpath("//meta[@name='keywords']/@content").get(), # Meta keywords

@@ -65,10 +67,10 @@ def parse_item(self, response):
                "content_length": response.headers.get("Content-Length", b"").decode("utf-8"), # Content-Length header
                "server": response.headers.get("Server", b"").decode("utf-8"), # Server header
            },
-            gen = {
-                "keywords": keywords, # RAKE keywords
-                "summary": summary, # Gensim summary
-            },
+            #gen = {
+            #    "keywords": keywords, # RAKE keywords
+            #    "summary": summary, # SUMY summary
+            #},
            metrics={
                "content_length": len(response.text), # Length of the page content
                "internal_links": len(response.xpath("//a[starts-with(@href, '/')]/@href").getall()), # Number of internal links
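
On the "Fixed errors" part of the commit message: logging.INFO is an integer level constant, not a callable, so the removed logging.INFO("...") lines would raise a TypeError as soon as the spider module is imported. A short sketch of the working equivalents that the commented-out replacements appear to be aiming for (the URL count is a placeholder):

import logging

logging.basicConfig(level=logging.INFO)

# logging.INFO is just the number 20; calling it fails with
# "TypeError: 'int' object is not callable".
logging.log(logging.INFO, "Requesting sources.txt")        # explicit level
logging.info("Loaded %d start URLs from sources.txt", 42)  # shorthand; 42 is a placeholder count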
18 changes: 13 additions & 5 deletions indexbot/utils/text_processing.py

@@ -1,5 +1,10 @@
import nltk
from rake_nltk import Rake
-from gensim.summarization import summarize
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lsa import LsaSummarizer

+nltk.download("stopwords")

def extract_keywords(content):
    rake = Rake()

@@ -9,8 +14,11 @@ def extract_keywords(content):
    except:
        return []

-def generate_summary(content, word_count=100):
+def generate_summary(content, sentences=3):
    try:
-        return summarize(content, word_count=word_count)
-    except ValueError:
-        return "Summary could not be generated."
+        # Parse the text & generate the summary
+        parser = PlaintextParser.from_string(content, Tokenizer("english"))
+        summarizer = LsaSummarizer()
+        summary = summarizer(parser.document, sentences)
+    except:
+        return None
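
For context on the switch from gensim to sumy: gensim removed its gensim.summarization module in the 4.x releases, and sumy's LsaSummarizer returns a sequence of Sentence objects rather than a string. A minimal standalone sketch (not the project's helper) of turning that output into plain text; Tokenizer("english") may also need NLTK's "punkt" data in addition to the "stopwords" download above:

import nltk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# The sentence tokenizer behind Tokenizer("english") is NLTK's punkt model.
nltk.download("punkt")

text = "Scrapy crawls pages. Sumy summarizes text. This sentence is only filler."
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer()

# The summarizer yields Sentence objects; join their string forms to get a
# plain-text summary of the requested length.
sentences = summarizer(parser.document, 2)
summary = " ".join(str(sentence) for sentence in sentences)
print(summary)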
3 changes: 2 additions & 1 deletion requirements.txt

@@ -1,3 +1,4 @@
scrapy
nltk
+rake_nltk
-gensim
+sumy
