Fix crawl limit (max_urls)
jdmansour committed Oct 16, 2024
1 parent c7868b5 commit b57d7c0
Showing 2 changed files with 29 additions and 7 deletions.
26 changes: 22 additions & 4 deletions converter/spiders/generic_spider.py

```diff
@@ -62,21 +62,38 @@ def __init__(self, urltocrawl="", validated_result="", ai_enabled="True", find_s
                  max_urls="3", filter_set_id="", **kwargs):
         super().__init__(**kwargs)
 
+        log.info("Initializing GenericSpider")
+        log.info("Arguments:")
+        log.info(" urltocrawl: %r", urltocrawl)
+        log.info(" validated_result: %r", validated_result)
+        log.info(" ai_enabled: %r", ai_enabled)
+        log.info(" find_sitemap: %r", find_sitemap)
+        log.info(" max_urls: %r", max_urls)
+        log.info(" filter_set_id: %r", filter_set_id)
+
+        if urltocrawl and filter_set_id:
+            raise ValueError("You must set either 'urltocrawl' or 'filter_set_id', not both.")
+
+        if not urltocrawl and not validated_result:
+            raise ValueError("You must set either 'urltocrawl' or 'validated_result'.")
+
         if filter_set_id != "":
             self.filter_set_id = int(filter_set_id)
         else:
             self.filter_set_id = None
 
+        self.max_urls = int(max_urls)
+
         self.results_dict = {}
         if urltocrawl != "":
             urls = [url.strip() for url in urltocrawl.split(",")]
             if find_sitemap == "True" and len(urls) == 1:
-                max_sitemap_urls = int(max_urls)
                 sitemap_urls = find_generate_sitemap(
-                    urls[0], max_entries=max_sitemap_urls)
+                    urls[0], max_entries=self.max_urls)
                 self.start_urls = sitemap_urls
             else:
-                self.start_urls = urls
+                self.start_urls = urls[:self.max_urls]
 
         if validated_result != "":
             self.results_dict = json.loads(validated_result)
             urltocrawl = self.results_dict["url"]
```
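
With this change the crawl limit is normalized once (`self.max_urls = int(max_urls)`) and enforced on both ways of building `start_urls` from `urltocrawl`: the sitemap lookup receives it as `max_entries`, and a plain comma-separated URL list is cut down with `urls[:self.max_urls]`. A minimal launch sketch, assuming the spider is run through Scrapy's `CrawlerProcess`; the URLs, the limit value, and the omission of project settings are illustrative, not taken from the repository:

```python
# Minimal usage sketch (assumptions: project settings are omitted for brevity;
# URLs and the limit are placeholder values).
from scrapy.crawler import CrawlerProcess

from converter.spiders.generic_spider import GenericSpider

process = CrawlerProcess()

# Keyword arguments are forwarded to GenericSpider.__init__, so max_urls is
# passed as a string and converted to int inside the spider.
process.crawl(
    GenericSpider,
    urltocrawl="https://example.org/a, https://example.org/b, https://example.org/c",
    find_sitemap="False",   # skip the sitemap path so the list is simply truncated
    max_urls="2",           # only the first two URLs end up in start_urls
)
process.start()
```

The same arguments can also be supplied on the command line with `scrapy crawl <spider-name> -a urltocrawl=... -a max_urls=2`, where the spider name is whatever `GenericSpider.name` is set to.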

```diff
@@ -137,8 +154,9 @@ def spider_opened(self, spider: GenericSpider):
         # List filter rules in this filter set
         connection = sqlite3.connect(db_path)
 
-        matches = fetch_urls_passing_filterset(connection, self.filter_set_id)
+        matches = fetch_urls_passing_filterset(connection, self.filter_set_id, limit=self.max_urls)
 
+        log.info("Adding %d URLs to start_urls", len(matches))
         for row in matches:
             log.info("Adding URL to start_urls: %s", row.url)
             self.start_urls.append(row.url)
```
10 changes: 7 additions & 3 deletions converter/util/generic_crawler_db.py

```diff
@@ -2,7 +2,7 @@
 
 import logging
 import sqlite3
-from typing import NamedTuple
+from typing import NamedTuple, Optional
 
 # import sqlparse
 
@@ -60,7 +60,8 @@ def generate_url_filter(filter_rules: list[FilterRule]) -> tuple[str, list[str]]
     return url_filter, parameters
 
 
-def fetch_urls_passing_filterset(connection: sqlite3.Connection, filter_set_id: int):
+def fetch_urls_passing_filterset(connection: sqlite3.Connection, filter_set_id: int,
+                                 limit: Optional[int] = None):
     log.info("Filter set ID: %s", filter_set_id)
     # List filter rules in this filter set
     cursor = connection.cursor()
@@ -98,6 +99,8 @@ def fetch_urls_passing_filterset(connection: sqlite3.Connection, filter_set_id:
     # params.append(crawl_job_id)
     # where_clause = "WHERE (" + filter_expression + ") AND crawl_job_id = ?"
     # params.append(crawl_job_id)
+    if limit:
+        assert isinstance(limit, int)
 
     query = f"""
         SELECT
@@ -121,7 +124,8 @@
                 WHERE fr_inner.filter_set_id = fs.id
                   AND fr_inner.include = 1
                   AND cu.url LIKE (fr_inner.rule || '%')
-            );
+            )
+            {f"LIMIT {limit}" if limit else ""};
     """
     params.append(str(filter_set_id))
```
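
One design note on the new `limit` parameter: the value is spliced into the SQL text with an f-string, so the `assert isinstance(limit, int)` guard is what keeps anything other than an integer out of the query, and because the guard is `if limit:`, passing `limit=0` behaves like no limit at all. SQLite also accepts a bound parameter in a LIMIT clause, so an alternative would be to parameterize it. A sketch under the assumption that only the LIMIT handling changes; the table and column names below are placeholders, not the repository's schema:

```python
# Hypothetical, simplified variant that binds the limit instead of
# interpolating it; "crawled_urls" and its columns are placeholder names.
import sqlite3
from typing import Optional


def fetch_urls_limited(connection: sqlite3.Connection, filter_set_id: int,
                       limit: Optional[int] = None) -> list[tuple]:
    query = "SELECT url FROM crawled_urls WHERE filter_set_id = ?"
    params: list[object] = [filter_set_id]
    if limit is not None:
        query += " LIMIT ?"   # SQLite allows a placeholder for the LIMIT value
        params.append(limit)
    cursor = connection.cursor()
    cursor.execute(query, params)
    return cursor.fetchall()
```

Either way, the cap is applied in the database before spider_opened appends the rows to start_urls, so it complements the slicing done in __init__ for the urltocrawl path.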

