Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: crawler timeout #13

Merged
merged 1 commit into from
Jan 27, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module defines the main entry point for the llsm.txt generator actor."""

import logging
from datetime import timedelta
from typing import TYPE_CHECKING
from urllib.parse import urlparse

Expand All @@ -14,6 +15,9 @@

logger = logging.getLogger('apify')

# minimum time in seconds reserved for the llms.txt generator to process the results
MIN_GENERATOR_RUN_SECS = 60


async def main() -> None:
"""Main entry point for the llms.txt generator actor."""
Expand All @@ -29,6 +33,28 @@ async def main() -> None:
max_crawl_pages = int(actor_input.get('maxCrawlPages', 50))
crawler_type = actor_input.get('crawlerType', 'playwright:adaptive')

if run_id := Actor.config.actor_run_id:
if not (run := await Actor.apify_client.run(run_id).get()):
msg = 'Failed to get the actor run details!'
raise RuntimeError(msg)

if not (timeout_secs := run.get('options', {}).get('timeoutSecs')):
msg = 'Missing "timeoutSecs" attribute in actor run details!'
raise ValueError(msg)

# crawler timeout is set to timeout - MIN_GENERATOR_RUN_SECS, or to the full timeout if that would leave too little time
timeout_crawler = timedelta(
seconds=(
timeout_secs - MIN_GENERATOR_RUN_SECS
if timeout_secs >= MIN_GENERATOR_RUN_SECS * 2
else timeout_secs
)
)
# if the run is local, do not set the timeout
else:
logger.warning('Running the actor locally, not setting the crawler timeout!')
timeout_crawler = None

# call apify/website-content-crawler actor to get the html content
logger.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}')
actor_run_details = await Actor.call(
Expand All @@ -38,6 +64,7 @@ async def main() -> None:
),
# memory limit for the crawler actor so free tier can use this actor
memory_mbytes=4096,
timeout=timeout_crawler,
)
if actor_run_details is None:
msg = 'Failed to start the "apify/website-content-crawler" actor!'
Expand Down
Loading