feat: expose more wcc options (#9)
MQ37 authored Jan 24, 2025
1 parent 3fe392b commit 15cda8d
Showing 4 changed files with 39 additions and 2 deletions.
28 changes: 28 additions & 0 deletions .actor/input_schema.json
@@ -16,6 +16,34 @@
             "description": "The maximum depth of the crawl. Default is 1.",
             "editor": "number",
             "default": 1
+        },
+        "maxCrawlPages": {
+            "title": "Max crawl pages",
+            "type": "integer",
+            "description": "The maximum number of pages to crawl. Default is 50.",
+            "editor": "number",
+            "default": 50
+        },
+        "crawlerType": {
+            "title": "Crawler type",
+            "type": "string",
+            "enum": [
+                "playwright:adaptive",
+                "playwright:firefox",
+                "playwright:chrome",
+                "cheerio",
+                "jsdom"
+            ],
+            "enumTitles": [
+                "Adaptive switching between browser and raw HTTP - Fast and renders JavaScript if needed. This is the recommended option.",
+                "Headless browser (Firefox+Playwright) - Reliable, renders JavaScript, best in avoiding blocking, but might be slow.",
+                "Headless browser (Chrome+Playwright) - Deprecated, the crawler will use Firefox+Playwright instead.",
+                "Raw HTTP client (Cheerio) - Fastest crawler, but cannot render JavaScript.",
+                "Raw HTTP client with JavaScript (JSDOM) - Experimental, use at your own risk."
+            ],
+            "description": "Select the crawling engine:\n- **Headless web browser** - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Stealthy web browser** (default) - Another headless web browser with anti-blocking measures enabled. Try this if you encounter bot protection while scraping. For best performance, use with Apify Proxy residential IPs. \n- **Adaptive switching between Chrome and raw HTTP client** - The crawler automatically switches between raw HTTP for static pages and Chrome browser (via Playwright) for dynamic pages, to get the maximum performance wherever possible. \n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.\n\nBeware that with the raw HTTP client or adaptive crawling mode, some features are not available, e.g. wait for dynamic content, maximum scroll height, or remove cookie warnings.",
+            "default": "playwright:adaptive",
+            "prefill": "playwright:adaptive"
+        }
     },
     "required": ["startUrl"]
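For illustration only, a minimal actor input exercising the newly exposed options might look like the sketch below (the URL is a placeholder; the field names come from the schema above, with `startUrl` taken from the required list):

actor_input = {
    'startUrl': 'https://example.com',  # placeholder URL
    'maxCrawlDepth': 2,        # follow links up to two levels deep
    'maxCrawlPages': 100,      # stop after 100 pages
    'crawlerType': 'cheerio',  # raw HTTP client: fastest, but no JavaScript rendering
}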
1 change: 1 addition & 0 deletions src/crawler_config.py
@@ -3,6 +3,7 @@
     'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle',
     # changed by get_crawler_actor_config with default value 1
     'maxCrawlDepth': 0, # 0 by default for root page only just in case
+    'maxCrawlPages': 10, # 10 by default, just in case it is not set
     'saveHtmlAsFile': True,
     'startUrls': [
         # is populated by get_crawler_actor_config
6 changes: 5 additions & 1 deletion src/helpers.py
@@ -57,11 +57,15 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u
     return get_description_from_html(html)


-def get_crawler_actor_config(url: str, max_crawl_depth: int = 1) -> dict:
+def get_crawler_actor_config(
+    url: str, max_crawl_depth: int = 1, max_crawl_pages: int = 50, crawler_type: str = 'playwright:adaptive'
+) -> dict:
     """Creates actor input configuration for the `apify/website-content-crawler` actor."""
     config = CRAWLER_CONFIG
     config['startUrls'] = [{'url': url, 'method': 'GET'}]
     config['maxCrawlDepth'] = max_crawl_depth
+    config['maxCrawlPages'] = max_crawl_pages
+    config['crawlerType'] = crawler_type

     return config

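A quick usage sketch of the updated helper, with hypothetical argument values and an import path assumed from the file layout shown above:

from src.helpers import get_crawler_actor_config

# Build a website-content-crawler input with the newly exposed options.
config = get_crawler_actor_config(
    'https://example.com',  # placeholder URL
    max_crawl_depth=2,
    max_crawl_pages=100,
    crawler_type='cheerio',
)
assert config['maxCrawlPages'] == 100
assert config['crawlerType'] == 'cheerio'

One design note: `config = CRAWLER_CONFIG` binds the shared module-level dict rather than copying it, so successive calls mutate the same object; a `copy.deepcopy(CRAWLER_CONFIG)` would isolate them if that ever matters here.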
6 changes: 5 additions & 1 deletion src/main.py
@@ -26,12 +26,16 @@ async def main() -> None:
     url_normalized = normalize_url(url)

     max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1))
+    max_crawl_pages = int(actor_input.get('maxCrawlPages', 50))
+    crawler_type = actor_input.get('crawlerType', 'playwright:adaptive')

     # call apify/website-content-crawler actor to get the html content
     logger.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}')
     actor_run_details = await Actor.call(
         'apify/website-content-crawler',
-        get_crawler_actor_config(url, max_crawl_depth=max_crawl_depth),
+        get_crawler_actor_config(
+            url, max_crawl_depth=max_crawl_depth, max_crawl_pages=max_crawl_pages, crawler_type=crawler_type
+        ),
         # memory limit for the crawler actor so free tier can use this actor
         memory_mbytes=4096,
     )
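The fallbacks in main.py mirror the schema defaults: when the caller omits the new options, `actor_input.get(...)` supplies 50 and 'playwright:adaptive'. A minimal illustration, with a plain dict standing in for `actor_input`:

actor_input = {'startUrl': 'https://example.com'}  # no crawler options supplied

max_crawl_pages = int(actor_input.get('maxCrawlPages', 50))           # -> 50
crawler_type = actor_input.get('crawlerType', 'playwright:adaptive')  # -> 'playwright:adaptive'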
