feat: expose more wcc options (#9)
MQ37 authored Jan 24, 2025
1 parent 3fe392b commit 15cda8d
Showing 4 changed files with 39 additions and 2 deletions.
28 changes: 28 additions & 0 deletions .actor/input_schema.json
@@ -16,6 +16,34 @@
             "description": "The maximum depth of the crawl. Default is 1.",
             "editor": "number",
             "default": 1
+        },
+        "maxCrawlPages": {
+            "title": "Max crawl pages",
+            "type": "integer",
+            "description": "The maximum number of pages to crawl. Default is 50.",
+            "editor": "number",
+            "default": 50
+        },
+        "crawlerType": {
+            "title": "Crawler type",
+            "type": "string",
+            "enum": [
+                "playwright:adaptive",
+                "playwright:firefox",
+                "playwright:chrome",
+                "cheerio",
+                "jsdom"
+            ],
+            "enumTitles": [
+                "Adaptive switching between browser and raw HTTP - Fast and renders JavaScript if needed. This is the recommended option.",
+                "Headless browser (Firefox+Playwright) - Reliable, renders JavaScript, best in avoiding blocking, but might be slow.",
+                "Headless browser (Chrome+Playwright) - Deprecated, the crawler will use Firefox+Playwright instead.",
+                "Raw HTTP client (Cheerio) - Fastest crawler, but cannot render JavaScript.",
+                "Raw HTTP client with JavaScript (JSDOM) - Experimental, use at your own risk."
+            ],
+            "description": "Select the crawling engine:\n- **Headless web browser** - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Stealthy web browser** (default) - Another headless web browser with anti-blocking measures enabled. Try this if you encounter bot protection while scraping. For best performance, use with Apify Proxy residential IPs. \n- **Adaptive switching between Chrome and raw HTTP client** - The crawler automatically switches between raw HTTP for static pages and Chrome browser (via Playwright) for dynamic pages, to get the maximum performance wherever possible. \n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.\n\nBeware that with the raw HTTP client or adaptive crawling mode, some features are not available, e.g. wait for dynamic content, maximum scroll height, or remove cookie warnings.",
+            "default": "playwright:adaptive",
+            "prefill": "playwright:adaptive"
+        }
     },
     "required": ["startUrl"]
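For illustration only, a minimal actor input exercising the newly exposed options might look like the sketch below (the URL is a placeholder; the field names come from the schema above, with `startUrl` taken from the required list):

actor_input = {
    'startUrl': 'https://example.com',  # placeholder URL
    'maxCrawlDepth': 2,        # follow links up to two levels deep
    'maxCrawlPages': 100,      # stop after 100 pages
    'crawlerType': 'cheerio',  # raw HTTP client: fastest, but no JavaScript rendering
}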
1 change: 1 addition & 0 deletions src/crawler_config.py
@@ -3,6 +3,7 @@
     'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle',
     # changed by get_crawler_actor_config with default value 1
     'maxCrawlDepth': 0, # 0 by default for root page only just in case
+    'maxCrawlPages': 10, # 10 by default, just in case it is not set
     'saveHtmlAsFile': True,
     'startUrls': [
         # is populated by get_crawler_actor_config
6 changes: 5 additions & 1 deletion src/helpers.py
@@ -57,11 +57,15 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u
     return get_description_from_html(html)


-def get_crawler_actor_config(url: str, max_crawl_depth: int = 1) -> dict:
+def get_crawler_actor_config(
+    url: str, max_crawl_depth: int = 1, max_crawl_pages: int = 50, crawler_type: str = 'playwright:adaptive'
+) -> dict:
     """Creates actor input configuration for the `apify/website-content-crawler` actor."""
     config = CRAWLER_CONFIG
     config['startUrls'] = [{'url': url, 'method': 'GET'}]
     config['maxCrawlDepth'] = max_crawl_depth
+    config['maxCrawlPages'] = max_crawl_pages
+    config['crawlerType'] = crawler_type

     return config

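A quick usage sketch of the updated helper, with hypothetical argument values and an import path assumed from the file layout shown above:

from src.helpers import get_crawler_actor_config

# Build a website-content-crawler input with the newly exposed options.
config = get_crawler_actor_config(
    'https://example.com',  # placeholder URL
    max_crawl_depth=2,
    max_crawl_pages=100,
    crawler_type='cheerio',
)
assert config['maxCrawlPages'] == 100
assert config['crawlerType'] == 'cheerio'

One design note: `config = CRAWLER_CONFIG` binds the shared module-level dict rather than copying it, so successive calls mutate the same object; a `copy.deepcopy(CRAWLER_CONFIG)` would isolate them if that ever matters here.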
6 changes: 5 additions & 1 deletion src/main.py
@@ -26,12 +26,16 @@ async def main() -> None:
     url_normalized = normalize_url(url)

     max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1))
+    max_crawl_pages = int(actor_input.get('maxCrawlPages', 50))
+    crawler_type = actor_input.get('crawlerType', 'playwright:adaptive')

     # call apify/website-content-crawler actor to get the html content
     logger.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}')
     actor_run_details = await Actor.call(
         'apify/website-content-crawler',
-        get_crawler_actor_config(url, max_crawl_depth=max_crawl_depth),
+        get_crawler_actor_config(
+            url, max_crawl_depth=max_crawl_depth, max_crawl_pages=max_crawl_pages, crawler_type=crawler_type
+        ),
         # memory limit for the crawler actor so free tier can use this actor
         memory_mbytes=4096,
     )
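The fallbacks in main.py mirror the schema defaults: when the caller omits the new options, `actor_input.get(...)` supplies 50 and 'playwright:adaptive'. A minimal illustration, with a plain dict standing in for `actor_input`:

actor_input = {'startUrl': 'https://example.com'}  # no crawler options supplied

max_crawl_pages = int(actor_input.get('maxCrawlPages', 50))           # -> 50
crawler_type = actor_input.get('crawlerType', 'playwright:adaptive')  # -> 'playwright:adaptive'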
