
chore: Update of Python templates
vdusek committed Jan 21, 2025
1 parent 1d59913 commit 3cd51c3
Showing 5 changed files with 116 additions and 67 deletions.
98 changes: 54 additions & 44 deletions templates/python-beautifulsoup/src/main.py
@@ -21,6 +21,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -39,49 +40,58 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

try:
# Fetch the HTTP response from the specified URL using HTTPX.
async with AsyncClient() as client:
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

# Create an HTTPX client to fetch the HTML content of the URLs.
async with AsyncClient() as client:
# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Fetch the HTTP response from the specified URL using HTTPX.
response = await client.get(url, follow_redirects=True)

# Parse the HTML content using Beautiful Soup.
soup = BeautifulSoup(response.content, 'html.parser')

# If the current depth is less than max_depth, find nested links and enqueue them.
if depth < max_depth:
for link in soup.find_all('a'):
link_href = link.get('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)

# Extract the desired data.
data = {
'url': url,
'title': soup.title.string if soup.title else None,
'h1s': [h1.text for h1 in soup.find_all('h1')],
'h2s': [h2.text for h2 in soup.find_all('h2')],
'h3s': [h3.text for h3 in soup.find_all('h3')],
}

# Store the extracted data to the default dataset.
await Actor.push_data(data)

except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')

finally:
# Mark the request as handled to ensure it is not processed again.
await request_queue.mark_request_as_handled(request)
# Parse the HTML content using Beautiful Soup.
soup = BeautifulSoup(response.content, 'html.parser')

# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in soup.find_all('a'):
link_href = link.get('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
'url': url,
'title': soup.title.string if soup.title else None,
'h1s': [h1.text for h1 in soup.find_all('h1')],
'h2s': [h2.text for h2 in soup.find_all('h2')],
'h3s': [h3.text for h3 in soup.find_all('h3')],
}

# Store the extracted data to the default dataset.
await Actor.push_data(data)

except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')

finally:
# Mark the request as handled to ensure it is not processed again.
await request_queue.mark_request_as_handled(request)
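
Read as a whole, the updated template follows this pattern: one HTTPX client reused for every queued URL, a type check on the stored depth, and per-request error handling with the fetched request marked as handled at the end. Below is a minimal, self-contained sketch of that pattern, not the template itself; the `crawl` function name and hard-coded start URL are for illustration, and input handling plus most logging are trimmed.

```python
from urllib.parse import urljoin

from apify import Actor, Request
from bs4 import BeautifulSoup
from httpx import AsyncClient


async def crawl(max_depth: int = 1) -> None:
    async with Actor:
        request_queue = await Actor.open_request_queue()
        await request_queue.add_request(
            Request.from_url('https://apify.com', user_data={'depth': 0})
        )

        # One HTTPX client is opened up front and reused for every request,
        # instead of opening a new client per URL.
        async with AsyncClient() as client:
            while request := await request_queue.fetch_next_request():
                url = request.url

                # Validate the stored depth before casting it to int.
                if not isinstance(request.user_data['depth'], (str, int)):
                    raise TypeError('Request.user_data["depth"] is an unexpected type.')
                depth = int(request.user_data['depth'])

                try:
                    response = await client.get(url, follow_redirects=True)
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Enqueue nested links until the maximum depth is reached.
                    if depth < max_depth:
                        for link in soup.find_all('a'):
                            link_url = urljoin(url, link.get('href'))
                            if link_url.startswith(('http://', 'https://')):
                                await request_queue.add_request(
                                    Request.from_url(link_url, user_data={'depth': depth + 1})
                                )

                    # Store the extracted data to the default dataset.
                    await Actor.push_data({
                        'url': url,
                        'title': soup.title.string if soup.title else None,
                    })
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the fetched request as handled so it is not processed again.
                    await request_queue.mark_request_as_handled(request)
```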
9 changes: 8 additions & 1 deletion templates/python-crawlee-beautifulsoup/src/main.py
@@ -17,10 +17,17 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
start_urls = [
url.get('url')
for url in actor_input.get(
'start_urls',
[{'url': 'https://apify.com'}],
)
]

# Exit if no start URLs are provided.
if not start_urls:
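
The reformatted comprehension resolves the start URLs as sketched below. The `resolve_start_urls` helper and the example inputs are illustrative only, not part of the template:

```python
def resolve_start_urls(actor_input: dict) -> list:
    """Extract start URLs from the Actor input, falling back to https://apify.com."""
    return [
        url.get('url')
        for url in actor_input.get(
            'start_urls',
            [{'url': 'https://apify.com'}],
        )
    ]


# No 'start_urls' field in the input -> the default is used.
assert resolve_start_urls({}) == ['https://apify.com']

# Each {'url': ...} object in the input contributes one URL.
assert resolve_start_urls({'start_urls': [{'url': 'https://example.com'}]}) == [
    'https://example.com',
]
```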
9 changes: 8 additions & 1 deletion templates/python-crawlee-playwright/src/main.py
@@ -17,10 +17,17 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
start_urls = [
url.get('url')
for url in actor_input.get(
'start_urls',
[{'url': 'https://apify.com'}],
)
]

# Exit if no start URLs are provided.
if not start_urls:
31 changes: 22 additions & 9 deletions templates/python-playwright/src/main.py
@@ -14,7 +14,8 @@

# Note: To run this Actor locally, ensure that Playwright browsers are installed.
# Run `playwright install --with-deps` in the Actor's virtual environment to install them.
# When running on the Apify platform, these dependencies are already included in the Actor's Docker image.
# When running on the Apify platform, these dependencies are already included
# in the Actor's Docker image.


async def main() -> None:
@@ -24,6 +25,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -42,38 +44,49 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

Actor.log.info('Launching Playwright...')

# Launch Playwright and open a new browser context.
async with async_playwright() as playwright:
# Configure the browser to launch in headless mode as per Actor configuration.
browser = await playwright.chromium.launch(headless=Actor.config.headless, args=['--disable-gpu'])
browser = await playwright.chromium.launch(
headless=Actor.config.headless,
args=['--disable-gpu'],
)
context = await browser.new_context()

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Open a new page in the browser context and navigate to the URL.
page = await context.new_page()
await page.goto(url)

# If the current depth is less than max_depth, find nested links and enqueue them.
# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in await page.locator('a').all():
link_href = await link.get_attribute('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
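
The Playwright hunks boil down to the following launch-and-extract pattern. This is a minimal sketch: the `scrape_page` helper and the `headless` parameter are illustrative (the template reads `Actor.config.headless` inside its Actor context), and the request-queue plumbing is omitted.

```python
from urllib.parse import urljoin

from playwright.async_api import async_playwright


async def scrape_page(url: str, max_depth: int, depth: int = 0, headless: bool = True) -> dict:
    """Open one page in headless Chromium and collect its title plus nested links."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=headless,  # the template passes Actor.config.headless here
            args=['--disable-gpu'],
        )
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(url)

        links: list[str] = []
        # Collect nested links only while the current depth is below max_depth.
        if depth < max_depth:
            for link in await page.locator('a').all():
                link_href = await link.get_attribute('href')
                if not link_href:
                    continue
                link_url = urljoin(url, link_href)
                if link_url.startswith(('http://', 'https://')):
                    links.append(link_url)

        data = {'url': url, 'title': await page.title(), 'links': links}
        await browser.close()
        return data
```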
36 changes: 24 additions & 12 deletions templates/python-selenium/src/main.py
@@ -16,9 +16,10 @@
from apify import Actor, Request

# To run this Actor locally, you need to have the Selenium Chromedriver installed.
# Follow the installation guide at: https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, the Chromedriver is already included in the Actor's Docker image.

# Follow the installation guide at:
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, the Chromedriver is already included
# in the Actor's Docker image.

async def main() -> None:
"""Main entry point for the Apify Actor.
@@ -27,6 +28,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -45,8 +47,8 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

# Launch a new Selenium Chrome WebDriver and configure it.
Actor.log.info('Launching Chrome WebDriver...')
@@ -61,28 +63,38 @@ async def main() -> None:

# Test WebDriver setup by navigating to an example page.
driver.get('http://www.example.com')
assert driver.title == 'Example Domain'
if driver.title != 'Example Domain':
raise ValueError('Failed to open example page.')

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread for non-blocking execution.
# Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread
# for non-blocking execution.
await asyncio.to_thread(driver.get, url)

# If the current depth is less than max_depth, find nested links and enqueue them.
# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in driver.find_elements(By.TAG_NAME, 'a'):
link_href = link.get_attribute('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
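
The Selenium changes, non-blocking navigation via `asyncio.to_thread` and a title check instead of a bare assert, can be sketched as below. The Chrome options and the `collect_links` helper are assumptions for illustration; the template's own option setup sits outside the displayed hunks.

```python
import asyncio
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By


async def collect_links(url: str) -> list[str]:
    """Navigate with Selenium in a worker thread and collect absolute links."""
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')  # assumed option; configure as needed
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Sanity-check the WebDriver, raising instead of using a bare assert.
        await asyncio.to_thread(driver.get, 'http://www.example.com')
        if driver.title != 'Example Domain':
            raise ValueError('Failed to open example page.')

        # Blocking calls go through asyncio.to_thread so the event loop stays free.
        await asyncio.to_thread(driver.get, url)

        links: list[str] = []
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            link_href = link.get_attribute('href')
            if not link_href:
                continue
            link_url = urljoin(url, link_href)
            if link_url.startswith(('http://', 'https://')):
                links.append(link_url)
        return links
    finally:
        driver.quit()
```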
