
chore: Update of Python templates
vdusek committed Jan 21, 2025
1 parent 1d59913 commit 3cd51c3
Showing 5 changed files with 116 additions and 67 deletions.
98 changes: 54 additions & 44 deletions templates/python-beautifulsoup/src/main.py
@@ -21,6 +21,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -39,49 +40,58 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

try:
# Fetch the HTTP response from the specified URL using HTTPX.
async with AsyncClient() as client:
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

# Create an HTTPX client to fetch the HTML content of the URLs.
async with AsyncClient() as client:
# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Fetch the HTTP response from the specified URL using HTTPX.
response = await client.get(url, follow_redirects=True)

# Parse the HTML content using Beautiful Soup.
soup = BeautifulSoup(response.content, 'html.parser')

# If the current depth is less than max_depth, find nested links and enqueue them.
if depth < max_depth:
for link in soup.find_all('a'):
link_href = link.get('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)

# Extract the desired data.
data = {
'url': url,
'title': soup.title.string if soup.title else None,
'h1s': [h1.text for h1 in soup.find_all('h1')],
'h2s': [h2.text for h2 in soup.find_all('h2')],
'h3s': [h3.text for h3 in soup.find_all('h3')],
}

# Store the extracted data to the default dataset.
await Actor.push_data(data)

except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')

finally:
# Mark the request as handled to ensure it is not processed again.
await request_queue.mark_request_as_handled(request)
# Parse the HTML content using Beautiful Soup.
soup = BeautifulSoup(response.content, 'html.parser')

# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in soup.find_all('a'):
link_href = link.get('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
'url': url,
'title': soup.title.string if soup.title else None,
'h1s': [h1.text for h1 in soup.find_all('h1')],
'h2s': [h2.text for h2 in soup.find_all('h2')],
'h3s': [h3.text for h3 in soup.find_all('h3')],
}

# Store the extracted data to the default dataset.
await Actor.push_data(data)

except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')

finally:
# Mark the request as handled to ensure it is not processed again.
await request_queue.mark_request_as_handled(request)
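
Read as a whole, the updated template follows this pattern: one HTTPX client reused for every queued URL, a type check on the stored depth, and per-request error handling with the fetched request marked as handled at the end. Below is a minimal, self-contained sketch of that pattern, not the template itself; the `crawl` function name and hard-coded start URL are for illustration, and input handling plus most logging are trimmed.

```python
from urllib.parse import urljoin

from apify import Actor, Request
from bs4 import BeautifulSoup
from httpx import AsyncClient


async def crawl(max_depth: int = 1) -> None:
    async with Actor:
        request_queue = await Actor.open_request_queue()
        await request_queue.add_request(
            Request.from_url('https://apify.com', user_data={'depth': 0})
        )

        # One HTTPX client is opened up front and reused for every request,
        # instead of opening a new client per URL.
        async with AsyncClient() as client:
            while request := await request_queue.fetch_next_request():
                url = request.url

                # Validate the stored depth before casting it to int.
                if not isinstance(request.user_data['depth'], (str, int)):
                    raise TypeError('Request.user_data["depth"] is an unexpected type.')
                depth = int(request.user_data['depth'])

                try:
                    response = await client.get(url, follow_redirects=True)
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Enqueue nested links until the maximum depth is reached.
                    if depth < max_depth:
                        for link in soup.find_all('a'):
                            link_url = urljoin(url, link.get('href'))
                            if link_url.startswith(('http://', 'https://')):
                                await request_queue.add_request(
                                    Request.from_url(link_url, user_data={'depth': depth + 1})
                                )

                    # Store the extracted data to the default dataset.
                    await Actor.push_data({
                        'url': url,
                        'title': soup.title.string if soup.title else None,
                    })
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the fetched request as handled so it is not processed again.
                    await request_queue.mark_request_as_handled(request)
```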
9 changes: 8 additions & 1 deletion templates/python-crawlee-beautifulsoup/src/main.py
@@ -17,10 +17,17 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
start_urls = [
url.get('url')
for url in actor_input.get(
'start_urls',
[{'url': 'https://apify.com'}],
)
]

# Exit if no start URLs are provided.
if not start_urls:
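
The reformatted comprehension resolves the start URLs as sketched below. The `resolve_start_urls` helper and the example inputs are illustrative only, not part of the template:

```python
def resolve_start_urls(actor_input: dict) -> list:
    """Extract start URLs from the Actor input, falling back to https://apify.com."""
    return [
        url.get('url')
        for url in actor_input.get(
            'start_urls',
            [{'url': 'https://apify.com'}],
        )
    ]


# No 'start_urls' field in the input -> the default is used.
assert resolve_start_urls({}) == ['https://apify.com']

# Each {'url': ...} object in the input contributes one URL.
assert resolve_start_urls({'start_urls': [{'url': 'https://example.com'}]}) == [
    'https://example.com',
]
```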
9 changes: 8 additions & 1 deletion templates/python-crawlee-playwright/src/main.py
@@ -17,10 +17,17 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
start_urls = [
url.get('url')
for url in actor_input.get(
'start_urls',
[{'url': 'https://apify.com'}],
)
]

# Exit if no start URLs are provided.
if not start_urls:
31 changes: 22 additions & 9 deletions templates/python-playwright/src/main.py
@@ -14,7 +14,8 @@

# Note: To run this Actor locally, ensure that Playwright browsers are installed.
# Run `playwright install --with-deps` in the Actor's virtual environment to install them.
# When running on the Apify platform, these dependencies are already included in the Actor's Docker image.
# When running on the Apify platform, these dependencies are already included
# in the Actor's Docker image.


async def main() -> None:
@@ -24,6 +25,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -42,38 +44,49 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

Actor.log.info('Launching Playwright...')

# Launch Playwright and open a new browser context.
async with async_playwright() as playwright:
# Configure the browser to launch in headless mode as per Actor configuration.
browser = await playwright.chromium.launch(headless=Actor.config.headless, args=['--disable-gpu'])
browser = await playwright.chromium.launch(
headless=Actor.config.headless,
args=['--disable-gpu'],
)
context = await browser.new_context()

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Open a new page in the browser context and navigate to the URL.
page = await context.new_page()
await page.goto(url)

# If the current depth is less than max_depth, find nested links and enqueue them.
# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in await page.locator('a').all():
link_href = await link.get_attribute('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
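
The Playwright hunks boil down to the following launch-and-extract pattern. This is a minimal sketch: the `scrape_page` helper and the `headless` parameter are illustrative (the template reads `Actor.config.headless` inside its Actor context), and the request-queue plumbing is omitted.

```python
from urllib.parse import urljoin

from playwright.async_api import async_playwright


async def scrape_page(url: str, max_depth: int, depth: int = 0, headless: bool = True) -> dict:
    """Open one page in headless Chromium and collect its title plus nested links."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=headless,  # the template passes Actor.config.headless here
            args=['--disable-gpu'],
        )
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(url)

        links: list[str] = []
        # Collect nested links only while the current depth is below max_depth.
        if depth < max_depth:
            for link in await page.locator('a').all():
                link_href = await link.get_attribute('href')
                if not link_href:
                    continue
                link_url = urljoin(url, link_href)
                if link_url.startswith(('http://', 'https://')):
                    links.append(link_url)

        data = {'url': url, 'title': await page.title(), 'links': links}
        await browser.close()
        return data
```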
36 changes: 24 additions & 12 deletions templates/python-selenium/src/main.py
@@ -16,9 +16,10 @@
from apify import Actor, Request

# To run this Actor locally, you need to have the Selenium Chromedriver installed.
# Follow the installation guide at: https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, the Chromedriver is already included in the Actor's Docker image.

# Follow the installation guide at:
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, the Chromedriver is already included
# in the Actor's Docker image.

async def main() -> None:
"""Main entry point for the Apify Actor.
@@ -27,6 +28,7 @@ async def main() -> None:
Asynchronous execution is required for communication with the Apify platform, and it also significantly
enhances performance in the field of web scraping.
"""
# Enter the context of the Actor.
async with Actor:
# Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {}
@@ -45,8 +47,8 @@ async def main() -> None:
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(request)
new_request = Request.from_url(url, user_data={'depth': 0})
await request_queue.add_request(new_request)

# Launch a new Selenium Chrome WebDriver and configure it.
Actor.log.info('Launching Chrome WebDriver...')
@@ -61,28 +63,38 @@ async def main() -> None:

# Test WebDriver setup by navigating to an example page.
driver.get('http://www.example.com')
assert driver.title == 'Example Domain'
if driver.title != 'Example Domain':
raise ValueError('Failed to open example page.')

# Process the URLs from the request queue.
while request := await request_queue.fetch_next_request():
url = request.url
depth = request.user_data['depth']
Actor.log.info(f'Scraping {url} ...')

if not isinstance(request.user_data['depth'], (str, int)):
raise TypeError('Request.depth is an unexpected type.')

depth = int(request.user_data['depth'])
Actor.log.info(f'Scraping {url} (depth={depth}) ...')

try:
# Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread for non-blocking execution.
# Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread
# for non-blocking execution.
await asyncio.to_thread(driver.get, url)

# If the current depth is less than max_depth, find nested links and enqueue them.
# If the current depth is less than max_depth, find nested links
# and enqueue them.
if depth < max_depth:
for link in driver.find_elements(By.TAG_NAME, 'a'):
link_href = link.get_attribute('href')
link_url = urljoin(url, link_href)

if link_url.startswith(('http://', 'https://')):
Actor.log.info(f'Enqueuing {link_url} ...')
request = Request.from_url(link_url, user_data={'depth': depth + 1})
await request_queue.add_request(request)
new_request = Request.from_url(
link_url,
user_data={'depth': depth + 1},
)
await request_queue.add_request(new_request)

# Extract the desired data.
data = {
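
The Selenium changes, non-blocking navigation via `asyncio.to_thread` and a title check instead of a bare assert, can be sketched as below. The Chrome options and the `collect_links` helper are assumptions for illustration; the template's own option setup sits outside the displayed hunks.

```python
import asyncio
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By


async def collect_links(url: str) -> list[str]:
    """Navigate with Selenium in a worker thread and collect absolute links."""
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')  # assumed option; configure as needed
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Sanity-check the WebDriver, raising instead of using a bare assert.
        await asyncio.to_thread(driver.get, 'http://www.example.com')
        if driver.title != 'Example Domain':
            raise ValueError('Failed to open example page.')

        # Blocking calls go through asyncio.to_thread so the event loop stays free.
        await asyncio.to_thread(driver.get, url)

        links: list[str] = []
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            link_href = link.get_attribute('href')
            if not link_href:
                continue
            link_url = urljoin(url, link_href)
            if link_url.startswith(('http://', 'https://')):
                links.append(link_url)
        return links
    finally:
        driver.quit()
```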
