-
Notifications
You must be signed in to change notification settings - Fork 172
/
Copy path: main.py
83 lines (66 loc) · 2.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import asyncio
from crawl4ai import AsyncWebCrawler
from dotenv import load_dotenv
from config import BASE_URL, CSS_SELECTOR, REQUIRED_KEYS
from utils.data_utils import (
save_venues_to_csv,
)
from utils.scraper_utils import (
fetch_and_process_page,
get_browser_config,
get_llm_strategy,
)
load_dotenv()
async def crawl_venues():
"""
Main function to crawl venue data from the website.
"""
# Initialize configurations
browser_config = get_browser_config()
llm_strategy = get_llm_strategy()
session_id = "venue_crawl_session"
# Initialize state variables
page_number = 1
all_venues = []
seen_names = set()
# Start the web crawler context
# https://docs.crawl4ai.com/api/async-webcrawler/#asyncwebcrawler
async with AsyncWebCrawler(config=browser_config) as crawler:
while True:
# Fetch and process data from the current page
venues, no_results_found = await fetch_and_process_page(
crawler,
page_number,
BASE_URL,
CSS_SELECTOR,
llm_strategy,
session_id,
REQUIRED_KEYS,
seen_names,
)
if no_results_found:
print("No more venues found. Ending crawl.")
break # Stop crawling when "No Results Found" message appears
if not venues:
print(f"No venues extracted from page {page_number}.")
break # Stop if no venues are extracted
# Add the venues from this page to the total list
all_venues.extend(venues)
page_number += 1 # Move to the next page
# Pause between requests to be polite and avoid rate limits
await asyncio.sleep(2) # Adjust sleep time as needed
# Save the collected venues to a CSV file
if all_venues:
save_venues_to_csv(all_venues, "complete_venues.csv")
print(f"Saved {len(all_venues)} venues to 'complete_venues.csv'.")
else:
print("No venues were found during the crawl.")
# Display usage statistics for the LLM strategy
llm_strategy.show_usage()
async def main():
"""
Entry point of the script.
"""
await crawl_venues()
if __name__ == "__main__":
asyncio.run(main())