scraper.py
import logging
import time
from concurrent.futures import ThreadPoolExecutor, wait
from functools import wraps
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from scraper_utils import scrape_tag

TAGS = [
    "data-science",
    "machine-learning",
    "python",
    "programming",
    "technology",
]

# format must match YYYY/MM/DD
START_DATE = "2018/01/01"
END_DATE = "2022/10/01"


def get_proxies():
    """Scrape a handful of free proxies from free-proxy-list.net."""
    url = "https://free-proxy-list.net/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    proxies = []
    for row in soup.tbody.find_all("tr"):
        if len(proxies) == 5:
            break
        cells = row.find_all("td")
        # the first two columns of the table are the proxy's IP and port
        proxies.append(":".join([cells[0].string, cells[1].string]))
    return proxies
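

# Hypothetical usage sketch (not called anywhere in the original script):
# get_proxies() returns "host:port" strings, which requests expects inside a
# proxies mapping. The URL and timeout below are placeholders.
def fetch_via_proxy(url="https://example.com"):
    proxy = get_proxies()[0]
    proxy_map = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
    return requests.get(url, proxies=proxy_map, timeout=10)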


def get_driver():
    """Create a Selenium Chrome driver with basic bot-detection evasion."""
    options = webdriver.ChromeOptions()
    # hide the "controlled by automated test software" banner and the
    # automation extension
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument("--disable-blink-features=AutomationControlled")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=options, service=service)
    driver.implicitly_wait(30)
    # spoof a regular desktop Chrome user agent
    driver.execute_cdp_cmd(
        "Network.setUserAgentOverride",
        {
            "userAgent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like"
                " Gecko) Chrome/91.0.4472.114 Safari/537.36"
            )
        },
    )
    # hide navigator.webdriver; Page.addScriptToEvaluateOnNewDocument (rather
    # than execute_script) makes the override persist across navigations
    # instead of applying only to the current page
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    return driver


def timeit(func):
    """Decorator that logs how long the wrapped function took to run."""

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        logging.info(f"{func.__name__}{args} {kwargs} took {total_time:.4f} seconds")
        return result

    return timeit_wrapper
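

# Illustrative only (not part of the original script): applying the decorator
# to a trivial function logs a line like
# "slow_add(1, 2) {} took 0.5002 seconds".
@timeit
def slow_add(a, b):
    time.sleep(0.5)
    return a + b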


def single_scrape(path):
    """Scrape each tag sequentially, one driver at a time."""
    for tag in TAGS:
        logging.info(f"Scraping tag: {tag}")
        logging.info("=====================================")
        driver = get_driver()
        scrape_tag(tag, driver, path, START_DATE, END_DATE)
        driver.quit()  # release the browser before moving on to the next tag
        logging.info(f"Finished scraping {tag}")
    logging.info("done")


@timeit
def concurrent_scrape(path):
    """Scrape all tags in parallel, one driver per tag."""
    drivers = [get_driver() for _ in range(5)]
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(scrape_tag, tag, driver, path, START_DATE, END_DATE)
            for tag, driver in zip(TAGS, drivers)
        ]
        wait(futures)
    for driver in drivers:
        driver.quit()
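

# Hypothetical variant (an assumption, not part of the original script): the
# comment in main() below notes that concurrent scraping trips bot detection
# after a while, so one mitigation to try is staggering driver start-up. The
# delay value is a guess to tune; this is a sketch, not a verified fix.
@timeit
def staggered_concurrent_scrape(path, delay=30):
    """Like concurrent_scrape, but pause between launching drivers."""
    drivers = []
    with ThreadPoolExecutor(max_workers=len(TAGS)) as executor:
        futures = []
        for tag in TAGS:
            driver = get_driver()
            drivers.append(driver)
            futures.append(executor.submit(scrape_tag, tag, driver, path, START_DATE, END_DATE))
            time.sleep(delay)  # space out browser launches
        wait(futures)
    for driver in drivers:
        driver.quit()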


def main():
    project_dir = Path(__file__).resolve().parents[2]
    path = project_dir / "data" / "0_raw"
    try:
        path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        logging.error("Output folder already exists; reusing it")
    single_scrape(path)
    # concurrent scraping is faster but gets bot-checked after a while:
    # concurrent_scrape(path)


if __name__ == "__main__":
    log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    main()