# scraping_multithreading.py
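"""Multithreaded Indeed scraper.

Collects job postings for a search query with Selenium + BeautifulSoup:
result pages are split across a thread pool (one Firefox instance per
worker), then each posting's full description is fetched in parallel and
everything is written to a CSV.
"""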
import concurrent.futures
import random
import time
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

def variable_init():
    # Selenium configuration shared by every worker thread. Only the names
    # actually read by other functions are declared global here.
    global executable_path
    global options
    options = Options()
    options.headless = True
    # Machine-specific paths: point these at your own Firefox and geckodriver installs.
    options.binary_location = 'C:\\Program Files\\Mozilla Firefox\\firefox.exe'
    executable_path = "C:/Users/shiva/Desktop/UofT/Term3-Fall2022/APS1624/Assignment 3/geckodriver.exe"
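
# NOTE: the executable_path= keyword and Options.headless used throughout this
# script are Selenium 3.x-era APIs. On Selenium >= 4.10 the equivalent setup
# would be (a sketch, not tested against this script):
#     from selenium.webdriver.firefox.service import Service
#     options.add_argument("-headless")
#     driver = webdriver.Firefox(service=Service(executable_path), options=options)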

def break_lists(lst, n):
    # Split lst into n chunks of near-equal size; the first m chunks get one extra item.
    k, m = divmod(len(lst), n)
    return (lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
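
# break_lists returns a generator of chunks, e.g.:
#     list(break_lists([0, 10, 20], 2))  ->  [[0, 10], [20]]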

def join_lists(lists):
    return [item for sublist in lists for item in sublist]
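
# join_lists flattens one level of nesting, e.g.:
#     join_lists([[1, 2], [3]])  ->  [1, 2, 3]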

def get_url(position, location):
    # Build an Indeed search URL; quote_plus percent-encodes the query terms
    # (spaces become '+') so the URL is valid.
    url_template = "https://www.indeed.com/jobs?q={}&l={}"
    return url_template.format(quote_plus(position), quote_plus(location))
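
# Example:
#     get_url("data scientist", "remote")
#     -> "https://www.indeed.com/jobs?q=data+scientist&l=remote"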

def scrape_posts(postings_list, url):
    # Scrape one batch of search-result pages with a dedicated Firefox instance.
    driver = webdriver.Firefox(executable_path=executable_path, options=options)
    driver.implicitly_wait(3)  # driver-level setting; applies to all lookups below
    columns = ["Title", "Company", "Location", "Rating", "Date", "Salary", "Description", "Links"]
    rows = []
    for i in postings_list:
        # Indeed paginates with a &start= offset (10 postings per page).
        driver.get(url + "&start=" + str(i))
        jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
        for job in jobs:
            result_html = job.get_attribute('innerHTML')
            soup = BeautifulSoup(result_html, 'html.parser')
            link = job.find_elements(By.TAG_NAME, "a")[0].get_attribute("href")
            title = soup.select('.jobTitle')[0].get_text().strip()
            company = soup.select('.companyName')[0].get_text().strip()
            location = soup.select('.companyLocation')[0].get_text().strip()
            # The fields below are missing from some cards, so fall back on a
            # placeholder when the selector matches nothing.
            try:
                salary = soup.select('.salary-snippet-container')[0].get_text().strip()
            except IndexError:
                salary = 'NaN'
            try:
                rating = soup.select('.ratingNumber')[0].get_text().strip()
            except IndexError:
                rating = 'NaN'
            try:
                date = soup.select('.date')[0].get_text().strip()
            except IndexError:
                date = 'NaN'
            try:
                description = soup.select('.job-snippet')[0].get_text().strip()
            except IndexError:
                description = ''
            rows.append([title, company, location, rating, date, salary, description, link])
    driver.quit()
    # Build the DataFrame once from the collected rows instead of concatenating per posting.
    return pd.DataFrame(rows, columns=columns)

def scrape_postings(postings, url, n_threads):
    # Step the start offsets by 10 (one result page each) and split them
    # across the worker threads.
    postings_list = break_lists(list(range(0, postings, 10)), n_threads)
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        for batch in postings_list:
            futures.append(executor.submit(scrape_posts, batch, url))
    return pd.concat([future.result() for future in futures], ignore_index=True)
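
# Example: scrape_postings(800, url, 12) queues start offsets 0, 10, ..., 790
# (80 result pages) and splits them across 12 Firefox instances.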

def get_job_description(links):
    # Fetch the full description text from each posting page, reusing the
    # shared options/executable_path set up in variable_init().
    driver = webdriver.Firefox(executable_path=executable_path, options=options)
    descriptions = []
    for link in links:
        driver.get(link)
        driver.implicitly_wait(random.randint(3, 8))
        descriptions.append(driver.find_element(By.XPATH, '//div[@id="jobDescriptionText"]').text)
        time.sleep(random.randint(5, 10))  # randomized pause to avoid hammering the site
    driver.quit()
    return descriptions
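
# Selenium WebDriver instances are not thread-safe, which is why each worker
# (here and in scrape_posts) launches its own Firefox session rather than
# sharing a single driver across threads.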

def main():
    random.seed(0)
    variable_init()
    position = "data scientist"
    location = "remote"
    url = get_url(position, location)
    n_threads = 12
    postings = 800
    dataframe = scrape_postings(postings, url, n_threads)
    # Follow each posting's link to collect the full description text in parallel.
    links_list = dataframe['Links'].tolist()
    futures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        for sublist in break_lists(links_list, n_threads):
            futures.append(executor.submit(get_job_description, sublist))
    descriptions = join_lists([future.result() for future in futures])
    # 'Descriptions' holds the full page text; 'Description' keeps the search-result snippet.
    dataframe['Descriptions'] = descriptions
    dataframe.to_csv('indeed_jobs_mt.csv', index=False)

if __name__ == '__main__':
    start = time.time()  # start time
    current_time = datetime.now().strftime("%H:%M:%S")
    print("Start Time =", current_time)
    main()
    end = time.time()  # end time
    print("Time taken: ", end - start)