Bug fixes, adjustments, and cleanup. (#84)
* Fixes search issues caused by bugs in search query encoding
- For Indeed and Monster, the query string was not properly encoded when a quoted phrase containing spaces was provided. The fix was to encode all spaces with the provider's separator character ('+' for Indeed, '-' for Monster); a sketch of the scheme follows this list. The same issue and fix also applied to city names.
- For GlassDoorStatic, the query string was URL-encoded and returned improper results. Since this class searches using a JSON payload, the solution was to join the keywords with spaces instead.
- The old query construction was moved from GlassDoorBase to GlassDoorDynamic so that the dynamic scraper class does not break.
- Fixes issue #80.
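
A minimal sketch of the encoding scheme described above (the helper names are illustrative, not part of the project's API):

def indeed_query(keywords):
    # Indeed separates terms with '+', so spaces inside quoted phrases
    # must also become '+': ['"data scientist"', 'python'] -> '"data+scientist"+python'
    return '+'.join(keywords).replace(' ', '+')

def monster_query(keywords):
    # Monster uses '-' as its separator instead
    return '-'.join(keywords).replace(' ', '-')

def glassdoor_static_query(keywords):
    # GlassDoorStatic submits keywords in a JSON payload, so no URL
    # encoding is needed; a plain space-joined string is correct
    return ' '.join(keywords)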

* Radius function cleanup (see the radius sketch below)
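
The pattern is to snap a free-form radius to a supported bucket and, for Glassdoor, map the bucket to the site's own radius code via the lookup table in the diff below. A sketch under that assumption (glassdoor_radius_code is a made-up name, and the real snapping thresholds differ slightly per provider):

GLASSDOOR_RADIUS = {0: 0, 10: 6, 20: 12, 30: 19, 50: 31, 100: 62, 200: 124}

def glassdoor_radius_code(radius):
    buckets = sorted(GLASSDOOR_RADIUS)           # [0, 10, 20, 30, 50, 100, 200]
    snapped = max(b for b in buckets if b <= max(radius, 0))
    return GLASSDOOR_RADIUS[snapped]

print(glassdoor_radius_code(75))  # 31, since 75 snaps down to the 50 bucket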

* Cleanup and networking code adjustments
- Removed unused requests imports
- Changed URL strings from http to https
- Set the provider's header dictionary as the default headers on the provider's session object. Setting headers on an individual post/get call is now only needed to temporarily override the session headers for that request.
- Adjusted the search_page_for_job_soups method of GlassDoorStatic so that it uses GET instead of POST. Sending payload data when we already have the search page URL is unnecessary and can trigger bot detection measures more often. A sketch of both changes follows this list.
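
A sketch of the two networking changes (URLs and header values here are placeholders, not the real provider endpoints):

import requests

session = requests.Session()
# Set the provider headers once as session defaults...
session.headers.update({
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
})

# ...so a plain GET of a known search page URL needs no payload or
# per-call headers, which keeps bot detection quieter.
search_page = session.get('https://www.example.com/jobs?q=python')

# Per-request headers are now only for temporarily overriding the
# session defaults on an individual request.
lookup = session.post('https://www.example.com/locationAjax',
                      headers={'Accept': 'application/json'},
                      data={'term': 'waterloo'})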

* Updated Indeed test URL
- Updated the test URL to expect https instead of http

* Fixes to asynchronous parsing code
- Previously, futures were deleted whether or not they had finished parsing.
- Added code to delete the HTML page after it is parsed.
- Added code to log any errors during blurb retrieval and parsing. A sketch of the corrected loop follows this list.
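
A hedged sketch of the corrected loop, not the project's exact method (fetch_blurb and parse_fn stand in for the real scraper callables):

from concurrent.futures import ThreadPoolExecutor, as_completed
from logging import error as log_error

def parse_blurbs(jobs, fetch_blurb, parse_fn):
    with ThreadPoolExecutor(max_workers=8) as threads:
        results = {threads.submit(fetch_blurb, job): job for job in jobs}
        for future in as_completed(list(results)):
            try:
                job, html = future.result()
                parse_fn(job, html)
                del results[future]  # drop the future only once parsing succeeded
                del html             # free the fetched page as well
            except Exception as e:
                log_error(f'Blurb Future Error: {e}')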

* Version bump
bunsenmurder authored Jul 12, 2020
1 parent 0a246cb commit aac4008
Showing 8 changed files with 52 additions and 37 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.8'
+__version__ = '2.1.9'
14 changes: 9 additions & 5 deletions jobfunnel/glassdoor_base.py
@@ -32,7 +32,6 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
-        self.query = '-'.join(self.search_terms['keywords'])
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -50,7 +49,7 @@ def convert_radius(self, radius):
                 radius = 25
             elif 50 <= radius < 100:
                 radius = 50
-            elif 100 <= radius:
+            elif radius >= 100:
                 radius = 100
             return radius

@@ -70,10 +69,15 @@ def convert_radius(self, radius):
             elif radius >= 200:
                 radius = 200
 
-        glassdoor_radius = {0: 0, 10: 6, 20: 12,
-                            30: 19, 50: 31, 100: 62, 200: 124}
+        glassdoor_radius = {0: 0,
+                            10: 6,
+                            20: 12,
+                            30: 19,
+                            50: 31,
+                            100: 62,
+                            200: 124}
 
-            return glassdoor_radius[radius]
+        return glassdoor_radius[radius]
 
     def parse_blurb(self, job, html):
         """parses and stores job description into dict entry"""
2 changes: 2 additions & 0 deletions jobfunnel/glassdoor_dynamic.py
@@ -22,6 +22,8 @@ def __init__(self, args):
         super().__init__(args)
         self.provider = 'glassdoordynamic'
 
+        # Keeping old query function so this class does not break.
+        self.query = '-'.join(self.search_terms['keywords'])
         # initialize the webdriver
         self.driver = get_webdriver()
 
23 changes: 12 additions & 11 deletions jobfunnel/glassdoor_static.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import post
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,6 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with a space
+        self.query = ' '.join(self.search_terms['keywords'])
 
     def get_search_url(self, method='get'):
         """gets the glassdoor search url"""
@@ -40,8 +43,9 @@ def get_search_url(self, method='get'):
         # form the location lookup url
         location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
 
-        # get the location id for search location
+        # get location id for search location
         location_response = self.s.post(
+            # set location headers to override default session headers
             location_url, headers=self.location_headers, data=data
         ).json()

@@ -69,23 +73,22 @@ def get_search_url(self, method='get'):
         else:
             raise ValueError(f'No html method {method} exists')
 
-    def search_page_for_job_soups(self, data, page, url, job_soup_list):
+    def search_page_for_job_soups(self, page, url, job_soup_list):
         """function that scrapes the glassdoor page for a list of job soups"""
         log_info(f'getting glassdoor page {page} : {url}')
 
         job = BeautifulSoup(
-            self.s.post(url, headers=self.headers,
-                        data=data).text, self.bs4_parser
+            self.s.get(url).text, self.bs4_parser
         ).find_all('li', attrs={'class', 'jl'})
         job_soup_list.extend(job)
 
     def search_joblink_for_blurb(self, job):
         """function that scrapes the glassdoor job link for the blurb"""
         search = job['link']
         log_info(f'getting glassdoor search: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.post(
-                search, headers=self.location_headers).text, self.bs4_parser
+            self.s.get(search).text, self.bs4_parser
         )
 
         try:
@@ -105,7 +108,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')
 
-        res = self.s.post(search, headers=self.location_headers).text
+        res = self.s.get(search).text
         return job, res
 
     def scrape(self):
@@ -116,7 +119,7 @@ def scrape(self):
         search, data = self.get_search_url(method='post')
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.post(search, headers=self.headers, data=data)
+        request_html = self.s.post(search, data=data)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -143,7 +146,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     request_html.url,
                     job_soup_list,
@@ -167,7 +169,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     page_url,
                     job_soup_list,
20 changes: 11 additions & 9 deletions jobfunnel/indeed.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import get
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '+'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '+' and encodes spaces as '+'
+        self.query = '+'.join(self.search_terms['keywords']).replace(' ', '+')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -47,19 +49,19 @@ def convert_radius(self, radius):
             radius = 25
         elif 50 <= radius < 100:
             radius = 50
-        elif 100 <= radius:
+        elif radius >= 100:
             radius = 100
         return radius
 
     def get_search_url(self, method='get'):
         """gets the indeed search url"""
        if method == 'get':
             # form job search url
-            search = ('http://www.indeed.{0}/jobs?'
+            search = ('https://www.indeed.{0}/jobs?'
                       'q={1}&l={2}%2C+{3}&radius={4}&limit={5}&filter={6}'.format(
                           self.search_terms['region']['domain'],
                           self.query,
-                          self.search_terms['region']['city'],
+                          self.search_terms['region']['city'].replace(' ', '+'),
                           self.search_terms['region']['province'],
                           self.convert_radius(
                               self.search_terms['region']['radius']),
@@ -79,7 +81,7 @@ def search_page_for_job_soups(self, search, page, job_soup_list):
         log_info(f'getting indeed page {page} : {url}')
 
         jobs = BeautifulSoup(
-            self.s.get(url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(url).text, self.bs4_parser). \
             find_all('div', attrs={'data-tn-component': 'organicJob'})
 
         job_soup_list.extend(jobs)
@@ -90,7 +92,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting indeed page: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search, headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -107,7 +109,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting indeed search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -247,7 +249,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
7 changes: 5 additions & 2 deletions jobfunnel/jobfunnel.py
@@ -308,9 +308,12 @@ def delay_threader(self,
             try:
                 job, html = future.result()
                 parse_fn(job, html)
-            except Exception:
+                del results[future]
+                del html
+            except Exception as e:
+                self.logger.error(f'Blurb Future Error: {e}')
                 pass
-            del results[future]
+
 
         threads.shutdown()  # clean up threads when done
         # end and print recorded time
19 changes: 11 additions & 8 deletions jobfunnel/monster.py
@@ -29,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '-'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '-' and encodes spaces as '-'
+        self.query = '-'.join(self.search_terms['keywords']).replace(' ', '-')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -57,7 +60,7 @@ def convert_radius(self, radius):
                 radius = 100
             elif 150 <= radius < 200:
                 radius = 150
-            elif 200 <= radius:
+            elif radius >= 200:
                 radius = 200
         else:
             if radius < 5:
@@ -70,7 +73,7 @@ def convert_radius(self, radius):
                 radius = 20
             elif 50 <= radius < 100:
                 radius = 50
-            elif 100 <= radius:
+            elif radius >= 100:
                 radius = 100
 
         return radius
@@ -83,7 +86,7 @@ def get_search_url(self, method='get'):
                 'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
                     self.search_terms['region']['domain'],
                     self.query,
-                    self.search_terms['region']['city'],
+                    self.search_terms['region']['city'].replace(' ', "-"),
                     self.search_terms['region']['province'],
                     'skr_navigation_nhpso_searchMain',
                     self.convert_radius(self.search_terms['region']['radius'])))
@@ -101,7 +104,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting monster search: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search, headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -120,7 +123,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting monster search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -143,7 +146,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -160,7 +163,7 @@ def scrape(self):
         log_info(f'getting monster pages 1 to {pages} : {page_url}')
 
         jobs = BeautifulSoup(
-            self.s.get(page_url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(page_url).text, self.bs4_parser). \
             find_all('div', attrs={'class': 'flex-row'})
 
         job_soup_list = []
2 changes: 1 addition & 1 deletion tests/test_indeed.py
@@ -33,7 +33,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
         if(provider.search_terms['region']['domain'] == 'ca'):
-            assert'http://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
+            assert'https://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
         with pytest.raises(ValueError) as e:
             provider.get_search_url('panda')
         assert str(e.value) == 'No html method panda exists'
