diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index 7042a396..d3a156bb 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.8'
+__version__ = '2.1.9'
diff --git a/jobfunnel/glassdoor_base.py b/jobfunnel/glassdoor_base.py
index 8ba5ac54..6eb0bda4 100644
--- a/jobfunnel/glassdoor_base.py
+++ b/jobfunnel/glassdoor_base.py
@@ -32,7 +32,6 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
-        self.query = '-'.join(self.search_terms['keywords'])
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -50,7 +49,7 @@ def convert_radius(self, radius):
             radius = 25
         elif 50 <= radius < 100:
             radius = 50
-        elif 100 <= radius:
+        elif radius >= 100:
             radius = 100
 
         return radius
@@ -70,10 +69,15 @@ def convert_radius(self, radius):
         elif radius >= 200:
             radius = 200
 
-        glassdoor_radius = {0: 0, 10: 6, 20: 12,
-                            30: 19, 50: 31, 100: 62, 200: 124}
+        glassdoor_radius = {0: 0,
+                            10: 6,
+                            20: 12,
+                            30: 19,
+                            50: 31,
+                            100: 62,
+                            200: 124}
 
-            return glassdoor_radius[radius]
+        return glassdoor_radius[radius]
 
     def parse_blurb(self, job, html):
         """parses and stores job description into dict entry"""
diff --git a/jobfunnel/glassdoor_dynamic.py b/jobfunnel/glassdoor_dynamic.py
index 81d1318c..ffb753c1 100644
--- a/jobfunnel/glassdoor_dynamic.py
+++ b/jobfunnel/glassdoor_dynamic.py
@@ -22,6 +22,8 @@ def __init__(self, args):
         super().__init__(args)
         self.provider = 'glassdoordynamic'
 
+        # Keeping the old query format so this class does not break.
+        self.query = '-'.join(self.search_terms['keywords'])
         # initialize the webdriver
         self.driver = get_webdriver()
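Note on the `convert_radius` cleanup above: Glassdoor only accepts a fixed set of radii, so the user's input is first snapped to the nearest accepted bucket and then mapped through `glassdoor_radius`, whose values look like kilometre-to-mile codes (50 km ≈ 31 mi). Below is a minimal standalone sketch of that quantize-then-map pattern; the function name is illustrative, and the bucket boundaries under 200 km are assumptions, since the diff does not show that part of the method.

```python
# Mapping taken from the diff; values appear to be km -> mile codes.
GLASSDOOR_RADIUS = {0: 0, 10: 6, 20: 12, 30: 19, 50: 31, 100: 62, 200: 124}


def quantize_radius_km(radius):
    """Snap a user radius in km to the nearest accepted bucket,
    then return Glassdoor's internal radius code."""
    # Bucket boundaries below 200 km are assumed for illustration.
    if radius < 10:
        radius = 0
    elif 10 <= radius < 20:
        radius = 10
    elif 20 <= radius < 30:
        radius = 20
    elif 30 <= radius < 50:
        radius = 30
    elif 50 <= radius < 100:
        radius = 50
    elif 100 <= radius < 200:
        radius = 100
    elif radius >= 200:
        radius = 200
    return GLASSDOOR_RADIUS[radius]


print(quantize_radius_km(75))  # 75 km falls in the 50 km bucket -> 31
```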
diff --git a/jobfunnel/glassdoor_static.py b/jobfunnel/glassdoor_static.py
index 9eaebfd7..8c2a5d8f 100644
--- a/jobfunnel/glassdoor_static.py
+++ b/jobfunnel/glassdoor_static.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import post
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,6 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with a space
+        self.query = ' '.join(self.search_terms['keywords'])
 
     def get_search_url(self, method='get'):
         """gets the glassdoor search url"""
@@ -40,8 +43,9 @@ def get_search_url(self, method='get'):
         # form the location lookup url
         location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
 
-        # get the location id for search location
+        # get location id for search location
         location_response = self.s.post(
+            # set location headers to override default session headers
            location_url, headers=self.location_headers, data=data
         ).json()
 
@@ -69,13 +73,12 @@ def get_search_url(self, method='get'):
         else:
             raise ValueError(f'No html method {method} exists')
 
-    def search_page_for_job_soups(self, data, page, url, job_soup_list):
+    def search_page_for_job_soups(self, page, url, job_soup_list):
         """function that scrapes the glassdoor page for a list of job soups"""
         log_info(f'getting glassdoor page {page} : {url}')
 
         job = BeautifulSoup(
-            self.s.post(url, headers=self.headers,
-                        data=data).text, self.bs4_parser
+            self.s.get(url).text, self.bs4_parser
         ).find_all('li', attrs={'class', 'jl'})
         job_soup_list.extend(job)
 
@@ -83,9 +86,9 @@ def search_joblink_for_blurb(self, job):
         """function that scrapes the glassdoor job link for the blurb"""
         search = job['link']
         log_info(f'getting glassdoor search: {search}')
+
         job_link_soup = BeautifulSoup(
-            self.s.post(
-                search, headers=self.location_headers).text, self.bs4_parser
+            self.s.get(search).text, self.bs4_parser
         )
 
         try:
@@ -105,7 +108,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')
 
-        res = self.s.post(search, headers=self.location_headers).text
+        res = self.s.get(search).text
         return job, res
 
     def scrape(self):
@@ -116,7 +119,7 @@ def scrape(self):
         search, data = self.get_search_url(method='post')
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.post(search, headers=self.headers, data=data)
+        request_html = self.s.post(search, data=data)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -143,7 +146,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     request_html.url,
                     job_soup_list,
@@ -167,7 +169,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     page_url,
                     job_soup_list,
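The glassdoor_static changes above all follow from one refactor: headers are registered once on the `requests.Session` via `self.s.headers.update(self.headers)`, so the per-call `headers=` arguments can be dropped, while the location lookup still passes `headers=self.location_headers` to override the defaults for that single request. A minimal sketch of that session-level pattern, with illustrative URLs and header values (not part of the diff):

```python
import requests

session = requests.Session()
# Register defaults once; every request on this session sends them.
session.headers.update({
    'User-Agent': 'Mozilla/5.0',
    'Accept-Language': 'en-US,en;q=0.5',
})

# Inherits the session defaults -- no headers= argument needed.
resp = session.get('https://httpbin.org/headers')

# A per-request headers= dict is merged on top of the defaults,
# which is how the location lookup overrides them for one call.
resp = session.post('https://httpbin.org/post',
                    headers={'Accept': 'application/json'})
```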
diff --git a/jobfunnel/indeed.py b/jobfunnel/indeed.py
index 25cc9d9d..53fab9a1 100644
--- a/jobfunnel/indeed.py
+++ b/jobfunnel/indeed.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import get
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '+'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '+' and encodes spaces as '+'
+        self.query = '+'.join(self.search_terms['keywords']).replace(' ', '+')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -47,7 +49,7 @@ def convert_radius(self, radius):
             radius = 25
         elif 50 <= radius < 100:
             radius = 50
-        elif 100 <= radius:
+        elif radius >= 100:
             radius = 100
 
         return radius
@@ -55,11 +57,11 @@ def get_search_url(self, method='get'):
         """gets the indeed search url"""
         if method == 'get':
             # form job search url
-            search = ('http://www.indeed.{0}/jobs?'
+            search = ('https://www.indeed.{0}/jobs?'
                       'q={1}&l={2}%2C+{3}&radius={4}&limit={5}&filter={6}'.format(
                           self.search_terms['region']['domain'],
                           self.query,
-                          self.search_terms['region']['city'],
+                          self.search_terms['region']['city'].replace(' ', '+'),
                           self.search_terms['region']['province'],
                           self.convert_radius(
                               self.search_terms['region']['radius']),
@@ -79,7 +81,7 @@ def search_page_for_job_soups(self, search, page, job_soup_list):
         log_info(f'getting indeed page {page} : {url}')
 
         jobs = BeautifulSoup(
-            self.s.get(url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(url).text, self.bs4_parser). \
             find_all('div', attrs={'data-tn-component': 'organicJob'})
         job_soup_list.extend(jobs)
 
@@ -90,7 +92,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting indeed page: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search, headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -107,7 +109,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting indeed search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -247,7 +249,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py
index bb4adc64..e6d90f9c 100755
--- a/jobfunnel/jobfunnel.py
+++ b/jobfunnel/jobfunnel.py
@@ -308,9 +308,12 @@ def delay_threader(self,
             try:
                 job, html = future.result()
                 parse_fn(job, html)
-            except Exception:
+                del results[future]
+                del html
+            except Exception as e:
+                self.logger.error(f'Blurb Future Error: {e}')
                 pass
-            del results[future]
+
 
         threads.shutdown()  # clean up threads when done
         # end and print recorded time
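The `delay_threader` change above does two things: it frees each future's parsed HTML as soon as it has been consumed, so large pages can be garbage-collected mid-run, and it logs blurb failures instead of silently swallowing them. A small self-contained sketch of that pattern; `fetch` and `parse` are hypothetical stand-ins for the scraper's real callables:

```python
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch(url):
    # Hypothetical worker: returns the url and its (placeholder) payload.
    return url, '<html>...</html>'


def parse(url, html):
    logging.info('parsed %d bytes from %s', len(html), url)


urls = [f'https://example.com/job/{i}' for i in range(5)]
threads = ThreadPoolExecutor(max_workers=3)
results = {threads.submit(fetch, u): u for u in urls}

# Snapshot the futures first so deleting map entries mid-loop is safe.
for future in as_completed(list(results)):
    try:
        url, html = future.result()
        parse(url, html)
        del results[future]  # drop the future -> result reference
        del html             # drop the payload itself
    except Exception as e:
        logging.error(f'Blurb Future Error: {e}')

threads.shutdown()  # clean up threads when done
```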
diff --git a/jobfunnel/monster.py b/jobfunnel/monster.py
index 603c8635..76dd984e 100644
--- a/jobfunnel/monster.py
+++ b/jobfunnel/monster.py
@@ -29,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '-'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '-' and encodes spaces as '-'
+        self.query = '-'.join(self.search_terms['keywords']).replace(' ', '-')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -57,7 +60,7 @@ def convert_radius(self, radius):
             radius = 100
         elif 150 <= radius < 200:
             radius = 150
-        elif 200 <= radius:
+        elif radius >= 200:
             radius = 200
         else:
             if radius < 5:
@@ -70,7 +73,7 @@ def convert_radius(self, radius):
             radius = 20
         elif 50 <= radius < 100:
             radius = 50
-        elif 100 <= radius:
+        elif radius >= 100:
             radius = 100
 
         return radius
@@ -83,7 +86,7 @@ def get_search_url(self, method='get'):
             'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
                 self.search_terms['region']['domain'],
                 self.query,
-                self.search_terms['region']['city'],
+                self.search_terms['region']['city'].replace(' ', '-'),
                 self.search_terms['region']['province'],
                 'skr_navigation_nhpso_searchMain',
                 self.convert_radius(self.search_terms['region']['radius'])))
@@ -101,7 +104,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting monster search: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search,
-                       headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -120,7 +123,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting monster search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -143,7 +146,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -160,7 +163,7 @@ def scrape(self):
         log_info(f'getting monster pages 1 to {pages} : {page_url}')
 
         jobs = BeautifulSoup(
-            self.s.get(page_url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(page_url).text, self.bs4_parser). \
             find_all('div', attrs={'class': 'flex-row'})
 
         job_soup_list = []
diff --git a/tests/test_indeed.py b/tests/test_indeed.py
index 773a3146..b42dac8b 100644
--- a/tests/test_indeed.py
+++ b/tests/test_indeed.py
@@ -33,7 +33,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
         if(provider.search_terms['region']['domain'] == 'ca'):
-            assert'http://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
+            assert 'https://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
         with pytest.raises(ValueError) as e:
             provider.get_search_url('panda')
         assert str(e.value) == 'No html method panda exists'
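A common thread in the indeed and monster changes is query encoding: each provider joins its keywords with a provider-specific separator and re-encodes embedded spaces, so a multi-word keyword or city name stays a single token in the final URL. A quick illustration of the expressions from the diff, using a made-up keyword list:

```python
keywords = ['Python', 'data science']

indeed_query = '+'.join(keywords).replace(' ', '+')    # 'Python+data+science'
monster_query = '-'.join(keywords).replace(' ', '-')   # 'Python-data-science'
glassdoor_query = ' '.join(keywords)                   # 'Python data science'

assert indeed_query == 'Python+data+science'
assert monster_query == 'Python-data-science'

# The same idea applies to multi-word city names in the location field:
city = 'New York'
assert city.replace(' ', '+') == 'New+York'
```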