Bug fixes, adjustments, and cleanup. (#84)
* Fixes search issues caused by bugs in search query encoding
- For Indeed and Monster, the query string was not properly encoded when a quoted phrase containing spaces was provided. The fix was to encode all spaces with the provider's separator character ('+' for Indeed, '-' for Monster); a sketch of the scheme follows this list. The same issue and fix also applied to city names.
- For GlassDoorStatic, the query string was URL-encoded and returned improper results. Since this class searches using a JSON payload, the solution was to join the keywords with spaces instead.
- The old query construction was moved from GlassDoorBase to GlassDoorDynamic so that the dynamic scraper class does not break.
- Fixes issue #80.
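
A minimal sketch of the encoding scheme described above (the helper names are illustrative, not part of the project's API):

def indeed_query(keywords):
    # Indeed separates terms with '+', so spaces inside quoted phrases
    # must also become '+': ['"data scientist"', 'python'] -> '"data+scientist"+python'
    return '+'.join(keywords).replace(' ', '+')

def monster_query(keywords):
    # Monster uses '-' as its separator instead
    return '-'.join(keywords).replace(' ', '-')

def glassdoor_static_query(keywords):
    # GlassDoorStatic submits keywords in a JSON payload, so no URL
    # encoding is needed; a plain space-joined string is correct
    return ' '.join(keywords)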

* Radius function cleanup (see the radius sketch below)
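
The pattern is to snap a free-form radius to a supported bucket and, for Glassdoor, map the bucket to the site's own radius code via the lookup table in the diff below. A sketch under that assumption (glassdoor_radius_code is a made-up name, and the real snapping thresholds differ slightly per provider):

GLASSDOOR_RADIUS = {0: 0, 10: 6, 20: 12, 30: 19, 50: 31, 100: 62, 200: 124}

def glassdoor_radius_code(radius):
    buckets = sorted(GLASSDOOR_RADIUS)           # [0, 10, 20, 30, 50, 100, 200]
    snapped = max(b for b in buckets if b <= max(radius, 0))
    return GLASSDOOR_RADIUS[snapped]

print(glassdoor_radius_code(75))  # 31, since 75 snaps down to the 50 bucket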

* Cleanup and networking code adjustments
- Removed unused requests imports
- Changed URL strings from http to https
- Set the provider's header dictionary as the default headers on the provider's session object. Setting headers on an individual post/get call is now only needed to temporarily override the session headers for that request.
- Adjusted the search_page_for_job_soups method of GlassDoorStatic so that it uses GET instead of POST. Sending payload data when we already have the search page URL is unnecessary and can trigger bot detection measures more often. A sketch of both changes follows this list.
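
A sketch of the two networking changes (URLs and header values here are placeholders, not the real provider endpoints):

import requests

session = requests.Session()
# Set the provider headers once as session defaults...
session.headers.update({
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
})

# ...so a plain GET of a known search page URL needs no payload or
# per-call headers, which keeps bot detection quieter.
search_page = session.get('https://www.example.com/jobs?q=python')

# Per-request headers are now only for temporarily overriding the
# session defaults on an individual request.
lookup = session.post('https://www.example.com/locationAjax',
                      headers={'Accept': 'application/json'},
                      data={'term': 'waterloo'})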

* Updated Indeed test URL
- Updated the test URL to expect https instead of http

* Fixes to asynchronous parsing code
- Previously, futures were deleted whether or not they had finished parsing.
- Added code to delete the HTML page after it is parsed.
- Added code to log any errors during blurb retrieval and parsing. A sketch of the corrected loop follows this list.
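
A hedged sketch of the corrected loop, not the project's exact method (fetch_blurb and parse_fn stand in for the real scraper callables):

from concurrent.futures import ThreadPoolExecutor, as_completed
from logging import error as log_error

def parse_blurbs(jobs, fetch_blurb, parse_fn):
    with ThreadPoolExecutor(max_workers=8) as threads:
        results = {threads.submit(fetch_blurb, job): job for job in jobs}
        for future in as_completed(list(results)):
            try:
                job, html = future.result()
                parse_fn(job, html)
                del results[future]  # drop the future only once parsing succeeded
                del html             # free the fetched page as well
            except Exception as e:
                log_error(f'Blurb Future Error: {e}')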

* Version bump
bunsenmurder authored Jul 12, 2020
1 parent 0a246cb commit aac4008
Showing 8 changed files with 52 additions and 37 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.8'
+__version__ = '2.1.9'
14 changes: 9 additions & 5 deletions jobfunnel/glassdoor_base.py
@@ -32,7 +32,6 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
-        self.query = '-'.join(self.search_terms['keywords'])
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -50,7 +49,7 @@ def convert_radius(self, radius):
                 radius = 25
             elif 50 <= radius < 100:
                 radius = 50
-            elif 100 <= radius:
+            elif radius >= 100:
                 radius = 100
             return radius

@@ -70,10 +69,15 @@ def convert_radius(self, radius):
             elif radius >= 200:
                 radius = 200
 
-        glassdoor_radius = {0: 0, 10: 6, 20: 12,
-                            30: 19, 50: 31, 100: 62, 200: 124}
+        glassdoor_radius = {0: 0,
+                            10: 6,
+                            20: 12,
+                            30: 19,
+                            50: 31,
+                            100: 62,
+                            200: 124}
 
-            return glassdoor_radius[radius]
+        return glassdoor_radius[radius]
 
     def parse_blurb(self, job, html):
         """parses and stores job description into dict entry"""
2 changes: 2 additions & 0 deletions jobfunnel/glassdoor_dynamic.py
@@ -22,6 +22,8 @@ def __init__(self, args):
         super().__init__(args)
         self.provider = 'glassdoordynamic'
 
+        # Keeping old query function so this class does not break.
+        self.query = '-'.join(self.search_terms['keywords'])
         # initialize the webdriver
         self.driver = get_webdriver()
 
23 changes: 12 additions & 11 deletions jobfunnel/glassdoor_static.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import post
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,6 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive',
         }
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with a space
+        self.query = ' '.join(self.search_terms['keywords'])
 
     def get_search_url(self, method='get'):
         """gets the glassdoor search url"""
@@ -40,8 +43,9 @@ def get_search_url(self, method='get'):
         # form the location lookup url
         location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'
 
-        # get the location id for search location
+        # get location id for search location
         location_response = self.s.post(
+            # set location headers to override default session headers
             location_url, headers=self.location_headers, data=data
         ).json()

@@ -69,23 +73,22 @@ def get_search_url(self, method='get'):
         else:
             raise ValueError(f'No html method {method} exists')
 
-    def search_page_for_job_soups(self, data, page, url, job_soup_list):
+    def search_page_for_job_soups(self, page, url, job_soup_list):
         """function that scrapes the glassdoor page for a list of job soups"""
         log_info(f'getting glassdoor page {page} : {url}')
 
         job = BeautifulSoup(
-            self.s.post(url, headers=self.headers,
-                        data=data).text, self.bs4_parser
+            self.s.get(url).text, self.bs4_parser
         ).find_all('li', attrs={'class', 'jl'})
         job_soup_list.extend(job)
 
     def search_joblink_for_blurb(self, job):
         """function that scrapes the glassdoor job link for the blurb"""
         search = job['link']
         log_info(f'getting glassdoor search: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.post(
-                search, headers=self.location_headers).text, self.bs4_parser
+            self.s.get(search).text, self.bs4_parser
         )
 
         try:
@@ -105,7 +108,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')
 
-        res = self.s.post(search, headers=self.location_headers).text
+        res = self.s.get(search).text
         return job, res
 
     def scrape(self):
@@ -116,7 +119,7 @@ def scrape(self):
         search, data = self.get_search_url(method='post')
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.post(search, headers=self.headers, data=data)
+        request_html = self.s.post(search, data=data)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -143,7 +146,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     request_html.url,
                     job_soup_list,
@@ -167,7 +169,6 @@ def scrape(self):
             fts.append(  # append thread job future to futures list
                 threads.submit(
                     self.search_page_for_job_soups,
-                    data,
                     page,
                     page_url,
                     job_soup_list,
20 changes: 11 additions & 9 deletions jobfunnel/indeed.py
@@ -4,7 +4,6 @@
 from concurrent.futures import ThreadPoolExecutor, wait
 from logging import info as log_info
 from math import ceil
-from requests import get
 from time import sleep, time
 
 from .jobfunnel import JobFunnel, MASTERLIST_HEADER
@@ -30,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '+'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '+' and encodes spaces as '+'
+        self.query = '+'.join(self.search_terms['keywords']).replace(' ', '+')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -47,19 +49,19 @@ def convert_radius(self, radius):
             radius = 25
         elif 50 <= radius < 100:
             radius = 50
-        elif 100 <= radius:
+        elif radius >= 100:
             radius = 100
         return radius
 
     def get_search_url(self, method='get'):
         """gets the indeed search url"""
        if method == 'get':
             # form job search url
-            search = ('http://www.indeed.{0}/jobs?'
+            search = ('https://www.indeed.{0}/jobs?'
                       'q={1}&l={2}%2C+{3}&radius={4}&limit={5}&filter={6}'.format(
                           self.search_terms['region']['domain'],
                           self.query,
-                          self.search_terms['region']['city'],
+                          self.search_terms['region']['city'].replace(' ', '+'),
                           self.search_terms['region']['province'],
                           self.convert_radius(
                               self.search_terms['region']['radius']),
@@ -79,7 +81,7 @@ def search_page_for_job_soups(self, search, page, job_soup_list):
         log_info(f'getting indeed page {page} : {url}')
 
         jobs = BeautifulSoup(
-            self.s.get(url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(url).text, self.bs4_parser). \
             find_all('div', attrs={'data-tn-component': 'organicJob'})
 
         job_soup_list.extend(jobs)
@@ -90,7 +92,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting indeed page: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search, headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -107,7 +109,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting indeed search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -247,7 +249,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
7 changes: 5 additions & 2 deletions jobfunnel/jobfunnel.py
@@ -308,9 +308,12 @@ def delay_threader(self,
             try:
                 job, html = future.result()
                 parse_fn(job, html)
-            except Exception:
+                del results[future]
+                del html
+            except Exception as e:
+                self.logger.error(f'Blurb Future Error: {e}')
                 pass
-            del results[future]
+
 
         threads.shutdown()  # clean up threads when done
         # end and print recorded time
19 changes: 11 additions & 8 deletions jobfunnel/monster.py
@@ -29,7 +29,10 @@ def __init__(self, args):
             'Cache-Control': 'no-cache',
             'Connection': 'keep-alive'
         }
-        self.query = '-'.join(self.search_terms['keywords'])
+        # Sets headers as default on Session object
+        self.s.headers.update(self.headers)
+        # Concatenates keywords with '-' and encodes spaces as '-'
+        self.query = '-'.join(self.search_terms['keywords']).replace(' ', '-')
 
     def convert_radius(self, radius):
         """function that quantizes the user input radius to a valid radius
@@ -57,7 +60,7 @@ def convert_radius(self, radius):
                 radius = 100
             elif 150 <= radius < 200:
                 radius = 150
-            elif 200 <= radius:
+            elif radius >= 200:
                 radius = 200
         else:
             if radius < 5:
@@ -70,7 +73,7 @@ def convert_radius(self, radius):
                 radius = 20
             elif 50 <= radius < 100:
                 radius = 50
-            elif 100 <= radius:
+            elif radius >= 100:
                 radius = 100
 
         return radius
@@ -83,7 +86,7 @@ def get_search_url(self, method='get'):
                 'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
                     self.search_terms['region']['domain'],
                     self.query,
-                    self.search_terms['region']['city'],
+                    self.search_terms['region']['city'].replace(' ', "-"),
                     self.search_terms['region']['province'],
                     'skr_navigation_nhpso_searchMain',
                     self.convert_radius(self.search_terms['region']['radius'])))
@@ -101,7 +104,7 @@ def search_joblink_for_blurb(self, job):
         log_info(f'getting monster search: {search}')
 
         job_link_soup = BeautifulSoup(
-            self.s.get(search, headers=self.headers).text, self.bs4_parser)
+            self.s.get(search).text, self.bs4_parser)
 
         try:
             job['blurb'] = job_link_soup.find(
@@ -120,7 +123,7 @@ def get_blurb_with_delay(self, job, delay):
         search = job['link']
         log_info(f'delay of {delay:.2f}s, getting monster search: {search}')
 
-        res = self.s.get(search, headers=self.headers).text
+        res = self.s.get(search).text
         return job, res
 
     def parse_blurb(self, job, html):
@@ -143,7 +146,7 @@ def scrape(self):
         search = self.get_search_url()
 
         # get the html data, initialize bs4 with lxml
-        request_html = self.s.get(search, headers=self.headers)
+        request_html = self.s.get(search)
 
         # create the soup base
         soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
@@ -160,7 +163,7 @@ def scrape(self):
         log_info(f'getting monster pages 1 to {pages} : {page_url}')
 
         jobs = BeautifulSoup(
-            self.s.get(page_url, headers=self.headers).text, self.bs4_parser). \
+            self.s.get(page_url).text, self.bs4_parser). \
             find_all('div', attrs={'class': 'flex-row'})
 
         job_soup_list = []
2 changes: 1 addition & 1 deletion tests/test_indeed.py
@@ -33,7 +33,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
         provider = init_scraper('indeed')
         provider.search_terms = search_terms_config
         if(provider.search_terms['region']['domain'] == 'ca'):
-            assert'http://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
+            assert'https://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
         with pytest.raises(ValueError) as e:
             provider.get_search_url('panda')
         assert str(e.value) == 'No html method panda exists'
