Commit
web module reorganized into main package
thePortus committed Mar 10, 2018
1 parent b9150f1 commit 97e8969
Showing 6 changed files with 32 additions and 13 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [0.0.4]

* Changes
  * Web module reorganized into main package

## [0.0.3]

* Changes
2 changes: 1 addition & 1 deletion dhelp/__init__.py
@@ -14,7 +14,7 @@
from .files.csv_file import CSVFile
from .files.text_file import TextFile
from .files.text_folder import TextFolder
-from .web.web_page import WebPage
+from .web_page import WebPage
from .text.english import EnglishText
from .text.latin import LatinText
from .text.ancient_greek import AncientGreekText
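
Because WebPage is still re-exported at the package root, callers that import it from dhelp directly are unaffected by the move; only imports that reached into the old dhelp.web subpackage need updating. A minimal before/after sketch of the import change:

# before this commit, the module lived in a web subpackage:
# from dhelp.web.web_page import WebPage

# after this commit, it sits at the package root:
from dhelp.web_page import WebPage

# the root-level re-export is unchanged, so this works before and after:
from dhelp import WebPage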
@@ -17,3 +17,10 @@ def test_fetch(self):
    def test_soup(self):
        # ensure object is a BeautifulSoup type object
        return self.assertTrue(type(self.page.soup()) == BeautifulSoup)

    def test_context_manager(self):
        # ensure soup works when invoked using with.. as.. context manager
        results = None
        with self.page as page_soup:
            results = page_soup
        return self.assertTrue(type(results) == BeautifulSoup)
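
This new test exercises the __enter__/__exit__ methods added to WebPage in web_page.py below: entering the context fetches the page and returns its parsed soup, and leaving it prints a success message unless the 'silent' option is set. A usage sketch, assuming network access (the URL matches the one used in the docstrings):

from dhelp import WebPage

# __enter__ returns self.soup(), so the bound name is a BeautifulSoup
# object rather than the WebPage itself
with WebPage('https://stackoverflow.com') as page_soup:
    print(page_soup.find('title').get_text())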
1 change: 0 additions & 1 deletion dhelp/web/__init__.py

This file was deleted.

Empty file removed dhelp/web/tests/__init__.py
30 changes: 19 additions & 11 deletions dhelp/web/web_page.py → dhelp/web_page.py
@@ -29,6 +29,7 @@ class WebPage(UserString):
    ...     'delay': 4,
    ...     'max_retries': 3,
    ...     'silent': True,
    ...     'parser': 'html.parser'
    ... }
>>> web_page = WebPage('https://stackoverflow.com', options=options)
https://stackoverflow.com
@@ -45,10 +46,17 @@ def __init__(self, url, options={}):
            options['max_retries'] = 0
        if 'silent' not in options:
            options['silent'] = False
        if 'parser' not in options:
            options['parser'] = 'html.parser'
        self.data = url
-        self.delay = options['delay']
-        self.max_retries = options['max_retries']
-        self.silent = options['silent']
+        self.options = options

    def __enter__(self):
        return self.soup()

    def __exit__(self, ctx_type, ctx_value, ctx_traceback):
        if not self.options['silent']:
            print('Successfully scraped', self.data)

    def fetch(self, retry_counter=0):
        """Returns http request from URL as a string.
@@ -74,29 +82,29 @@ def fetch(self, retry_counter=0):
        <!DOCTYPE html>\\r\\n<html>\\r\\n\\r\\n    <head>\\r\\n\\r\\n    <title>Stack Overflow...
        """ # noqa
        # print message unless silent option
-        if not self.silent:
+        if not self.options['silent']:
            print('Fetching', self.data)
        # enforce delay to reduce server load
-        time.sleep(self.delay)
+        time.sleep(self.options['delay'])
        # attempt to fetch web page
        try:
            request = requests.get(self.data)
        # if error in getting page, call self recursively to try again
        except Exception:
            print('Problem fetching', self.data)
            # if infinite retries is set, always try again
-            if not self.max_retries:
-                if not self.silent:
+            if not self.options['max_retries']:
+                if not self.options['silent']:
                    print('Retrying...')
                return self.fetch()
            # if below retry limit, return recursively and increment counter
-            elif retry_counter <= self.max_retries:
-                if not self.silent:
+            elif retry_counter <= self.options['max_retries']:
+                if not self.options['silent']:
                    print('Retrying')
                return self.fetch(retry_counter=retry_counter+1)
            # otherwise retry limit has been hit, stop fetching
            else:
-                if not self.silent:
+                if not self.options['silent']:
                    print('Retry limit reached, skipping', self.data)
                return None
        # if everything ok, returning page html instead of the entire request
@@ -123,4 +131,4 @@ def soup(self):
        >>> print(header_logo_text.get_text())
        Stack Overflow
        """ # noqa
-        return BeautifulSoup(self.fetch(), 'html.parser')
+        return BeautifulSoup(self.fetch(), self.options['parser'])
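
Beyond moving the module, this diff consolidates the per-option attributes into a single self.options dict and adds a 'parser' option that is passed straight to BeautifulSoup, defaulting to 'html.parser'. A sketch of overriding it, assuming the alternative parser is installed (lxml is a third-party package and just an example here):

from dhelp import WebPage

# any parser name BeautifulSoup accepts may be supplied via options
options = {'parser': 'lxml', 'silent': True}
web_page = WebPage('https://stackoverflow.com', options=options)
soup = web_page.soup()
print(soup.find('title').get_text())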
