From 97e8969e86786fd667fb545199cac52ecb671e68 Mon Sep 17 00:00:00 2001 From: David Thomas Date: Fri, 9 Mar 2018 21:12:50 -0500 Subject: [PATCH] web module reorganized into main package --- CHANGELOG.md | 5 +++++ dhelp/__init__.py | 2 +- dhelp/{web => }/tests/test_web_page.py | 7 ++++++ dhelp/web/__init__.py | 1 - dhelp/web/tests/__init__.py | 0 dhelp/{web => }/web_page.py | 30 ++++++++++++++++---------- 6 files changed, 32 insertions(+), 13 deletions(-) rename dhelp/{web => }/tests/test_web_page.py (64%) delete mode 100644 dhelp/web/__init__.py delete mode 100644 dhelp/web/tests/__init__.py rename dhelp/{web => }/web_page.py (85%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ee031d..550bc4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [0.0.4] + +* Changes + * Web module reorganized into main package + ## [0.0.3] * Changes diff --git a/dhelp/__init__.py b/dhelp/__init__.py index f7d2e35..aae0ea4 100644 --- a/dhelp/__init__.py +++ b/dhelp/__init__.py @@ -14,7 +14,7 @@ from .files.csv_file import CSVFile from .files.text_file import TextFile from .files.text_folder import TextFolder -from .web.web_page import WebPage +from .web_page import WebPage from .text.english import EnglishText from .text.latin import LatinText from .text.ancient_greek import AncientGreekText diff --git a/dhelp/web/tests/test_web_page.py b/dhelp/tests/test_web_page.py similarity index 64% rename from dhelp/web/tests/test_web_page.py rename to dhelp/tests/test_web_page.py index 973ca31..a0d4b62 100644 --- a/dhelp/web/tests/test_web_page.py +++ b/dhelp/tests/test_web_page.py @@ -17,3 +17,10 @@ def test_fetch(self): def test_soup(self): # ensure object is a BeautifulSoup type object return self.assertTrue(type(self.page.soup()) == 
BeautifulSoup) + + def test_context_manager(self): + # ensure soup works when invoked using with.. as.. context manager + results = None + with self.page as page_soup: + results = page_soup + return self.assertTrue((type(results)) == BeautifulSoup) diff --git a/dhelp/web/__init__.py b/dhelp/web/__init__.py deleted file mode 100644 index 013e4b7..0000000 --- a/dhelp/web/__init__.py +++ /dev/null @@ -1 +0,0 @@ -#!/usr/bin/python diff --git a/dhelp/web/tests/__init__.py b/dhelp/web/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dhelp/web/web_page.py b/dhelp/web_page.py similarity index 85% rename from dhelp/web/web_page.py rename to dhelp/web_page.py index 3ed8d90..366bf9b 100644 --- a/dhelp/web/web_page.py +++ b/dhelp/web_page.py @@ -29,6 +29,7 @@ class WebPage(UserString): ... 'delay': 4, 'max_retries': 3, 'silent': True + 'parser': 'html.parser' ... } >>> web_page = WebPage('https://stackoverflow.com', options=options) https://stackoverflow.com @@ -45,10 +46,17 @@ def __init__(self, url, options={}): options['max_retries'] = 0 if 'silent' not in options: options['silent'] = False + if 'parser' not in options: + options['parser'] = 'html.parser' self.data = url - self.delay = options['delay'] - self.max_retries = options['max_retries'] - self.silent = options['silent'] + self.options = options + + def __enter__(self): + return self.soup() + + def __exit__(self, ctx_type, ctx_value, ctx_traceback): + if not self.options['silent']: + print('Successfully scraped', self.data) def fetch(self, retry_counter=0): """Returns http request from URL as a string. @@ -74,10 +82,10 @@ def fetch(self, retry_counter=0): \\r\\n\\r\\n\r\\n \\r\\n\r\\n Stack Overflow... 
""" # noqa # print message unless silent option - if not self.silent: + if not self.options['silent']: print('Fetching', self.data) # enforce delay to reduce server load - time.sleep(self.delay) + time.sleep(self.options['delay']) # attempt to fetch web page try: request = requests.get(self.data) @@ -85,18 +93,18 @@ def fetch(self, retry_counter=0): except Exception: print('Problem fetching', self.data) # if infinite retries is set, always try again - if not self.max_retries: - if not self.silent: + if not self.options['max_retries']: + if not self.options['silent']: print('Retrying...') return self.fetch() # if below retry limit, return recursively and increment counter - elif retry_counter <= self.max_retries: - if not self.silent: + elif retry_counter <= self.options['max_retries']: + if not self.options['silent']: print('Retrying') return self.fetch(retry_counter=retry_counter+1) # otherwise retry limit has been hit, stop fetching else: - if not self.silent: + if not self.options['silent']: print('Retry limit reached, skipping', self.data) return None # if everything ok, returning page html instead of the entire request @@ -123,4 +131,4 @@ def soup(self): >>> print(header_logo_text.get_text()) Stack Overflow """ # noqa - return BeautifulSoup(self.fetch(), 'html.parser') + return BeautifulSoup(self.fetch(), self.options['parser'])