Commit
web module reorganized into main package
thePortus committed Mar 10, 2018
1 parent b9150f1 commit 97e8969
Showing 6 changed files with 32 additions and 13 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [0.0.4]

* Changes
  * Web module reorganized into main package

## [0.0.3]

* Changes
2 changes: 1 addition & 1 deletion dhelp/__init__.py
@@ -14,7 +14,7 @@
from .files.csv_file import CSVFile
from .files.text_file import TextFile
from .files.text_folder import TextFolder
-from .web.web_page import WebPage
+from .web_page import WebPage
from .text.english import EnglishText
from .text.latin import LatinText
from .text.ancient_greek import AncientGreekText
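
Because WebPage is still re-exported at the package root, callers that import it from dhelp directly are unaffected by the move; only imports that reached into the old dhelp.web subpackage need updating. A minimal before/after sketch of the import change:

# before this commit, the module lived in a web subpackage:
# from dhelp.web.web_page import WebPage

# after this commit, it sits at the package root:
from dhelp.web_page import WebPage

# the root-level re-export is unchanged, so this works before and after:
from dhelp import WebPage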
@@ -17,3 +17,10 @@ def test_fetch(self):
    def test_soup(self):
        # ensure object is a BeautifulSoup type object
        return self.assertTrue(type(self.page.soup()) == BeautifulSoup)

    def test_context_manager(self):
        # ensure soup works when invoked using with.. as.. context manager
        results = None
        with self.page as page_soup:
            results = page_soup
        return self.assertTrue(type(results) == BeautifulSoup)
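
This new test exercises the __enter__/__exit__ methods added to WebPage in web_page.py below: entering the context fetches the page and returns its parsed soup, and leaving it prints a success message unless the 'silent' option is set. A usage sketch, assuming network access (the URL matches the one used in the docstrings):

from dhelp import WebPage

# __enter__ returns self.soup(), so the bound name is a BeautifulSoup
# object rather than the WebPage itself
with WebPage('https://stackoverflow.com') as page_soup:
    print(page_soup.find('title').get_text())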
1 change: 0 additions & 1 deletion dhelp/web/__init__.py

This file was deleted.

Empty file removed dhelp/web/tests/__init__.py
30 changes: 19 additions & 11 deletions dhelp/web/web_page.py → dhelp/web_page.py
@@ -29,6 +29,7 @@ class WebPage(UserString):
    ...     'delay': 4,
    ...     'max_retries': 3,
    ...     'silent': True,
    ...     'parser': 'html.parser'
    ... }
>>> web_page = WebPage('https://stackoverflow.com', options=options)
https://stackoverflow.com
@@ -45,10 +46,17 @@ def __init__(self, url, options={}):
            options['max_retries'] = 0
        if 'silent' not in options:
            options['silent'] = False
        if 'parser' not in options:
            options['parser'] = 'html.parser'
        self.data = url
-        self.delay = options['delay']
-        self.max_retries = options['max_retries']
-        self.silent = options['silent']
+        self.options = options

    def __enter__(self):
        return self.soup()

    def __exit__(self, ctx_type, ctx_value, ctx_traceback):
        if not self.options['silent']:
            print('Successfully scraped', self.data)

    def fetch(self, retry_counter=0):
        """Returns http request from URL as a string.
@@ -74,29 +82,29 @@ def fetch(self, retry_counter=0):
        <!DOCTYPE html>\\r\\n<html>\\r\\n\\r\\n    <head>\\r\\n\\r\\n    <title>Stack Overflow...
        """ # noqa
        # print message unless silent option
-        if not self.silent:
+        if not self.options['silent']:
            print('Fetching', self.data)
        # enforce delay to reduce server load
-        time.sleep(self.delay)
+        time.sleep(self.options['delay'])
        # attempt to fetch web page
        try:
            request = requests.get(self.data)
        # if error in getting page, call self recursively to try again
        except Exception:
            print('Problem fetching', self.data)
            # if infinite retries is set, always try again
-            if not self.max_retries:
-                if not self.silent:
+            if not self.options['max_retries']:
+                if not self.options['silent']:
                    print('Retrying...')
                return self.fetch()
            # if below retry limit, return recursively and increment counter
-            elif retry_counter <= self.max_retries:
-                if not self.silent:
+            elif retry_counter <= self.options['max_retries']:
+                if not self.options['silent']:
                    print('Retrying')
                return self.fetch(retry_counter=retry_counter+1)
            # otherwise retry limit has been hit, stop fetching
            else:
-                if not self.silent:
+                if not self.options['silent']:
                    print('Retry limit reached, skipping', self.data)
                return None
        # if everything ok, returning page html instead of the entire request
@@ -123,4 +131,4 @@ def soup(self):
        >>> print(header_logo_text.get_text())
        Stack Overflow
        """ # noqa
-        return BeautifulSoup(self.fetch(), 'html.parser')
+        return BeautifulSoup(self.fetch(), self.options['parser'])
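
Beyond moving the module, this diff consolidates the per-option attributes into a single self.options dict and adds a 'parser' option that is passed straight to BeautifulSoup, defaulting to 'html.parser'. A sketch of overriding it, assuming the alternative parser is installed (lxml is a third-party package and just an example here):

from dhelp import WebPage

# any parser name BeautifulSoup accepts may be supplied via options
options = {'parser': 'lxml', 'silent': True}
web_page = WebPage('https://stackoverflow.com', options=options)
soup = web_page.soup()
print(soup.find('title').get_text())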
