Merge pull request #216 from jodal/crawler-typing
Add typing to crawler superclasses
jodal authored Mar 28, 2021
2 parents 63c4d0a + 9c5f2ee commit c52c02a
Showing 5 changed files with 124 additions and 101 deletions.
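The bulk of the diff applies a single pattern: classes that set attributes in a hand-written __init__ become typed @dataclass definitions, with field(default_factory=...) replacing mutable-default workarounds. Below is a minimal sketch of the before/after, using a hypothetical stand-in Comic class rather than the real comics.core.models.Comic:

    from __future__ import annotations

    import datetime
    from dataclasses import dataclass, field
    from typing import List


    class Comic:
        """Hypothetical stand-in for comics.core.models.Comic."""

        slug = "example"


    # Before: untyped attributes, invisible to mypy
    class OldRelease:
        def __init__(self, comic, pub_date, has_rerun_releases=False):
            self.comic = comic
            self.pub_date = pub_date
            self.has_rerun_releases = has_rerun_releases
            self._images = []


    # After: the same class as a typed dataclass; default_factory gives each
    # instance its own list, and every attribute now has a checkable type
    @dataclass
    class NewRelease:
        comic: Comic
        pub_date: datetime.date
        has_rerun_releases: bool = False
        _images: List[object] = field(default_factory=list)


    release = NewRelease(Comic(), datetime.date(2021, 3, 28))
    assert release._images == [] and not release.has_rerun_releases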
104 changes: 60 additions & 44 deletions comics/aggregator/crawler.py
@@ -1,9 +1,11 @@
from __future__ import annotations

import datetime
import re
import time
import xml
import xml.sax
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union

import httpx
import pytz
@@ -17,52 +19,62 @@
)
from comics.aggregator.feedparser import FeedParser
from comics.aggregator.lxmlparser import LxmlParser
from comics.core.models import Comic

# For testability
now = timezone.now
today = datetime.date.today


RequestHeaders = Dict[str, str]


@dataclass
class CrawlerRelease:
def __init__(self, comic, pub_date, has_rerun_releases=False):
self.comic = comic
self.pub_date = pub_date
self.has_rerun_releases = has_rerun_releases
self._images = []
comic: Comic
pub_date: datetime.date
has_rerun_releases: bool = False
_images: List[CrawlerImage] = field(default_factory=list)

@property
def identifier(self):
def identifier(self) -> str:
return f"{self.comic.slug}/{self.pub_date}"

@property
def images(self):
def images(self) -> List[CrawlerImage]:
return self._images

def add_image(self, image):
def add_image(self, image: CrawlerImage) -> None:
image.validate(self.identifier)
self._images.append(image)


@dataclass
class CrawlerImage:
def __init__(self, url, title=None, text=None, headers=None):
self.url = url
self.title = title
self.text = text
self.request_headers = headers or {}
url: str
title: Optional[str] = None
text: Optional[str] = None
request_headers: RequestHeaders = field(default_factory=dict)

def __post_init__(self) -> None:
# Convert from e.g. lxml.etree._ElementUnicodeResult to unicode
if self.title is not None and not isinstance(self.title, str):
self.title = str(self.title)
if self.text is not None and not isinstance(self.text, str):
self.text = str(self.text)

def validate(self, identifier):
def validate(self, identifier: str) -> None:
if not self.url:
raise ImageURLNotFound(identifier)


CrawlerResult = Union[None, CrawlerImage, List[CrawlerImage]]


@dataclass
class CrawlerBase:
comic: Comic

# ### Crawler settings
# Date of oldest release available for crawling
history_capable_date: Optional[str] = None
@@ -82,18 +94,17 @@ class CrawlerBase:

# ### Settings used for both crawling and downloading
# Dictionary of HTTP headers to send when retrieving items from the site
headers: Dict[str, str] = field(default_factory=dict)
headers: RequestHeaders = field(default_factory=dict)

# Feed object which is reused when crawling multiple dates
feed: Optional[FeedParser] = None

# Page objects mapped against URL for use when crawling multiple dates
pages: Dict[str, LxmlParser] = field(default_factory=dict)

def __init__(self, comic):
self.comic = comic

def get_crawler_release(self, pub_date=None):
def get_crawler_release(
self, pub_date: Optional[datetime.date] = None
) -> Optional[CrawlerRelease]:
"""Get meta data for release at pub_date, or the latest release"""

pub_date = self._get_date_to_crawl(pub_date)
@@ -109,9 +120,9 @@ def get_crawler_release(self, pub_date=None):
raise CrawlerHTTPError(release.identifier, str(error))

if not results:
return
return None

if not hasattr(results, "__iter__"):
if isinstance(results, CrawlerImage):
results = [results]

for result in results:
@@ -121,7 +132,7 @@ def get_crawler_release(self, pub_date=None):

return release

def _get_date_to_crawl(self, pub_date):
def _get_date_to_crawl(self, pub_date: Optional[datetime.date]) -> datetime.date:
identifier = f"{self.comic.slug}/{pub_date}"

if pub_date is None:
@@ -137,13 +148,13 @@ def _get_date_to_crawl(self, pub_date):
return pub_date

@property
def current_date(self):
def current_date(self) -> datetime.date:
tz = pytz.timezone(self.time_zone)
now_in_tz = tz.normalize(now().astimezone(tz))
return now_in_tz.date()

@property
def history_capable(self):
def history_capable(self) -> datetime.date:
if self.history_capable_date is not None:
return datetime.datetime.strptime(
self.history_capable_date, "%Y-%m-%d"
@@ -153,7 +164,7 @@ def history_capable(self):
else:
return today()

def crawl(self, pub_date):
def crawl(self, pub_date: datetime.date) -> CrawlerResult:
"""
Must be overridden by all crawlers
@@ -171,20 +182,20 @@ def crawl(self, pub_date):

# ### Helpers for the crawl() implementations

def parse_feed(self, feed_url):
def parse_feed(self, feed_url: str) -> FeedParser:
if self.feed is None:
self.feed = FeedParser(feed_url)
return self.feed

def parse_page(self, page_url):
def parse_page(self, page_url: str) -> LxmlParser:
if page_url not in self.pages:
self.pages[page_url] = LxmlParser(page_url, headers=self.headers)
return self.pages[page_url]

def string_to_date(self, *args, **kwargs):
return datetime.datetime.strptime(*args, **kwargs).date()
def string_to_date(self, string: str, format: str) -> datetime.date:
return datetime.datetime.strptime(string, format).date()

def date_to_epoch(self, date):
def date_to_epoch(self, date: datetime.date) -> int:
"""The UNIX time of midnight at ``date`` in the comic's time zone"""
naive_midnight = datetime.datetime(date.year, date.month, date.day)
local_midnight = pytz.timezone(self.time_zone).localize(naive_midnight)
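date_to_epoch() is the one helper in this hunk with nontrivial logic: build a naive midnight, pin it to the comic's time zone via pytz's localize() (pytz zones must be applied with localize() rather than passed as tzinfo, so DST and historical offsets resolve correctly), then convert to a UNIX timestamp. The diff view elides the final return; here is a standalone sketch of the same computation, with timestamp() assumed as one way to finish the conversion:

    import datetime

    import pytz


    def date_to_epoch(date: datetime.date, time_zone: str) -> int:
        """UNIX time of midnight at ``date`` in the given time zone."""
        # A naive midnight, then localized into the comic's zone
        naive_midnight = datetime.datetime(date.year, date.month, date.day)
        local_midnight = pytz.timezone(time_zone).localize(naive_midnight)
        return int(local_midnight.timestamp())


    assert date_to_epoch(datetime.date(1970, 1, 1), "UTC") == 0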
@@ -194,7 +205,7 @@ def date_to_epoch(self, date):
class ComicsKingdomCrawlerBase(CrawlerBase):
"""Base comic crawler for Comics Kingdom comics"""

def crawl_helper(self, short_name, pub_date):
def crawl_helper(self, short_name: str, pub_date: datetime.date) -> CrawlerResult:
date = pub_date.strftime("%Y-%m-%d")
page_url = f"https://www.comicskingdom.com/{short_name}/{date}"
page = self.parse_page(page_url)
@@ -215,7 +226,7 @@ class GoComicsComCrawlerBase(CrawlerBase):
),
}

def crawl_helper(self, url_name, pub_date):
def crawl_helper(self, url_name: str, pub_date: datetime.date) -> CrawlerResult:
page_url = "http://www.gocomics.com/{}/{}".format(
url_name,
pub_date.strftime("%Y/%m/%d/"),
@@ -227,7 +238,7 @@ def crawl_helper(self, url_name, pub_date):
# we get redirected to today's comic
date = page.content('meta[property="article:published_time"]')
if date != pub_date.strftime("%Y-%m-%d"):
return
return None

return CrawlerImage(url)

@@ -237,7 +248,7 @@ class PondusNoCrawlerBase(CrawlerBase):

time_zone = "Europe/Oslo"

def crawl_helper(self, url_id):
def crawl_helper(self, url_id: str, pub_date: datetime.date) -> CrawlerResult:
page_url = "http://www.pondus.no/?section=artikkel&id=%s" % url_id
page = self.parse_page(page_url)
url = page.src(".imagegallery img")
@@ -250,15 +261,15 @@ class DagbladetCrawlerBase(CrawlerBase):
headers = {"User-Agent": "Mozilla/5.0"}
time_zone = "Europe/Oslo"

def crawl_helper(self, short_name, pub_date):
def crawl_helper(self, short_name: str, pub_date: datetime.date) -> CrawlerResult:
page_url = "http://www.dagbladet.no/tegneserie/%s" % short_name
page = self.parse_page(page_url)

date_string = pub_date.strftime("%Y-%m-%dT00:00:00")
time = page.root.xpath('//time[contains(@datetime,"%s")]' % date_string)

if not time:
return
return None

article = time[0].getparent().getparent()
url = article.find(".//img").get("src")
@@ -272,20 +283,23 @@ class CreatorsCrawlerBase(CrawlerBase):

headers = {"User-Agent": "Mozilla/5.0"}

def crawl_helper(self, feature_id, pub_date):
def crawl_helper(self, feature_id: str, pub_date: datetime.date) -> CrawlerResult:
url = (
"https://www.creators.com/api/features/get_release_dates?"
"feature_id=%s&year=%s"
) % (feature_id, pub_date.year)

response = httpx.get(url, headers=self.headers)
releases = response.json()

for release in releases:
if release["release"] == pub_date.strftime("%Y-%m-%d"):
page = self.parse_page(release["url"])
url = page.src('img[itemprop="image"]')
return CrawlerImage(url)

return None


class NettserierCrawlerBase(CrawlerBase):
"""Base comics crawler for all comics posted at nettserier.no"""
@@ -296,27 +310,27 @@ class NettserierCrawlerBase(CrawlerBase):
time_zone = "Europe/Oslo"
page_cache: Dict[str, Tuple[LxmlParser, datetime.date]] = {}

def get_page(self, url) -> Tuple[LxmlParser, datetime.date]:
def get_page(self, url: str) -> Tuple[LxmlParser, datetime.date]:
if url not in self.page_cache:
page = self.parse_page(url)
page_date = page.text('p[class="comic-pubtime"]')
date = self.string_to_date(page_date, "Published %Y-%m-%d %H:%M:%S")
self.page_cache[url] = (page, date)
return self.page_cache[url]

def crawl_helper(self, short_name, pub_date):
def crawl_helper(self, short_name: str, pub_date: datetime.date) -> CrawlerResult:
url = "https://nettserier.no/%s/" % short_name
page, comic_date = self.get_page(url)

while pub_date < comic_date:
# Wanted date is earlier than the current, get previous page
previous_link = page.root.xpath('//li[@class="prev"]/a/@href')
if not previous_link:
return # No previous comic
return None # No previous comic
page, comic_date = self.get_page(previous_link[0])

if pub_date != comic_date:
return # Correct date not found
return None # Correct date not found

# comic-text div which contains title and text for the comic
title = page.text("div.comic-text h4")
@@ -335,7 +349,7 @@ def crawl_helper(self, short_name, pub_date):
class ComicControlCrawlerBase(CrawlerBase):
"""Base comics crawler for all comics using ComicControl CMS"""

def crawl_helper(self, site_url, pub_date):
def crawl_helper(self, site_url: str, pub_date: datetime.date) -> CrawlerResult:
if site_url[-1] == "/":
site_url = site_url[0:-1]
if "pixietrixcomix.com" in site_url:
@@ -350,3 +364,5 @@ def crawl_helper(self, site_url, pub_date):
title = re.sub(r".+? - (.+)", r"\1", entry.title)

return CrawlerImage(url, title, text)

return None
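The new CrawlerResult alias is what pays for the hasattr-to-isinstance rewrite in get_crawler_release(): hasattr(results, "__iter__") tells mypy nothing, while isinstance(results, CrawlerImage) narrows the union. A self-contained sketch of that narrowing follows; the normalize() helper is hypothetical, not part of the codebase:

    from typing import List, Optional, Union


    class CrawlerImage:
        def __init__(self, url: str) -> None:
            self.url = url


    # As in the diff: a crawl may yield nothing, one image, or a list of them
    CrawlerResult = Union[None, CrawlerImage, List[CrawlerImage]]


    def normalize(results: CrawlerResult) -> Optional[List[CrawlerImage]]:
        if not results:  # covers both None and an empty list
            return None
        if isinstance(results, CrawlerImage):
            # mypy narrows the union here; hasattr(...) could not do this
            results = [results]
        return results


    assert normalize(None) is None
    images = normalize(CrawlerImage("https://example.com/strip.png"))
    assert images is not None and len(images) == 1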
1 change: 1 addition & 0 deletions comics/aggregator/tests/test_crawler.py
@@ -10,6 +10,7 @@ class CurrentDateWhenLocalTZIsUTCTest(TestCase):
time_zone_local = "UTC"
time_zone_ahead = "Australia/Sydney"
time_zone_behind = "America/New_York"
now: datetime.datetime

def setUp(self):
self.tz = pytz.timezone(self.time_zone_local)
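The single test change follows the standard mypy pattern for attributes first assigned in setUp(): a bare class-level annotation declares the attribute's type without creating a class-level value. A minimal sketch with a hypothetical test case:

    import datetime
    import unittest


    class ExampleTest(unittest.TestCase):
        # Bare annotation: visible to mypy, but no class attribute exists
        now: datetime.datetime

        def setUp(self) -> None:
            self.now = datetime.datetime(2021, 3, 28, 12, 0)

        def test_year(self) -> None:
            self.assertEqual(self.now.year, 2021)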
9 changes: 6 additions & 3 deletions comics/comics/__init__.py
@@ -1,7 +1,9 @@
import os
from types import ModuleType
from typing import List


def get_comic_module_names():
def get_comic_module_names() -> List[str]:
module_files = os.listdir(os.path.dirname(__file__))
module_names = []
for file in module_files:
@@ -10,14 +12,15 @@ def get_comic_module_names():
return sorted(module_names)


def get_comic_module(comic_slug):
def get_comic_module(comic_slug: str) -> ModuleType:
module_name = f"{__package__}.{comic_slug}"
return _import_by_name(module_name)


def _import_by_name(module_name):
def _import_by_name(module_name: str) -> ModuleType:
module = __import__(module_name)
components = module_name.split(".")
for component in components[1:]:
module = getattr(module, component)
assert isinstance(module, ModuleType)
return module
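_import_by_name() needs the getattr() walk because __import__("a.b.c") returns the top-level package a, not the leaf module, and the new assert isinstance(...) narrows the Any that getattr() produces back to ModuleType for mypy. A standalone sketch of the same walk; import_by_name here is a copy for illustration:

    from types import ModuleType


    def import_by_name(module_name: str) -> ModuleType:
        # __import__("os.path") returns the top-level package ("os"), so
        # walk the remaining dotted components to reach the leaf module
        module = __import__(module_name)
        for component in module_name.split(".")[1:]:
            module = getattr(module, component)
        assert isinstance(module, ModuleType)  # narrows Any for mypy
        return module


    path_module = import_by_name("os.path")
    assert hasattr(path_module, "join")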
(2 more changed files not shown)
