From cd0f7c3d3ed87a4ca463590ebf2cc563558228a2 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 14:45:44 -0400 Subject: [PATCH 01/20] Fix import of urllib.parse module Many files import urllib, but this is a package that contains the module `urllib.parse`. This works, when it does (I haven't checked every case) because `requests` is also imported, and it imports the module correctly, which makes it available to xword-dl under the urllib namespace. --- xword_dl/downloader/amuniversaldownloader.py | 4 +--- xword_dl/downloader/amuselabsdownloader.py | 3 +-- xword_dl/downloader/basedownloader.py | 2 +- xword_dl/downloader/crosswordclubdownloader.py | 4 ++-- xword_dl/downloader/dailypopdownloader.py | 2 +- xword_dl/downloader/derstandarddownloader.py | 6 +++--- xword_dl/downloader/globeandmaildownloader.py | 4 ++-- xword_dl/downloader/mckinseydownloader.py | 2 +- xword_dl/downloader/newyorktimesdownloader.py | 4 ++-- xword_dl/downloader/puzzlesocietydownloader.py | 4 ++-- xword_dl/downloader/simplydailydownloader.py | 14 +++++++------- xword_dl/xword_dl.py | 4 ++-- 12 files changed, 25 insertions(+), 28 deletions(-) diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index ba94402..4060dfd 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -2,7 +2,6 @@ import json import sys import time -import urllib import xml import puz @@ -56,8 +55,7 @@ def process_clues(self, clue_list): def parse_xword(self, xword_data): fetched = {} for field in ['Title', 'Author', 'Editor', 'Copryight']: - fetched[field] = urllib.parse.unquote( - xword_data.get(field, '')).strip() + fetched[field] = unquote(xword_data.get(field, '')).strip() puzzle = puz.Puzzle() puzzle.title = fetched.get('Title', '') diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index ceffc67..c1e2022 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -1,7 +1,7 @@ import base64 import datetime import json -import urllib +import urllib.parse import puz import requests @@ -263,4 +263,3 @@ def pick_filename(self, puzzle, **kwargs): if not self.date and self.id: self.guess_date_from_id(self.id) return super().pick_filename(puzzle, **kwargs) - diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 00bf9e8..5e6801f 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -1,4 +1,4 @@ -import urllib +import urllib.parse import requests diff --git a/xword_dl/downloader/crosswordclubdownloader.py b/xword_dl/downloader/crosswordclubdownloader.py index 09183ec..1e597f7 100644 --- a/xword_dl/downloader/crosswordclubdownloader.py +++ b/xword_dl/downloader/crosswordclubdownloader.py @@ -1,4 +1,4 @@ -import urllib +import urllib.parse import dateparser import requests @@ -40,7 +40,7 @@ def find_latest(self): index_soup = BeautifulSoup(index_res.text, "html.parser") latest_url = next(a for a in index_soup.select('.all-puzzle-list a[href^="https://crosswordclub.com/puzzles/"]'))['href'] - + return latest_url def find_solver(self, url): diff --git a/xword_dl/downloader/dailypopdownloader.py b/xword_dl/downloader/dailypopdownloader.py index 01457b8..5c85e98 100644 --- a/xword_dl/downloader/dailypopdownloader.py +++ b/xword_dl/downloader/dailypopdownloader.py @@ -1,6 +1,6 @@ import datetime import requests -import urllib +import urllib.parse from 
.compilerdownloader import CrosswordCompilerDownloader from ..util import XWordDLException diff --git a/xword_dl/downloader/derstandarddownloader.py b/xword_dl/downloader/derstandarddownloader.py index 78edc23..976b281 100644 --- a/xword_dl/downloader/derstandarddownloader.py +++ b/xword_dl/downloader/derstandarddownloader.py @@ -1,7 +1,7 @@ import datetime import json import re -import urllib +import urllib.parse import dateparser import requests @@ -53,10 +53,10 @@ def find_solver(self, url): try: # html embed content is encoded -> beautifulsoup parsing would not work query_id = list(re.findall(r'(http)(s)*(:\/\/cdn\-eu1\.amuselabs\.com\/pmm\/crossword)(\?id\=)([0-9a-z]{8})', str(res.text))) - + if len(query_id) == 0: raise XWordDLException('Cannot find puzzle at {} -> failed to retrieve amuselabs url from encoded html.'.format(url)) - + self.id = str(query_id[0][-1]) except KeyError: raise XWordDLException('Cannot find puzzle at {}.'.format(url)) diff --git a/xword_dl/downloader/globeandmaildownloader.py b/xword_dl/downloader/globeandmaildownloader.py index c7603f9..6b044a2 100644 --- a/xword_dl/downloader/globeandmaildownloader.py +++ b/xword_dl/downloader/globeandmaildownloader.py @@ -1,5 +1,5 @@ import datetime -import urllib +import urllib.parse from .compilerdownloader import CrosswordCompilerDownloader from ..util import XWordDLException @@ -11,7 +11,7 @@ class GlobeAndMailDownloader(CrosswordCompilerDownloader): def __init__(self, **kwargs): super().__init__(**kwargs) - + self.fetch_data = self.fetch_jsencoded_data self.date = None diff --git a/xword_dl/downloader/mckinseydownloader.py b/xword_dl/downloader/mckinseydownloader.py index 88c8b7b..7047616 100644 --- a/xword_dl/downloader/mckinseydownloader.py +++ b/xword_dl/downloader/mckinseydownloader.py @@ -1,6 +1,6 @@ import datetime import json -import urllib +import urllib.parse import dateparser import requests diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index dad5a66..75080d6 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -1,5 +1,5 @@ import datetime -import urllib +import urllib.parse import puz import requests @@ -145,7 +145,7 @@ def parse_xword(self, xword_data): rebus_board.append(0) else: try: - suitable_answer = unidecode(square.get('answer') or + suitable_answer = unidecode(square.get('answer') or square['moreAnswers']['valid'][0]) except (IndexError, KeyError): raise XWordDLException('Unable to parse puzzle JSON. 
Possibly something .puz incompatible') diff --git a/xword_dl/downloader/puzzlesocietydownloader.py b/xword_dl/downloader/puzzlesocietydownloader.py index 8e6b867..f0d5479 100644 --- a/xword_dl/downloader/puzzlesocietydownloader.py +++ b/xword_dl/downloader/puzzlesocietydownloader.py @@ -1,6 +1,6 @@ import datetime import json -import urllib +import urllib.parse import requests @@ -21,7 +21,7 @@ def __init__(self, **kwargs): @staticmethod def matches_url(url_components): return 'puzzlesociety.com' in url_components.netloc and 'modern-crossword' in url_components.path - + def find_by_date(self, dt): url_format = dt.strftime('%Y/%m/%d') guessed_url = urllib.parse.urljoin( diff --git a/xword_dl/downloader/simplydailydownloader.py b/xword_dl/downloader/simplydailydownloader.py index 06d469d..05bf1a2 100644 --- a/xword_dl/downloader/simplydailydownloader.py +++ b/xword_dl/downloader/simplydailydownloader.py @@ -1,5 +1,5 @@ from datetime import datetime -import urllib +import urllib.parse from .compilerdownloader import CrosswordCompilerDownloader @@ -10,10 +10,10 @@ class SimplyDailyDownloader(CrosswordCompilerDownloader): outlet_prefix = 'Simply Daily' url_subdir = 'daily-crossword' qs_prefix = 'dc1' - + def __init__(self, **kwargs): super().__init__(**kwargs) - + self.fetch_data = self.fetch_jsencoded_data self.date = None @@ -28,26 +28,26 @@ def matches_url(cls, url_components): def parse_date_from_url(self, url): query_str = urllib.parse.urlparse(url).query query_dict = urllib.parse.parse_qs(query_str) - + try: puzz = query_dict['puzz'][0] except KeyError: date = datetime.today() else: date = datetime.strptime(puzz, f'{self.qs_prefix}-%Y-%m-%d') - + return date def find_solver(self, url): date = self.parse_date_from_url(url) - + pd = f'{date.strftime("%Y-%m")}' js = f'{self.qs_prefix}-{date.strftime("%Y-%m-%d")}.js' return f'https://{self.website}/{self.url_subdir}/puzzles/{pd}/{js}' def find_by_date(self, dt): self.date = dt # self.date used by BaseDownloader.pick_filename() - + qs = f'puzz={self.qs_prefix}-{dt.strftime("%Y-%m-%d")}' return f'https://{self.website}/{self.url_subdir}/index.html?{qs}' diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index 063ad65..db33177 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -6,7 +6,7 @@ import sys import textwrap import time -import urllib +import urllib.parse import requests @@ -121,7 +121,7 @@ def parse_for_embedded_puzzle(url, **kwargs): def get_supported_outlets(command_only=True): all_classes = inspect.getmembers(sys.modules['xword_dl.downloader'], inspect.isclass) - dls = [d for d in all_classes if issubclass(d[1], + dls = [d for d in all_classes if issubclass(d[1], downloader.BaseDownloader)] if command_only: From 5f4cb51fdc1cbb6d33e8c0fa4aa8b82d8c104ec9 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 15:22:24 -0400 Subject: [PATCH 02/20] Fix reference to undefined setting Because of the way the settings dict is populated, if the preserve_html option is not provided, the key will not exist. 
`settings.get()` usually works because `None` is false-y, but has the potential to fail unexpectedly --- xword_dl/downloader/basedownloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 5e6801f..289700a 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -105,6 +105,7 @@ def download(self, url): puzzle = sanitize_for_puzfile(puzzle, preserve_html=self.settings.get( - 'preserve_html')) + 'preserve_html', + False)) return puzzle From a3e1f2782b47fc4b1da9627a8b68be637447b0a9 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 15:51:23 -0400 Subject: [PATCH 03/20] fix reference to undefined method --- xword_dl/downloader/compilerdownloader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index 1c44a85..748e56b 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -13,7 +13,8 @@ def __init__(self, **kwargs): def find_solver(self, url): return url - def fetch_jsencoded_data(self, url): + @staticmethod + def fetch_jsencoded_data(url): res = requests.get(url, headers={'User-Agent': 'xword-dl'}) xw_data = res.text[len('var CrosswordPuzzleData = "'):-len('";')] xw_data = xw_data.replace('\\','') @@ -22,7 +23,7 @@ def fetch_jsencoded_data(self, url): def fetch_data(self, url, js_encoded=False): if js_encoded: - return fetch_jsencoded_data(url) + return self.fetch_jsencoded_data(url) res = requests.get(url, headers={'User-Agent': 'xword-dl'}) xw_data = res.text From e7d60b405102f9b7ae99ac5379958af470b7a611 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 16:30:38 -0400 Subject: [PATCH 04/20] Avoid use of .get() when existence is immediately assumed Clean up some uses of get where the code either immediately assumes that the result is not None or else provides a replacement value, e.g. through a ternary expression. Keeping this clean makes error messages more debuggable when unexpected errors aren't caught. 
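To illustrate the point about debuggable error messages (a standalone sketch, not part of the diff below; the dict contents are invented but shaped like the xmltodict output this repo handles):

```python
# Hypothetical grid data; pretend "@height" is missing because the feed
# was malformed.
xw_grid = {"@width": "15"}

try:
    # With .get(), the problem surfaces one call later as a vague TypeError
    # about NoneType, far away from the key that was actually missing.
    height = int(xw_grid.get("@height"))
except TypeError as e:
    print("with .get():", e)

try:
    # With plain indexing, the traceback names the missing key directly.
    height = int(xw_grid["@height"])
except KeyError as e:
    print("with indexing:", e)
```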
--- xword_dl/downloader/amuniversaldownloader.py | 4 ++-- xword_dl/downloader/basedownloader.py | 3 +-- xword_dl/downloader/compilerdownloader.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index 4060dfd..1649bc9 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -160,8 +160,8 @@ def fetch_data(self, solver_url): def parse_xword(self, xword_data): try: - xw = xmltodict.parse(xword_data).get('crossword') - except xml.parsers.expat.ExpatError: + xw = xmltodict.parse(xword_data)['crossword'] + except (xml.parsers.expat.ExpatError, KeyError): raise XWordDLException('Puzzle data malformed, cannot parse.') puzzle = puz.Puzzle() diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 289700a..982b9b2 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -56,8 +56,7 @@ def pick_filename(self, puzzle, **kwargs): template += ' - %title' if tokens.get('title') else '' for token in tokens.keys(): - replacement = (kwargs.get(token) if token in kwargs - else tokens[token]) + replacement = kwargs.get(token, tokens[token]) replacement = remove_invalid_chars_from_filename(replacement) template = template.replace('%' + token, replacement) diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index 748e56b..2d5c956 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -30,9 +30,9 @@ def fetch_data(self, url, js_encoded=False): return xw_data - def parse_xword(self, xword_data, enumeration=True): - xw = xmltodict.parse(xword_data) - xw_root = xw.get('crossword-compiler') or xw.get('crossword-compiler-applet') + def parse_xword(self, xw_data, enumeration=True): + xw = xmltodict.parse(xw_data) + xw_root = xw.get('crossword-compiler') or xw['crossword-compiler-applet'] xw_puzzle = xw_root['rectangular-puzzle'] xw_metadata = xw_puzzle['metadata'] xw_grid = xw_puzzle['crossword']['grid'] @@ -43,18 +43,18 @@ def parse_xword(self, xword_data, enumeration=True): puzzle.author = xw_metadata.get('creator') or '' puzzle.copyright = xw_metadata.get('copyright') or '' - puzzle.width = int(xw_grid.get('@width')) - puzzle.height = int(xw_grid.get('@height')) + puzzle.width = int(xw_grid['@width']) + puzzle.height = int(xw_grid['@height']) solution = '' fill = '' markup = b'' - cells = {(int(cell.get('@x')), int(cell.get('@y'))): cell for cell in xw_grid.get('cell')} + cells = {(int(cell['@x']), int(cell['@y'])): cell for cell in xw_grid['cell']} for y in range(1, puzzle.height + 1): for x in range(1, puzzle.width + 1): - cell = cells.get((x, y)) + cell = cells[(x, y)] solution += cell.get('@solution', '.') fill += '.' 
if cell.get('@type') == 'block' else '-' markup += (b'\x80' if (cell.get('@background-shape') == 'circle') else b'\x00') @@ -66,9 +66,9 @@ def parse_xword(self, xword_data, enumeration=True): all_clues = xw_clues[0]['clue'] + xw_clues[1]['clue'] - clues = [c.get('#text') + (f' ({c.get("@format", "")})' + clues = [c.get('#text', '') + (f' ({c.get("@format", "")})' if c.get("@format") and enumeration else '') for c in - sorted(all_clues, key=lambda x: int(x.get('@number')))] + sorted(all_clues, key=lambda x: int(x['@number']))] puzzle.clues = clues From 40bc53159b0b6b226ae9faa96e14a4b41e959e01 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 16:08:22 -0400 Subject: [PATCH 05/20] Fix API consistency issues Many sub-classes modify their parents' methods and alter the allowed arguments. This is a problem because if a function call is written on the API of the parent class, it will result in an error if it uses keyword arguments unavailable in the child class, and may fail or yield unexpected results if it uses positional arguments in a different order. Even when this can be guaranteed not to be the case through analysis of the code, it's still a bad practice that can break newly added code unexpectedly. I've fixed all of these issues that I could find. --- xword_dl/downloader/amuniversaldownloader.py | 18 ++++++------- xword_dl/downloader/amuselabsdownloader.py | 20 +++++++------- xword_dl/downloader/basedownloader.py | 2 +- xword_dl/downloader/compilerdownloader.py | 20 +++++++------- xword_dl/downloader/dailybeastdownloader.py | 4 +-- xword_dl/downloader/dailypopdownloader.py | 11 ++++---- xword_dl/downloader/globeandmaildownloader.py | 3 ++- xword_dl/downloader/guardiandownloader.py | 18 ++++++------- xword_dl/downloader/newyorkerdownloader.py | 8 +++--- xword_dl/downloader/newyorktimesdownloader.py | 26 +++++++++---------- .../downloader/puzzlesocietydownloader.py | 8 +++--- xword_dl/downloader/simplydailydownloader.py | 4 ++- xword_dl/downloader/wsjdownloader.py | 8 +++--- 13 files changed, 76 insertions(+), 74 deletions(-) diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index 1649bc9..efaec6e 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -52,10 +52,10 @@ def process_clues(self, clue_list): return clue_list - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): fetched = {} for field in ['Title', 'Author', 'Editor', 'Copryight']: - fetched[field] = unquote(xword_data.get(field, '')).strip() + fetched[field] = unquote(xw_data.get(field, '')).strip() puzzle = puz.Puzzle() puzzle.title = fetched.get('Title', '') @@ -63,10 +63,10 @@ def parse_xword(self, xword_data): ' / Ed. 
', fetched.get('Editor', '')]) puzzle.copyright = fetched.get('Copyright', '') - puzzle.width = int(xword_data.get('Width')) - puzzle.height = int(xword_data.get('Height')) + puzzle.width = int(xw_data.get('Width')) + puzzle.height = int(xw_data.get('Height')) - solution = xword_data.get('AllAnswer').replace('-', '.') + solution = xw_data.get('AllAnswer').replace('-', '.') puzzle.solution = solution @@ -78,8 +78,8 @@ def parse_xword(self, xword_data): fill += '-' puzzle.fill = fill - across_clues = xword_data['AcrossClue'].splitlines() - down_clues = self.process_clues(xword_data['DownClue'].splitlines()) + across_clues = xw_data['AcrossClue'].splitlines() + down_clues = self.process_clues(xw_data['DownClue'].splitlines()) clues_list = across_clues + down_clues @@ -158,9 +158,9 @@ def fetch_data(self, solver_url): return xw_data - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): try: - xw = xmltodict.parse(xword_data)['crossword'] + xw = xmltodict.parse(xw_data)['crossword'] except (xml.parsers.expat.ExpatError, KeyError): raise XWordDLException('Puzzle data malformed, cannot parse.') diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index c1e2022..a9722c2 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -186,15 +186,15 @@ def amuse_b64(e, amuseKey=None): return xword_data - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): puzzle = puz.Puzzle() - puzzle.title = xword_data.get('title', '').strip() - puzzle.author = xword_data.get('author', '').strip() - puzzle.copyright = xword_data.get('copyright', '').strip() - puzzle.width = xword_data.get('w') - puzzle.height = xword_data.get('h') + puzzle.title = xw_data.get('title', '').strip() + puzzle.author = xw_data.get('author', '').strip() + puzzle.copyright = xw_data.get('copyright', '').strip() + puzzle.width = xw_data.get('w') + puzzle.height = xw_data.get('h') - markup_data = xword_data.get('cellInfos', '') + markup_data = xw_data.get('cellInfos', '') circled = [(square['x'], square['y']) for square in markup_data if square['isCircled']] @@ -206,8 +206,8 @@ def parse_xword(self, xword_data): rebus_index = 0 rebus_table = '' - box = xword_data['box'] - for row_num in range(xword_data.get('h')): + box = xw_data['box'] + for row_num in range(xw_data.get('h')): for col_num, column in enumerate(box): cell = column[row_num] if cell == '\x00': @@ -231,7 +231,7 @@ def parse_xword(self, xword_data): puzzle.solution = solution puzzle.fill = fill - placed_words = xword_data['placedWords'] + placed_words = xw_data['placedWords'] across_words = [word for word in placed_words if word['acrossNotDown']] down_words = [ word for word in placed_words if not word['acrossNotDown']] diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 982b9b2..3524291 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -87,7 +87,7 @@ def fetch_data(self, solver_url): """ raise NotImplementedError - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): """Given a blob of crossword data, parse and stuff into puz format. 
This method is implemented in subclasses based on the differences in diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index 2d5c956..3069b8b 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -14,21 +14,19 @@ def find_solver(self, url): return url @staticmethod - def fetch_jsencoded_data(url): - res = requests.get(url, headers={'User-Agent': 'xword-dl'}) - xw_data = res.text[len('var CrosswordPuzzleData = "'):-len('";')] - xw_data = xw_data.replace('\\','') + def _fetch_data(solver_url, js_encoded=False, headers=None): + headers = headers or {'User-Agent': 'xword-dl'} + res = requests.get(solver_url, headers=headers) - return xw_data - - def fetch_data(self, url, js_encoded=False): if js_encoded: - return self.fetch_jsencoded_data(url) + xw_data = res.text[len('var CrosswordPuzzleData = "'):-len('";')] + return xw_data.replace('\\','') - res = requests.get(url, headers={'User-Agent': 'xword-dl'}) - xw_data = res.text + return res.text - return xw_data + # subclasses of CCD may want to override this method with different defaults + def fetch_data(self, solver_url): + return self._fetch_data(solver_url) def parse_xword(self, xw_data, enumeration=True): xw = xmltodict.parse(xw_data) diff --git a/xword_dl/downloader/dailybeastdownloader.py b/xword_dl/downloader/dailybeastdownloader.py index 01b731e..b83b6f8 100644 --- a/xword_dl/downloader/dailybeastdownloader.py +++ b/xword_dl/downloader/dailybeastdownloader.py @@ -15,8 +15,8 @@ def __init__(self, **kwargs): self.picker_url = 'https://cdn3.amuselabs.com/tdb/date-picker?set=tdb' self.url_from_id = 'https://cdn3.amuselabs.com/tdb/crossword?id={puzzle_id}&set=tdb' - def parse_xword(self, xword_data): - puzzle = super().parse_xword(xword_data) + def parse_xword(self, xw_data): + puzzle = super().parse_xword(xw_data) # Daily Beast puzzle IDs don't include the date. 
# This pulls it out of the puzzle title (with periods removed because diff --git a/xword_dl/downloader/dailypopdownloader.py b/xword_dl/downloader/dailypopdownloader.py index 5c85e98..5f3194e 100644 --- a/xword_dl/downloader/dailypopdownloader.py +++ b/xword_dl/downloader/dailypopdownloader.py @@ -1,6 +1,7 @@ import datetime import requests import urllib.parse +from functools import partial from .compilerdownloader import CrosswordCompilerDownloader from ..util import XWordDLException @@ -19,6 +20,11 @@ def __init__(self, **kwargs): if 'x-api-key' not in self.settings['headers']: self.settings['headers']['x-api-key'] = self.get_api_key() + self.fetch_data = partial( + self._fetch_data, + headers=self.settings['headers'] + ) + def get_api_key(self): res = requests.get('http://dailypopcrosswordsweb.puzzlenation.com/crosswordSetup.js') @@ -41,8 +47,3 @@ def find_by_date(self, dt): def find_latest(self): dt = datetime.datetime.today() return self.find_by_date(dt) - - def fetch_data(self, url): - res = requests.get(url, headers=self.settings['headers']) - - return res.text diff --git a/xword_dl/downloader/globeandmaildownloader.py b/xword_dl/downloader/globeandmaildownloader.py index 6b044a2..3d4d274 100644 --- a/xword_dl/downloader/globeandmaildownloader.py +++ b/xword_dl/downloader/globeandmaildownloader.py @@ -1,5 +1,6 @@ import datetime import urllib.parse +from functools import partial from .compilerdownloader import CrosswordCompilerDownloader from ..util import XWordDLException @@ -12,7 +13,7 @@ class GlobeAndMailDownloader(CrosswordCompilerDownloader): def __init__(self, **kwargs): super().__init__(**kwargs) - self.fetch_data = self.fetch_jsencoded_data + self.fetch_data = partial(self._fetch_data, js_encoded=True) self.date = None if 'url' in kwargs and not self.date: diff --git a/xword_dl/downloader/guardiandownloader.py b/xword_dl/downloader/guardiandownloader.py index fe9498a..7be4a69 100644 --- a/xword_dl/downloader/guardiandownloader.py +++ b/xword_dl/downloader/guardiandownloader.py @@ -41,24 +41,24 @@ def fetch_data(self, solver_url): return xw_data - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): puzzle = puz.Puzzle() - puzzle.author = xword_data.get('creator', {}).get('name') or '' - puzzle.height = xword_data.get('dimensions').get('rows') - puzzle.width = xword_data.get('dimensions').get('cols') + puzzle.author = xw_data.get('creator', {}).get('name', '') + puzzle.height = xw_data.get('dimensions').get('rows') + puzzle.width = xw_data.get('dimensions').get('cols') - puzzle.title = xword_data.get('name') or '' + puzzle.title = xw_data.get('name') or '' - if not all(e.get('solution') for e in xword_data['entries']): + if not all(e.get('solution') for e in xw_data['entries']): puzzle.title += ' - no solution provided' self.date = datetime.datetime.fromtimestamp( - xword_data.get('date') // 1000) + xw_data['date'] // 1000) grid_dict = {} - for e in xword_data.get('entries'): + for e in xw_data.get('entries'): pos = (e.get('position').get('x'), e.get('position').get('y')) for index in range(e.get('length')): grid_dict[pos] = e.get('solution', 'X' * e.get('length'))[index] @@ -77,7 +77,7 @@ def parse_xword(self, xword_data): puzzle.solution = solution puzzle.fill = fill - clues = [e.get('clue') for e in sorted(xword_data.get('entries'), + clues = [e.get('clue') for e in sorted(xw_data.get('entries'), key=lambda x: (x.get('number'), x.get('direction')))] puzzle.clues = clues diff --git a/xword_dl/downloader/newyorkerdownloader.py 
b/xword_dl/downloader/newyorkerdownloader.py index 2aba097..96c00bd 100644 --- a/xword_dl/downloader/newyorkerdownloader.py +++ b/xword_dl/downloader/newyorkerdownloader.py @@ -1,6 +1,6 @@ import datetime import json -import urllib +import urllib.parse import dateparser import requests @@ -88,9 +88,9 @@ def find_solver(self, url): self.theme_title = desc[len(theme_supra):].rstrip('.') return self.find_puzzle_url_from_id(self.id) - - def parse_xword(self, xword_data): - puzzle = super().parse_xword(xword_data) + + def parse_xword(self, xw_data): + puzzle = super().parse_xword(xw_data) if '<' in puzzle.title: puzzle.title = puzzle.title.split('<')[0] diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index 75080d6..b14a9d1 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -109,23 +109,23 @@ def fetch_data(self, solver_url): xword_data = res.json() return xword_data - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): puzzle = puz.Puzzle() - puzzle.author = join_bylines(xword_data['constructors'], "and").strip() - puzzle.copyright = xword_data['copyright'] - puzzle.height = int(xword_data['body'][0]['dimensions']['height']) - puzzle.width = int(xword_data['body'][0]['dimensions']['width']) + puzzle.author = join_bylines(xw_data['constructors'], "and").strip() + puzzle.copyright = xw_data['copyright'] + puzzle.height = int(xw_data['body'][0]['dimensions']['height']) + puzzle.width = int(xw_data['body'][0]['dimensions']['width']) if not self.date: - self.date = datetime.datetime.strptime(xword_data['publicationDate'], + self.date = datetime.datetime.strptime(xw_data['publicationDate'], '%Y-%m-%d') - puzzle.title = xword_data.get('title') or self.date.strftime( + puzzle.title = xw_data.get('title') or self.date.strftime( '%A, %B %d, %Y') - if xword_data.get('notes'): - puzzle.notes = xword_data.get('notes')[0]['text'] + if xw_data.get('notes'): + puzzle.notes = xw_data.get('notes')[0]['text'] solution = '' fill = '' @@ -134,7 +134,7 @@ def parse_xword(self, xword_data): rebus_index = 0 rebus_table = '' - for idx, square in enumerate(xword_data['body'][0]['cells']): + for idx, square in enumerate(xw_data['body'][0]['cells']): if not square: solution += '.' fill += '.' @@ -172,7 +172,7 @@ def parse_xword(self, xword_data): puzzle._extensions_order.extend([b'GRBS', b'RTBL']) puzzle.rebus() - clue_list = xword_data['body'][0]['clues'] + clue_list = xw_data['body'][0]['clues'] clue_list.sort(key=lambda c: (int(c['label']), c['direction'])) puzzle.clues = [c['text'][0].get('plain') or '' for c in clue_list] @@ -198,9 +198,9 @@ def __init__(self, **kwargs): self.url_from_date = 'https://www.nytimes.com/svc/crosswords/v6/puzzle/variety/{}.json' - def parse_xword(self, xword_data): + def parse_xword(self, xw_data): try: - return super().parse_xword(xword_data) + return super().parse_xword(xw_data) except ValueError: raise XWordDLException('Encountered error while parsing data. 
Maybe the selected puzzle is not a crossword?') diff --git a/xword_dl/downloader/puzzlesocietydownloader.py b/xword_dl/downloader/puzzlesocietydownloader.py index f0d5479..a3e52bc 100644 --- a/xword_dl/downloader/puzzlesocietydownloader.py +++ b/xword_dl/downloader/puzzlesocietydownloader.py @@ -47,14 +47,14 @@ def find_solver(self, url): return url - def fetch_data(self, url): - res = requests.get(url) + def fetch_data(self, solver_url): + res = requests.get(solver_url) xw_data = res.content.decode('utf-8-sig') return xw_data - def parse_xword(self, xword_data): - puzzle = super().parse_xword(xword_data, enumeration=False) + def parse_xword(self, xw_data, enumeration=False): + puzzle = super().parse_xword(xw_data, enumeration=enumeration) if not puzzle.author or puzzle.author.casefold().startswith('edited'): puzzle.author = puzzle.title[3:] diff --git a/xword_dl/downloader/simplydailydownloader.py b/xword_dl/downloader/simplydailydownloader.py index 05bf1a2..83bb6bf 100644 --- a/xword_dl/downloader/simplydailydownloader.py +++ b/xword_dl/downloader/simplydailydownloader.py @@ -1,4 +1,5 @@ from datetime import datetime +from functools import partial import urllib.parse from .compilerdownloader import CrosswordCompilerDownloader @@ -14,12 +15,13 @@ class SimplyDailyDownloader(CrosswordCompilerDownloader): def __init__(self, **kwargs): super().__init__(**kwargs) - self.fetch_data = self.fetch_jsencoded_data self.date = None if 'url' in kwargs and not self.date: self.date = self.parse_date_from_url(kwargs.get('url')) + self.fetch_data = partial(self._fetch_data, js_encoded=True) + @classmethod def matches_url(cls, url_components): return (cls.website in url_components.netloc and diff --git a/xword_dl/downloader/wsjdownloader.py b/xword_dl/downloader/wsjdownloader.py index d1c49bb..0d62c9f 100644 --- a/xword_dl/downloader/wsjdownloader.py +++ b/xword_dl/downloader/wsjdownloader.py @@ -57,9 +57,9 @@ def fetch_data(self, solver_url): data_url = solver_url.rsplit('/', maxsplit=1)[0] + '/data.json' return self.session.get(data_url).json()['data'] - def parse_xword(self, xword_data): - xword_metadata = xword_data.get('copy', '') - xword_data = xword_data.get('grid', '') + def parse_xword(self, xw_data): + xword_metadata = xw_data.get('copy', '') + xw_data = xw_data.get('grid', '') date_string = xword_metadata.get('date-publish-analytics').split()[0] @@ -78,7 +78,7 @@ def parse_xword(self, xword_data): fill = '' markup = b'' - for row in xword_data: + for row in xw_data: for cell in row: if cell.get('Blank'): fill += '.' From dcee5712a93a2aef3e725d63b6e689c8d4181ef5 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 21:31:32 -0400 Subject: [PATCH 06/20] Add considerably more error checking This is mainly just catching and dealing with type mismatch errors reported by Pyright. This catches a lot of cases where HTML parsing can fall through, but not all of them, e.g. IndexError and KeyError are not type errors. (But failing to prove an optional type has a value is.) How you deal with these issues is ultimately a matter of some preference. In two or three places, I removed a perfectly adequate check that looked for an AttributeError, since exception catching doesn't satisfy the type checker. But this commit could be revised to leave these checks in or even do more of them. A follow up to this should probably add checks on the result of every `request.get` and `request.post`, as I only did that here where it was convenient. 
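The recurring pattern boiled down to a standalone sketch (the function, messages, and local exception class here are placeholders for illustration, not code taken from the patch):

```python
from bs4 import BeautifulSoup, Tag

class XWordDLException(Exception):
    """Stand-in for xword_dl.util.XWordDLException in this sketch."""

def find_iframe_src(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    iframe = soup.find("iframe")

    # soup.find() can return a Tag, a NavigableString, or None, so calling
    # .get() on the result unconditionally fails type checking even when an
    # AttributeError would be caught at runtime. Narrowing with isinstance
    # satisfies the checker and yields a more specific failure message.
    if not isinstance(iframe, Tag):
        raise XWordDLException("Cannot find puzzle iframe in page source.")

    src = iframe.get("src")
    if not isinstance(src, str):
        raise XWordDLException("Puzzle iframe has no usable src attribute.")

    return src
```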
--- xword_dl/downloader/amuselabsdownloader.py | 64 +++++++++++++++---- .../downloader/crosswordclubdownloader.py | 3 + xword_dl/downloader/derstandarddownloader.py | 4 ++ xword_dl/downloader/globeandmaildownloader.py | 3 + xword_dl/downloader/guardiandownloader.py | 24 +++++-- xword_dl/downloader/mckinseydownloader.py | 6 +- xword_dl/downloader/newyorkerdownloader.py | 49 ++++++++------ .../downloader/puzzlesocietydownloader.py | 13 +++- xword_dl/downloader/wsjdownloader.py | 13 ++-- 9 files changed, 129 insertions(+), 50 deletions(-) diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index a9722c2..2a57b51 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -1,4 +1,5 @@ import base64 +import binascii import datetime import json import urllib.parse @@ -8,7 +9,7 @@ import re -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from .basedownloader import BaseDownloader from ..util import * @@ -19,24 +20,45 @@ def __init__(self, **kwargs): self.id = None + # these values must be overridden by subclasses, if used + self.picker_url = None + self.url_from_id = None + @staticmethod def matches_url(url_components): return 'amuselabs.com' in url_components.netloc - def find_latest(self): + def find_latest(self) -> str: + if self.picker_url is None or self.url_from_id is None: + raise XWordDLException("This outlet does not support finding the latest crossword.") res = requests.get(self.picker_url) soup = BeautifulSoup(res.text, 'html.parser') + puzzles = soup.find('div', attrs={'class': 'puzzles'}) - puzzle_ids = puzzles.findAll('div', attrs={'class': 'tile'}) - if not puzzle_ids: - puzzle_ids = puzzles.findAll('li', attrs={'class': 'tile'}) - self.id = puzzle_ids[0].get('data-id', '') + if not isinstance(puzzles, Tag): + raise XWordDLException("Unable to find class 'puzzles' in picker HTML soruce.") + + puzzle_id = puzzles.find( + 'div', + attrs={'class': 'tile'} + ) or puzzles.find( + 'li', + attrs={'class': 'tile'} + ) + + if not isinstance(puzzle_id, Tag): + raise XWordDLException("Unable to find puzzle ID in picker HTML source.") + self.id = puzzle_id.get('data-id', '') self.get_and_add_picker_token(res.text) return self.find_puzzle_url_from_id(self.id) def get_and_add_picker_token(self, picker_source=None): + if self.picker_url is None: + raise XWordDLException( + "No picker URL was available. Please report this as a bug." + ) if not picker_source: res = requests.get(self.picker_url) picker_source = res.text @@ -48,9 +70,10 @@ def get_and_add_picker_token(self, picker_source=None): else: soup = BeautifulSoup(picker_source, 'html.parser') param_tag = soup.find('script', id='params') - param_obj = json.loads(param_tag.string) if param_tag else {} + param_obj = json.loads(param_tag.string or "") if isinstance(param_tag, Tag) else {} rawsps = param_obj.get('rawsps', None) + # FIXME: should this raise an exception when not defined? if rawsps: picker_params = json.loads(base64.b64decode(rawsps).decode("utf-8")) token = picker_params.get('pickerToken', None) @@ -58,6 +81,10 @@ def get_and_add_picker_token(self, picker_source=None): self.url_from_id += '&pickerToken=' + token def find_puzzle_url_from_id(self, puzzle_id): + if self.url_from_id is None: + raise XWordDLException( + "No URL for puzzle IDs was available. Please report this as a bug." 
+ ) return self.url_from_id.format(puzzle_id=puzzle_id) def guess_date_from_id(self, puzzle_id): @@ -76,6 +103,9 @@ def find_solver(self, url): def fetch_data(self, solver_url): res = requests.get(solver_url) + if not res.ok: + raise XWordDLException("Could not fetch solver.") + if 'window.rawc' in res.text or 'window.puzzleEnv.rawc' in res.text: rawc = next((line.strip().split("'")[1] for line in res.text.splitlines() if ('window.rawc' in line @@ -84,16 +114,22 @@ def fetch_data(self, solver_url): # As of 2023-12-01, it looks like the rawc value is sometimes # given as a parameter in an embedded json blob, which means # parsing the page - try: - soup = BeautifulSoup(res.text, 'html.parser') - rawc = json.loads(soup.find('script', id='params').string)['rawc'] - except AttributeError: - raise XWordDLException("Crossword puzzle not found.") + soup = BeautifulSoup(res.text, 'html.parser') + if not isinstance(soup, Tag): + raise XWordDLException("Crossword puzzle not found. Could not parse HTML.") + + script_tag = soup.find('script', id='params') + if not isinstance(script_tag, Tag): + raise XWordDLException("Crossword puzzle not found. Could not find script tag.") + + rawc = json.loads(script_tag.string or "").get('rawc') ## In some cases we need to pull the underlying JavaScript ## # Find the JavaScript URL amuseKey = None m1 = re.search(r'"([^"]+c-min.js[^"]+)"', res.text) + if m1 is None: + raise XWordDLException("Failed to find JS url for amuseKey.") js_url_fragment = m1.groups()[0] js_url = urllib.parse.urljoin(solver_url, js_url_fragment) @@ -148,7 +184,7 @@ def load_rawc(rawc, amuseKey=None): return json.loads(base64.b64decode(newRawc).decode("utf-8")) except: # case 3 is the most recent obfuscation - def amuse_b64(e, amuseKey=None): + def amuse_b64(e, amuseKey): e = list(e) H=amuseKey E=[] @@ -181,7 +217,7 @@ def amuse_b64(e, amuseKey=None): try: xword_data = load_rawc(rawc, amuseKey=amuseKey) - except (UnicodeDecodeError, base64.binascii.Error): + except (UnicodeDecodeError, binascii.Error): xword_data = load_rawc(rawc, amuseKey=amuseKey2) return xword_data diff --git a/xword_dl/downloader/crosswordclubdownloader.py b/xword_dl/downloader/crosswordclubdownloader.py index 1e597f7..834b887 100644 --- a/xword_dl/downloader/crosswordclubdownloader.py +++ b/xword_dl/downloader/crosswordclubdownloader.py @@ -41,6 +41,9 @@ def find_latest(self): latest_url = next(a for a in index_soup.select('.all-puzzle-list a[href^="https://crosswordclub.com/puzzles/"]'))['href'] + if isinstance(latest_url, list): + latest_url = latest_url[0] + return latest_url def find_solver(self, url): diff --git a/xword_dl/downloader/derstandarddownloader.py b/xword_dl/downloader/derstandarddownloader.py index 976b281..a4415da 100644 --- a/xword_dl/downloader/derstandarddownloader.py +++ b/xword_dl/downloader/derstandarddownloader.py @@ -35,6 +35,10 @@ def find_latest(self): index_soup = BeautifulSoup(index_res.text, "html.parser") latest_fragment = next(a for a in index_soup.select('.teaser-inner > a'))['href'] + + if not isinstance(latest_fragment, str): + raise XWordDLException("Could not load latest crossword. 
Fragment not found.") + latest_absolute = urllib.parse.urljoin('https://www.derstandard.at', latest_fragment) diff --git a/xword_dl/downloader/globeandmaildownloader.py b/xword_dl/downloader/globeandmaildownloader.py index 3d4d274..95e35bc 100644 --- a/xword_dl/downloader/globeandmaildownloader.py +++ b/xword_dl/downloader/globeandmaildownloader.py @@ -51,6 +51,9 @@ def find_by_date(self, dt): def find_solver(self, url): start_date = datetime.datetime(2011, 1, 2) + if not isinstance(self.date, datetime.datetime): + raise XWordDLException("No solver date provided. This is a bug.") + date_diff = self.date - start_date weeks_diff, extra_days = divmod(date_diff.days, 7) diff --git a/xword_dl/downloader/guardiandownloader.py b/xword_dl/downloader/guardiandownloader.py index 7be4a69..6be0cb7 100644 --- a/xword_dl/downloader/guardiandownloader.py +++ b/xword_dl/downloader/guardiandownloader.py @@ -5,7 +5,7 @@ import puz import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from .basedownloader import BaseDownloader from ..util import XWordDLException @@ -23,11 +23,16 @@ def find_latest(self): res = requests.get(self.landing_page) soup = BeautifulSoup(res.text, 'html.parser') - url = 'https://www.theguardian.com' xword_link_re = re.compile(r'/crosswords/\w+/\d+') - url += soup.find('a', href=xword_link_re).get('href') + link_tag = soup.find('a', href=xword_link_re) + if not isinstance(link_tag, Tag): + raise XWordDLException("Could not find latest crossword.") - return url + link = link_tag['href'] + if isinstance(link, list): + link = link[0] + + return 'https://www.theguardian.com' + link def find_solver(self, url): return url @@ -36,8 +41,15 @@ def fetch_data(self, solver_url): res = requests.get(solver_url) soup = BeautifulSoup(res.text, 'html.parser') - xw_data = json.loads(soup.find('div', - attrs={'class':'js-crossword'}).get('data-crossword-data')) + xw_json = soup.find('div', attrs={'class': 'js-crossword'}) + if not isinstance(xw_json, Tag): + raise XWordDLException("Could not find crossword in solver data.") + + xw_json_str = xw_json.get('data-crossword-data') + if not isinstance(xw_json_str, str): + raise XWordDLException("Could not get JSON from solver data.") + + xw_data = json.loads(xw_json_str) return xw_data diff --git a/xword_dl/downloader/mckinseydownloader.py b/xword_dl/downloader/mckinseydownloader.py index 7047616..55b1228 100644 --- a/xword_dl/downloader/mckinseydownloader.py +++ b/xword_dl/downloader/mckinseydownloader.py @@ -43,7 +43,11 @@ def find_latest(self): index_res = requests.get(index_url) index_soup = BeautifulSoup(index_res.text, "html.parser") - latest_fragment = next(a for a in index_soup.select('a[href^="/featured-insights/the-mckinsey-crossword/"]') if a.find('div'))['href'] + latest_fragment = next(a for a in index_soup.select('a[href^="/featured-insights/the-mckinsey-crossword/"]') if a.find('div')).get('href') + + if not isinstance(latest_fragment, str): + raise XWordDLException("Could not get latest crossword. 
No crossword fragment.") + latest_absolute = urllib.parse.urljoin('https://www.mckinsey.com', latest_fragment) diff --git a/xword_dl/downloader/newyorkerdownloader.py b/xword_dl/downloader/newyorkerdownloader.py index 96c00bd..23b4f6c 100644 --- a/xword_dl/downloader/newyorkerdownloader.py +++ b/xword_dl/downloader/newyorkerdownloader.py @@ -5,7 +5,7 @@ import dateparser import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from .amuselabsdownloader import AmuseLabsDownloader from ..util import XWordDLException @@ -39,11 +39,17 @@ def find_by_date(self, dt): def find_latest(self, search_string='/crossword/'): url = "https://www.newyorker.com/puzzles-and-games-dept/crossword" res = self.session.get(url) + if not res.ok: + raise XWordDLException("Could not fetch latest crossword URL.") + soup = BeautifulSoup(res.text, "html.parser") - puzzle_list = json.loads(soup.find('script', - attrs={'type':'application/ld+json'}) - .get_text()).get('itemListElement',{}) + json_tag = soup.find('script', attrs={'type':'application/ld+json'}) + if not isinstance(json_tag, Tag): + raise XWordDLException("Could not find metadata tag for latest crossword.") + + json_str = json_tag.get_text() + puzzle_list = json.loads(json_str).get('itemListElement',{}) latest_url = next((item for item in puzzle_list if search_string in item.get('url', '')), {}).get('url') @@ -64,28 +70,29 @@ def find_solver(self, url): soup = BeautifulSoup(res.text, "html.parser") iframe_tag = soup.find('iframe', id='crossword') + if not isinstance(iframe_tag, Tag): + raise XWordDLException("Could not find crossword iframe.") - try: - iframe_url = iframe_tag['data-src'] - query = urllib.parse.urlparse(iframe_url).query - query_id = urllib.parse.parse_qs(query)['id'] - self.id = query_id[0] - - # Will hit this KeyError if there's no matching iframe - # or if there's no 'id' query string - except KeyError: - raise XWordDLException('Cannot find puzzle at {}.'.format(url)) + iframe_url = iframe_tag.get('data-src') + if not isinstance(iframe_url, str): + raise XWordDLException("Could not get URL src for iframe.") - pubdate = soup.find('time').get_text() - pubdate_dt = dateparser.parse(pubdate) + query = urllib.parse.urlparse(iframe_url).query + self.id = urllib.parse.parse_qs(query)['id'][0] - self.date = pubdate_dt + pubdate = soup.find('time') + if pubdate: + pubdate = pubdate.get_text() + pubdate_dt = dateparser.parse(pubdate) + self.date = pubdate_dt theme_supra = "Today’s theme: " - desc = soup.find('meta',attrs={'property': - 'og:description'}).get('content', '') - if desc.startswith(theme_supra): - self.theme_title = desc[len(theme_supra):].rstrip('.') + desc = soup.find('meta',attrs={'property': 'og:description'}) + if isinstance(desc, Tag): + desc = desc.get('content', '') + if isinstance(desc, str): + if desc.startswith(theme_supra): + self.theme_title = desc[len(theme_supra):].rstrip('.') return self.find_puzzle_url_from_id(self.id) diff --git a/xword_dl/downloader/puzzlesocietydownloader.py b/xword_dl/downloader/puzzlesocietydownloader.py index a3e52bc..d7d8a37 100644 --- a/xword_dl/downloader/puzzlesocietydownloader.py +++ b/xword_dl/downloader/puzzlesocietydownloader.py @@ -4,7 +4,9 @@ import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag + +from xword_dl.util.utils import XWordDLException from .compilerdownloader import CrosswordCompilerDownloader @@ -36,8 +38,13 @@ def find_solver(self, url): res = requests.get(url) soup = BeautifulSoup(res.text, 'lxml') - page_props = 
json.loads(soup.find('script', - {'type':'application/json'}).get_text()) + + json_tag = soup.find('script', {'type':'application/json'}) + if not isinstance(json_tag, Tag): + raise XWordDLException("Could not find JSON metadata for solver.") + + json_str = json_tag.get_text() + page_props = json.loads(json_str) sets = page_props['props']['pageProps']\ ['gameContent']['gameLevelDataSets'] diff --git a/xword_dl/downloader/wsjdownloader.py b/xword_dl/downloader/wsjdownloader.py index 0d62c9f..a96714a 100644 --- a/xword_dl/downloader/wsjdownloader.py +++ b/xword_dl/downloader/wsjdownloader.py @@ -2,7 +2,7 @@ import puz -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from .basedownloader import BaseDownloader from ..util import XWordDLException @@ -47,10 +47,13 @@ def find_solver(self, url): else: res = self.session.get(url) soup = BeautifulSoup(res.text, 'html.parser') - try: - puzzle_link = soup.find('iframe').get('src') - except AttributeError: - raise XWordDLException('Cannot find puzzle at {}.'.format(url)) + + iframe = soup.find('iframe') + if not isinstance(iframe, Tag): + raise XWordDLException('Cannot find puzzle at {}. No iframe tag.'.format(url)) + + puzzle_link = iframe['src'] + return self.find_solver(puzzle_link) def fetch_data(self, solver_url): From feb1b0c919203c0b8392318821f2984f87698a5a Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 21:40:48 -0400 Subject: [PATCH 07/20] fix erroneous equality check used in place of an assignment --- xword_dl/downloader/puzzmodownloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xword_dl/downloader/puzzmodownloader.py b/xword_dl/downloader/puzzmodownloader.py index 0d5a921..de30c18 100644 --- a/xword_dl/downloader/puzzmodownloader.py +++ b/xword_dl/downloader/puzzmodownloader.py @@ -16,7 +16,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.temporary_user_id = secrets.token_urlsafe(21) - self.session.headers.update({'Puzzmo-Gameplay-Id': + self.session.headers.update({'Puzzmo-Gameplay-Id': self.temporary_user_id}) def find_latest(self): @@ -53,7 +53,7 @@ def find_solver(self, url): return url def fetch_data(self, solver_url): - slug = solver_url.rsplit('/')[-1] + slug = solver_url.rsplit('/')[-1] query = """query PlayGameScreenQuery( $slug: ID! ) { @@ -126,7 +126,7 @@ def parse_xword(self, xw_data): continue elif not named_sections and blank_count >= 2: - section == default_sections.pop(0) + section = default_sections.pop(0) blank_count = 0 if section == 'metadata': @@ -161,7 +161,7 @@ def parse_xword(self, xw_data): elif section == 'clues': if clue_parts := re.match(r'([AD])(\d{1,2})\.(.*)', line): - clue_list.append((clue_parts[1], + clue_list.append((clue_parts[1], int(clue_parts[2]), clue_parts[3])) else: From 8d913e562222b907927c04848b67f44550176dec Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 21:41:58 -0400 Subject: [PATCH 08/20] Remove some incorrect (and probably dead) code These lines are an implementation of `guess_date_from_id`, which is supposed to turn a puzzle ID into a `datetime`, but they can't possibly have worked because you can't call `datetime.strftime` that way, and in any case the IDs now appear to be random hashes without date information. 
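For reference, the strftime/strptime mix-up the removed code ran into (an illustrative snippet, not part of the patch; the sample ID format is made up):

```python
import datetime

# Roughly what the removed line attempted: treat a bare string as if it
# were a datetime and "format" it.
#     datetime.datetime.strftime("puzzle_2024-08-06")   # TypeError

# strftime goes datetime -> string and needs a format:
today = datetime.datetime(2024, 8, 6)
print(today.strftime("%Y-%m-%d"))                        # "2024-08-06"

# Parsing a date out of a string is strptime's job:
puzzle_id = "puzzle_2024-08-06"                          # hypothetical ID
print(datetime.datetime.strptime(puzzle_id.split("_")[-1], "%Y-%m-%d"))
```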
--- xword_dl/downloader/newyorkerdownloader.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xword_dl/downloader/newyorkerdownloader.py b/xword_dl/downloader/newyorkerdownloader.py index 23b4f6c..884ce5d 100644 --- a/xword_dl/downloader/newyorkerdownloader.py +++ b/xword_dl/downloader/newyorkerdownloader.py @@ -26,9 +26,6 @@ def __init__(self, **kwargs): def matches_url(url_components): return ('newyorker.com' in url_components.netloc and '/puzzles-and-games-dept/crossword' in url_components.path) - def guess_date_from_id(self, puzzle_id): - self.date = datetime.datetime.strftime(puzzle_id.split('_')[-1]) - def find_by_date(self, dt): url_format = dt.strftime('%Y/%m/%d') guessed_url = urllib.parse.urljoin( From 3a09b2cb1c5b74ae6ed6cdafe907c579ff2c6546 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Tue, 6 Aug 2024 21:55:12 -0400 Subject: [PATCH 09/20] handle error a bit more robustly --- xword_dl/downloader/mckinseydownloader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xword_dl/downloader/mckinseydownloader.py b/xword_dl/downloader/mckinseydownloader.py index 55b1228..945a97d 100644 --- a/xword_dl/downloader/mckinseydownloader.py +++ b/xword_dl/downloader/mckinseydownloader.py @@ -43,7 +43,14 @@ def find_latest(self): index_res = requests.get(index_url) index_soup = BeautifulSoup(index_res.text, "html.parser") - latest_fragment = next(a for a in index_soup.select('a[href^="/featured-insights/the-mckinsey-crossword/"]') if a.find('div')).get('href') + xword_selector = 'a[href^="/featured-insights/the-mckinsey-crossword/"]' + latest_fragment = next(( + a + for a in index_soup.select(xword_selector) + if a.find('div') + ), + {} + ).get('href') if not isinstance(latest_fragment, str): raise XWordDLException("Could not get latest crossword. No crossword fragment.") From 5a66d8975de2d989e77ce3810c28c898510039be Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Fri, 9 Aug 2024 20:27:22 -0400 Subject: [PATCH 10/20] remove unused imports --- xword_dl/downloader/__init__.py | 3 +-- xword_dl/downloader/amuselabsdownloader.py | 1 - xword_dl/downloader/derstandarddownloader.py | 3 --- xword_dl/downloader/mckinseydownloader.py | 2 -- xword_dl/downloader/newyorkerdownloader.py | 1 - xword_dl/downloader/puzzmodownloader.py | 2 +- xword_dl/xword_dl.py | 2 +- 7 files changed, 3 insertions(+), 11 deletions(-) diff --git a/xword_dl/downloader/__init__.py b/xword_dl/downloader/__init__.py index ec15663..50a21c3 100644 --- a/xword_dl/downloader/__init__.py +++ b/xword_dl/downloader/__init__.py @@ -2,7 +2,6 @@ import importlib import inspect import os -import sys # This code runs through the modules in its directory and imports the ones # that end in .py (but that aren't __init__.py). 
It then adds any classes @@ -18,6 +17,6 @@ else: names = [x for x in mdl.__dict__ if not x.startswith('_') and inspect.isclass(mdl.__dict__[x])] - + globals().update({k: getattr(mdl, k) for k in names}) globals().pop(mod) diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index 2a57b51..de2dd47 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -1,6 +1,5 @@ import base64 import binascii -import datetime import json import urllib.parse diff --git a/xword_dl/downloader/derstandarddownloader.py b/xword_dl/downloader/derstandarddownloader.py index a4415da..f53cb2a 100644 --- a/xword_dl/downloader/derstandarddownloader.py +++ b/xword_dl/downloader/derstandarddownloader.py @@ -1,9 +1,6 @@ -import datetime -import json import re import urllib.parse -import dateparser import requests from bs4 import BeautifulSoup diff --git a/xword_dl/downloader/mckinseydownloader.py b/xword_dl/downloader/mckinseydownloader.py index 945a97d..6e7116e 100644 --- a/xword_dl/downloader/mckinseydownloader.py +++ b/xword_dl/downloader/mckinseydownloader.py @@ -1,5 +1,3 @@ -import datetime -import json import urllib.parse import dateparser diff --git a/xword_dl/downloader/newyorkerdownloader.py b/xword_dl/downloader/newyorkerdownloader.py index 884ce5d..35eebe3 100644 --- a/xword_dl/downloader/newyorkerdownloader.py +++ b/xword_dl/downloader/newyorkerdownloader.py @@ -1,4 +1,3 @@ -import datetime import json import urllib.parse diff --git a/xword_dl/downloader/puzzmodownloader.py b/xword_dl/downloader/puzzmodownloader.py index de30c18..f77196d 100644 --- a/xword_dl/downloader/puzzmodownloader.py +++ b/xword_dl/downloader/puzzmodownloader.py @@ -5,7 +5,7 @@ import puz from .basedownloader import BaseDownloader -from ..util import join_bylines, XWordDLException +from ..util import join_bylines class PuzzmoDownloader(BaseDownloader): command = 'pzm' diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index db33177..0c25ab9 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -3,9 +3,9 @@ import argparse import inspect import json +import os import sys import textwrap -import time import urllib.parse import requests From eb1fa607192fc8069cbb05980a7e751ea22bae26 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Fri, 9 Aug 2024 20:55:49 -0400 Subject: [PATCH 11/20] avoid unnecessary 'import *' behavior --- xword_dl/downloader/amuselabsdownloader.py | 2 +- xword_dl/downloader/basedownloader.py | 6 +++++- xword_dl/util/utils.py | 16 +++++++++------- xword_dl/xword_dl.py | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index de2dd47..aea994c 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup, Tag from .basedownloader import BaseDownloader -from ..util import * +from ..util import XWordDLException, unidecode class AmuseLabsDownloader(BaseDownloader): def __init__(self, **kwargs): diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 3524291..231ab12 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -2,7 +2,11 @@ import requests -from ..util import * +from ..util import ( + read_config_values, + remove_invalid_chars_from_filename, + sanitize_for_puzfile, +) class BaseDownloader: outlet = None diff --git 
a/xword_dl/util/utils.py b/xword_dl/util/utils.py index fbf4e81..9393785 100644 --- a/xword_dl/util/utils.py +++ b/xword_dl/util/utils.py @@ -3,16 +3,18 @@ import dateparser import emoji +import unidecode as _unidecode import yaml +from unidecode import unidecode from html2text import html2text -# This imports the _module_ unidecode, which converts Unicode strings to -# plain ASCII. The puz format, however, can accept Latin1, which is a larger -# subset. So the second line tells the module to leave codepoints 128-256 -# untouched, then we import the _function_ unidecode. -import unidecode -unidecode.Cache[0] = [chr(c) if c > 127 else '' for c in range(256)] -from unidecode import unidecode + + +# The unidecode module converts Unicode strings to plain ASCII. The puz format, +# however, can accept latin-1, which is a larger subset. By adding cached +# replacement values for these characters to unidecode, we prevent it from +# changing them. +_unidecode.Cache[0] = [chr(c) if c > 127 else '' for c in range(256)] CONFIG_PATH = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config') CONFIG_PATH = os.path.join(CONFIG_PATH, 'xword-dl/xword-dl.yaml') diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index 0c25ab9..f736122 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -16,7 +16,7 @@ from . import downloader -from .util import * +from .util import XWordDLException, parse_date_or_exit, save_puzzle with open(os.path.join(os.path.dirname(__file__), 'version')) as f: __version__ = f.read().strip() From 23cfa9fb892bec461d9d2fe1fa45b100efa17d5a Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 00:38:05 -0400 Subject: [PATCH 12/20] remove unused variables and dead code --- xword_dl/downloader/amuselabsdownloader.py | 3 --- xword_dl/xword_dl.py | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index aea994c..e3f9cc6 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -267,9 +267,6 @@ def parse_xword(self, xw_data): puzzle.fill = fill placed_words = xw_data['placedWords'] - across_words = [word for word in placed_words if word['acrossNotDown']] - down_words = [ - word for word in placed_words if not word['acrossNotDown']] weirdass_puz_clue_sorting = sorted(placed_words, key=lambda word: (word['y'], word['x'], not word['acrossNotDown'])) diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index f736122..aaf4975 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -77,8 +77,6 @@ def by_url(url, **kwargs): def parse_for_embedded_puzzle(url, **kwargs): - netloc = urllib.parse.urlparse(url).netloc - res = requests.get(url, headers={'User-Agent':'xword-dl'}) soup = BeautifulSoup(res.text, 'lxml') @@ -224,8 +222,7 @@ def main(): password = args.password or getpass("Password: ") try: - dl = downloader.NewYorkTimesDownloader( - username=username, password=password) + downloader.NewYorkTimesDownloader(username=username, password=password) sys.exit('Authentication successful.') except Exception as e: sys.exit(' '.join(['Authentication failed:', str(e)])) From 2c387383346a3332a08161a8fbfc583aaa41f795 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 01:12:23 -0400 Subject: [PATCH 13/20] make all exception catching explicit and refactor --- xword_dl/downloader/amuniversaldownloader.py | 2 +- xword_dl/downloader/amuselabsdownloader.py | 122 ++++++++++--------- 2 files 
changed, 64 insertions(+), 60 deletions(-) diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index efaec6e..083ba61 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -128,7 +128,7 @@ def find_by_date(self, dt): try: res = requests.head(url) res.raise_for_status() - except: + except requests.HTTPError: raise XWordDLException('Unable to find puzzle for date provided.') return url diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index e3f9cc6..3cd0214 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -158,66 +158,11 @@ def fetch_data(self, solver_url): amuseKey2 = [x for x, _ in sorted(zip(key_digits, key_orders), key=lambda pair: pair[1])] + # try to decode potentially obsfucated rawc with the two detected keys + xword_data = load_rawc(rawc, amuseKey=amuseKey) or load_rawc(rawc, amuseKey=amuseKey2) - # helper function to decode rawc - # as occasionally it can be obfuscated - def load_rawc(rawc, amuseKey=None): - try: - # the original case is just base64'd JSON - return json.loads(base64.b64decode(rawc).decode("utf-8")) - except: - try: - # case 2 is the first obfuscation - E = rawc.split('.') - A = list(E[0]) - H = E[1][::-1] - F = [int(A,16)+2 for A in H] - B, G = 0, 0 - while B < len(A) - 1: - C = min(F[G % len(F)], len(A) - B) - for D in range(C//2): - A[B+D], A[B+C-D-1] = A[B+C-D-1], A[B+D] - B+=C - G+=1 - newRawc=''.join(A) - return json.loads(base64.b64decode(newRawc).decode("utf-8")) - except: - # case 3 is the most recent obfuscation - def amuse_b64(e, amuseKey): - e = list(e) - H=amuseKey - E=[] - F=0 - - while F Date: Sat, 10 Aug 2024 19:56:02 -0400 Subject: [PATCH 14/20] Refactor imports of downloader plugins This commit changes how importing the downloaders happens slightly. The point is to avoid making any assumptions in xword_dl.py about what plugins are available and what methods they have. This has a number of advantages, for example, that it's trivial for someone to add a downloader plugin that requires authentication without adding another special case to the main control flow. This also happens to make code checkers happy as a side effect, because they can't determine whether the plugin classes are in the downloader namespace or not without executing the downloader __init__ code. The work in this commit is prefatory to an additional refactor which would create a generic Downloader class that automatically finds plugins and abstracts the control flow away from the application logic in xword_dl.py. 
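Roughly, the new control flow in xword_dl.py asks each available plugin whether
it can handle a given embed URL instead of special-casing AmuseLabs and
Crossword Compiler. A simplified sketch of that dispatch (the helper name
_pick_embed_handler is illustrative only; the actual logic lives inline in
parse_for_embedded_puzzle, as the diff below shows):

    # Sketch: filter plugins by capability, then let each one claim the
    # embedded-puzzle URL. `sources` is the list of candidate iframe/script
    # URLs collected by parse_for_embedded_puzzle; get_supported_outlets()
    # still returns (name, class) tuples at this point in the series.
    def _pick_embed_handler(sources):
        handlers = [d[1] for d in get_supported_outlets(command_only=False)
                    if hasattr(d[1], 'matches_embed_url')]
        for src in sources:
            for handler in handlers:
                puzzle_url = handler.matches_embed_url(src)
                if puzzle_url is not None:
                    return handler(), puzzle_url
        return None, None
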
--- xword_dl/downloader/amuselabsdownloader.py | 7 ++ xword_dl/downloader/compilerdownloader.py | 15 ++++ xword_dl/downloader/newyorktimesdownloader.py | 4 + xword_dl/xword_dl.py | 76 ++++++++----------- 4 files changed, 57 insertions(+), 45 deletions(-) diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index 3cd0214..dab1b2e 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -27,6 +27,13 @@ def __init__(self, **kwargs): def matches_url(url_components): return 'amuselabs.com' in url_components.netloc + @staticmethod + def matches_embed_url(src): + url = urllib.parse.urlparse(src) + if 'amuselabs.com' in url.netloc: + return src + return None + def find_latest(self) -> str: if self.picker_url is None or self.url_from_id is None: raise XWordDLException("This outlet does not support finding the latest crossword.") diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index 3069b8b..f7622e0 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -1,6 +1,8 @@ import puz import requests +import urllib.parse import xmltodict +from bs4 import BeautifulSoup from .basedownloader import BaseDownloader @@ -24,6 +26,19 @@ def _fetch_data(solver_url, js_encoded=False, headers=None): return res.text + @staticmethod + def matches_embed_url(src): + res = requests.get(src) + if not res.ok: + return None + soup = BeautifulSoup(res.text, 'lxml') + + for script in [s for s in soup.find_all('script') if s.get('src')]: + js_url = urllib.parse.urljoin(src, script.get('src')) + res = requests.get(js_url, headers={'User-Agent':'xword-dl'}) + if res.text.startswith('var CrosswordPuzzleData'): + return js_url + # subclasses of CCD may want to override this method with different defaults def fetch_data(self, solver_url): return self._fetch_data(solver_url) diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index b14a9d1..ad7d8d8 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -3,6 +3,7 @@ import puz import requests +from getpass import getpass from .basedownloader import BaseDownloader from ..util import XWordDLException, join_bylines, update_config_file, unidecode @@ -46,6 +47,9 @@ def matches_url(url_components): def authenticate(self, username, password): """Given a NYT username and password, returns the NYT-S cookie value""" + username = username or input("New York Times username: ") + password = password or getpass("Password: ") + try: res = requests.post('https://myaccount.nytimes.com/svc/ios/v2/login', data={'login': username, 'password': password}, diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index aaf4975..03cc0cb 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -10,11 +10,10 @@ import requests -from getpass import getpass - from bs4 import BeautifulSoup -from . 
import downloader +# FIXME: abstract download handling in separate file +from .downloader.basedownloader import BaseDownloader from .util import XWordDLException, parse_date_or_exit, save_puzzle @@ -77,6 +76,12 @@ def by_url(url, **kwargs): def parse_for_embedded_puzzle(url, **kwargs): + supported_downloaders = [ + d[1] + for d in get_supported_outlets(command_only=False) + if hasattr(d[1], 'matches_embed_url') + ] + res = requests.get(url, headers={'User-Agent':'xword-dl'}) soup = BeautifulSoup(res.text, 'lxml') @@ -91,27 +96,10 @@ def parse_for_embedded_puzzle(url, **kwargs): sources.insert(0, url) for src in sources: - if 'amuselabs.com' in src: - dl = downloader.AmuseLabsDownloader(url=url, **kwargs) - puzzle_url = src - - return dl, puzzle_url - - if not soup: - res = requests.get(src) - soup = BeautifulSoup(res.text, 'lxml') - - for script in [s for s in soup.find_all('script') if s.get('src')]: - js_url = urllib.parse.urljoin(url, script.get('src')) - res = requests.get(js_url, headers={'User-Agent':'xword-dl'}) - if res.text.startswith('var CrosswordPuzzleData'): - dl = downloader.CrosswordCompilerDownloader(url=url, **kwargs) - puzzle_url = js_url - dl.fetch_data = dl.fetch_jsencoded_data - - return dl, puzzle_url - - soup = None + for dlr in supported_downloaders: + puzzle_url = dlr.matches_embed_url(src) + if puzzle_url is not None: + return (dlr(), puzzle_url) return None, None @@ -119,8 +107,7 @@ def parse_for_embedded_puzzle(url, **kwargs): def get_supported_outlets(command_only=True): all_classes = inspect.getmembers(sys.modules['xword_dl.downloader'], inspect.isclass) - dls = [d for d in all_classes if issubclass(d[1], - downloader.BaseDownloader)] + dls = [d for d in all_classes if issubclass(d[1], BaseDownloader)] if command_only: dls = [d for d in dls if hasattr(d[1], 'command')] @@ -173,25 +160,22 @@ def main(): parser.add_argument('-a', '--authenticate', help=textwrap.dedent("""\ - when used with the nyt puzzle keyword, - stores an authenticated New York Times cookie - without downloading a puzzle. If username - or password are not provided as flags, - xword-dl will prompt for those values - at runtime"""), + when used with a subscription-only puzzle source, + stores an authentication token without downloading + a puzzle. 
If username or password are not provided + as flags xword-dl will prompt for those values at + runtime"""), action='store_true', default=False) parser.add_argument('-u', '--username', help=textwrap.dedent("""\ - username for a site that requires credentials - (currently only the New York Times)"""), + username for a site that requires credentials"""), default=None) parser.add_argument('-p', '--password', help=textwrap.dedent("""\ - password for a site that requires credentials - (currently only the New York Times)"""), + password for a site that requires credentials"""), default=None) parser.add_argument('--preserve-html', @@ -217,15 +201,17 @@ def main(): args = parser.parse_args() - if args.authenticate and args.source == 'nyt': - username = args.username or input("New York Times username: ") - password = args.password or getpass("Password: ") + if args.authenticate and args.source: + keyword_dict = {d[1].command: d[1] for d in get_supported_outlets()} + selected_downloader = keyword_dict.get(args.source, None) + if selected_downloader is None: + raise XWordDLException('Keyword {} not recognized.'.format(args.source)) + + if not hasattr(selected_downloader, "authenticate"): + sys.exit('This outlet does not support authentication.') + + selected_downloader.authenticate(args.username, args.password) - try: - downloader.NewYorkTimesDownloader(username=username, password=password) - sys.exit('Authentication successful.') - except Exception as e: - sys.exit(' '.join(['Authentication failed:', str(e)])) elif args.authenticate: sys.exit('Authentication flag must use a puzzle outlet keyword.') @@ -257,7 +243,7 @@ def main(): else: puzzle, filename = by_keyword(args.source, **options) except XWordDLException as e: - sys.exit(e) + sys.exit(str(e)) # specialcase the output file '-' if args.output == '-': From 9498790d7ef41dee2535a27e483d32ab38f1d3ba Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 22:10:19 -0400 Subject: [PATCH 15/20] Make import of downloader modules much cleaner Gets rid of a lot of the complicated introspection required to pick up the available downloader classes, and also moves all of this logic out of xword_dl.py. This change could potentially be amended to add the plugins to the downloader namespace as before, but after this change along with the last several, this has no real benefit and leaves the namespace cluttered. --- xword_dl/downloader/__init__.py | 31 +++++++++++---------------- xword_dl/xword_dl.py | 38 +++++++++++++++------------------ 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/xword_dl/downloader/__init__.py b/xword_dl/downloader/__init__.py index 50a21c3..82547e1 100644 --- a/xword_dl/downloader/__init__.py +++ b/xword_dl/downloader/__init__.py @@ -1,22 +1,17 @@ -import glob import importlib -import inspect -import os +import pkgutil -# This code runs through the modules in its directory and imports the ones -# that end in .py (but that aren't __init__.py). It then adds any classes -# from those modules to its own namespace. +from .basedownloader import BaseDownloader as __bd -modules = glob.glob(os.path.join(os.path.dirname(__file__), "*.py")) -modules = [os.path.basename(f)[:-3] for f in modules if os.path.isfile(f) and not f.endswith('__init__.py')] +def __get_subclasses(cls, lst=[]): + """Recursively returns a list of subclasses of `cls` in imported namespaces.""" + for s_cls in cls.__subclasses__(): + lst.append(s_cls) + __get_subclasses(s_cls, lst) + return lst -for mod in modules: - mdl = importlib.import_module('.' 
+ mod, package=__name__) - if '__all__' in mdl.__dict__: - names = mdl.__dict__['__all__'] - else: - names = [x for x in mdl.__dict__ if not x.startswith('_') - and inspect.isclass(mdl.__dict__[x])] - - globals().update({k: getattr(mdl, k) for k in names}) - globals().pop(mod) +def get_plugins(): + """Returns all plugins available in the downloader package.""" + for _, mod, _ in pkgutil.walk_packages(__path__): + importlib.import_module(f".{mod}", package=__name__) + return __get_subclasses(__bd) diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index 03cc0cb..19a10d1 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import inspect import json import os import sys @@ -12,18 +11,19 @@ from bs4 import BeautifulSoup -# FIXME: abstract download handling in separate file -from .downloader.basedownloader import BaseDownloader - +from .downloader import get_plugins from .util import XWordDLException, parse_date_or_exit, save_puzzle with open(os.path.join(os.path.dirname(__file__), 'version')) as f: __version__ = f.read().strip() +plugins = get_plugins() + def by_keyword(keyword, **kwargs): - keyword_dict = {d[1].command: d[1] for d in get_supported_outlets()} - selected_downloader = keyword_dict.get(keyword, None) + selected_downloader = next( + (d for d in get_supported_outlets() if d.command == keyword), None + ) if selected_downloader: dl = selected_downloader(**kwargs) @@ -49,9 +49,9 @@ def by_keyword(keyword, **kwargs): def by_url(url, **kwargs): - supported_downloaders = [d[1] for d in + supported_downloaders = [d for d in get_supported_outlets(command_only=False) - if hasattr(d[1], 'matches_url')] + if hasattr(d, 'matches_url')] dl = None @@ -77,9 +77,9 @@ def by_url(url, **kwargs): def parse_for_embedded_puzzle(url, **kwargs): supported_downloaders = [ - d[1] + d for d in get_supported_outlets(command_only=False) - if hasattr(d[1], 'matches_embed_url') + if hasattr(d, 'matches_embed_url') ] res = requests.get(url, headers={'User-Agent':'xword-dl'}) @@ -105,20 +105,15 @@ def parse_for_embedded_puzzle(url, **kwargs): def get_supported_outlets(command_only=True): - all_classes = inspect.getmembers(sys.modules['xword_dl.downloader'], - inspect.isclass) - dls = [d for d in all_classes if issubclass(d[1], BaseDownloader)] - if command_only: - dls = [d for d in dls if hasattr(d[1], 'command')] - - return dls + return [d for d in plugins if hasattr(d, 'command')] + return plugins def get_help_text_formatted_list(): text = '' - for d in sorted(get_supported_outlets(), key=lambda x: x[1].outlet.lower()): - text += '{:<5} {}\n'.format(d[1].command, d[1].outlet) + for d in sorted(get_supported_outlets(), key=lambda x: x.outlet.lower()): + text += '{:<5} {}\n'.format(d.command, d.outlet) return text @@ -202,8 +197,9 @@ def main(): args = parser.parse_args() if args.authenticate and args.source: - keyword_dict = {d[1].command: d[1] for d in get_supported_outlets()} - selected_downloader = keyword_dict.get(args.source, None) + selected_downloader = next( + (d for d in get_supported_outlets() if d.command == args.source), None + ) if selected_downloader is None: raise XWordDLException('Keyword {} not recognized.'.format(args.source)) From 1eb0842e343d0efc5dc1cd8f30aad88d7b100c79 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 22:53:32 -0400 Subject: [PATCH 16/20] remove unnecessary import in __init__.py --- xword_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/xword_dl/__init__.py b/xword_dl/__init__.py index 22ecaab..8b13789 100644 --- a/xword_dl/__init__.py +++ b/xword_dl/__init__.py @@ -1 +1 @@ -from .xword_dl import * + From 2a7952fe258ef5b4e9b61f02529990ab193e1570 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 22:54:43 -0400 Subject: [PATCH 17/20] Simplify overcomplicated use of hasattr With this commit the entire repository is Pyright clean!! --- xword_dl/downloader/basedownloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 231ab12..797795c 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -43,8 +43,7 @@ def pick_filename(self, puzzle, **kwargs): 'prefix': self.outlet_prefix or '', 'title': puzzle.title or '', 'author': puzzle.author or '', - 'cmd': (self.command if hasattr(self, 'command') - else self.netloc or ''), + 'cmd': getattr(self, "command", self.netloc or ""), 'netloc': self.netloc or '', } From 630e0f1cd420d80d13b85ea5d79e5ca3bda061c5 Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sat, 10 Aug 2024 23:20:04 -0400 Subject: [PATCH 18/20] fix issue where refactor broke authentication --- xword_dl/downloader/newyorktimesdownloader.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index ad7d8d8..ffd4488 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -29,10 +29,8 @@ def __init__(self, **kwargs): password = self.settings.get('password') if username and password: - nyts_token = self.authenticate(username, password) - update_config_file('nyt', {'NYT-S': nyts_token}) - else: - nyts_token = self.settings.get('NYT_S') + self.authenticate(username, password) + nyts_token = self.settings.get('NYT_S') if not nyts_token: raise XWordDLException('No credentials provided or stored. Try running xword-dl nyt --authenticate') @@ -68,7 +66,7 @@ def authenticate(self, username, password): nyts_token = cookie['cipheredValue'] if nyts_token: - return nyts_token + update_config_file('nyt', {'NYT-S': nyts_token}) else: raise XWordDLException('NYT-S cookie not found.') From 49317635fcb21acd9e140632dba94e7787db137e Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sun, 11 Aug 2024 17:22:20 -0400 Subject: [PATCH 19/20] Additional typing improvements Simplifies __get_subclasses(), and adds type annotations to important methods in __init__.py, xword_dl.py, and utils.py. The result of this work was to reveal a number of issues with API inconsistency, which this commit fixes as well. 
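One concrete example of the simplification: the old __get_subclasses() collected
results in a mutable default argument (lst=[]), which Python shares across
calls, so calling get_plugins() more than once would keep appending to the same
list and duplicate entries. The rewritten helper (copied from the diff below;
__bd is the module-local alias for BaseDownloader) builds the list functionally
with no shared state:

    def __get_subclasses(cls: Type[__bd]):
        """Recursively returns a list of subclasses of `cls` in imported namespaces."""
        return [cls] + [
            r_cls
            for s_cls in cls.__subclasses__()
            for r_cls in __get_subclasses(s_cls)
        ]
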
--- xword_dl/downloader/__init__.py | 12 +-- xword_dl/downloader/amuniversaldownloader.py | 3 + xword_dl/downloader/amuselabsdownloader.py | 8 +- xword_dl/downloader/basedownloader.py | 77 ++++++++++++++----- xword_dl/downloader/compilerdownloader.py | 4 +- .../downloader/crosswordclubdownloader.py | 4 +- xword_dl/downloader/derstandarddownloader.py | 4 +- xword_dl/downloader/globeandmaildownloader.py | 4 +- xword_dl/downloader/guardiandownloader.py | 28 +++---- xword_dl/downloader/mckinseydownloader.py | 4 +- xword_dl/downloader/newyorkerdownloader.py | 4 +- xword_dl/downloader/newyorktimesdownloader.py | 11 +-- .../downloader/puzzlesocietydownloader.py | 4 +- xword_dl/downloader/wsjdownloader.py | 7 +- xword_dl/util/utils.py | 19 ++--- xword_dl/xword_dl.py | 11 ++- 16 files changed, 127 insertions(+), 77 deletions(-) diff --git a/xword_dl/downloader/__init__.py b/xword_dl/downloader/__init__.py index 82547e1..8d5451a 100644 --- a/xword_dl/downloader/__init__.py +++ b/xword_dl/downloader/__init__.py @@ -1,14 +1,16 @@ import importlib import pkgutil +from typing import Type from .basedownloader import BaseDownloader as __bd -def __get_subclasses(cls, lst=[]): +def __get_subclasses(cls: Type[__bd]): """Recursively returns a list of subclasses of `cls` in imported namespaces.""" - for s_cls in cls.__subclasses__(): - lst.append(s_cls) - __get_subclasses(s_cls, lst) - return lst + return [cls] + [ + r_cls + for s_cls in cls.__subclasses__() + for r_cls in __get_subclasses(s_cls) + ] def get_plugins(): """Returns all plugins available in the downloader package.""" diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index 083ba61..da61753 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -19,6 +19,9 @@ def __init__(self, **kwargs): self.url_blob = None def find_by_date(self, dt): + if self.url_blob is None: + raise XWordDLException("Blob URL was not set by AMUniversal subclass.") + self.date = dt url_format = dt.strftime('%Y-%m-%d') diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index dab1b2e..f0767de 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -23,12 +23,12 @@ def __init__(self, **kwargs): self.picker_url = None self.url_from_id = None - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return 'amuselabs.com' in url_components.netloc - @staticmethod - def matches_embed_url(src): + @classmethod + def matches_embed_url(cls, src): url = urllib.parse.urlparse(src) if 'amuselabs.com' in url.netloc: return src diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 797795c..9702a7f 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -1,6 +1,8 @@ import urllib.parse +from datetime import datetime import requests +from puz import Puzzle from ..util import ( read_config_values, @@ -9,7 +11,8 @@ ) class BaseDownloader: - outlet = None + command = "" + outlet = "" outlet_prefix = None def __init__(self, **kwargs): @@ -20,11 +23,12 @@ def __init__(self, **kwargs): self.settings.update(read_config_values('general')) - if hasattr(self, 'command') or 'inherit_settings' in kwargs: - self.settings.update(read_config_values( - kwargs.get('inherit_settings'))) - self.settings.update(read_config_values( - getattr(self, 'command', ''))) + if 
'inherit_settings' in kwargs: + self.settings.update(read_config_values(kwargs['inherit_settings'])) + + if self.command: + self.settings.update(read_config_values(self.command)) + elif 'url' in kwargs: self.settings.update(read_config_values('url')) self.settings.update(read_config_values(self.netloc)) @@ -38,7 +42,7 @@ def __init__(self, **kwargs): self.session.headers.update(self.settings.get('headers', {})) self.session.cookies.update(self.settings.get('cookies', {})) - def pick_filename(self, puzzle, **kwargs): + def pick_filename(self, puzzle: Puzzle, **kwargs) -> str: tokens = {'outlet': self.outlet or '', 'prefix': self.outlet_prefix or '', 'title': puzzle.title or '', @@ -73,7 +77,21 @@ def pick_filename(self, puzzle, **kwargs): return template - def find_solver(self, url): + def download(self, url: str) -> Puzzle: + """Download, parse, and return a puzzle at a given URL.""" + + solver_url = self.find_solver(url) + xword_data = self.fetch_data(solver_url) + puzzle = self.parse_xword(xword_data) + + puzzle = sanitize_for_puzfile( + puzzle, + preserve_html=self.settings.get("preserve_html", False) + ) + + return puzzle + + def find_solver(self, url: str) -> str: """Given a URL for a puzzle, returns the essential 'solver' URL. This is implemented in subclasses, and in instances where there is no @@ -82,7 +100,7 @@ def find_solver(self, url): """ raise NotImplementedError - def fetch_data(self, solver_url): + def fetch_data(self, solver_url: str): """Given a URL from the find_solver function, return JSON crossword data This is implemented in subclasses and the returned data will not be @@ -90,7 +108,7 @@ def fetch_data(self, solver_url): """ raise NotImplementedError - def parse_xword(self, xw_data): + def parse_xword(self, xw_data) -> Puzzle: """Given a blob of crossword data, parse and stuff into puz format. This method is implemented in subclasses based on the differences in @@ -98,16 +116,35 @@ def parse_xword(self, xw_data): """ raise NotImplementedError - def download(self, url): - """Download, parse, and return a puzzle at a given URL.""" + def find_latest(self) -> str: + """Get the latest available crossword for this outlet. - solver_url = self.find_solver(url) - xword_data = self.fetch_data(solver_url) - puzzle = self.parse_xword(xword_data) + This method is implemented in subclasses and should return a string + representing the URL to the latest puzzle.""" + raise NotImplementedError - puzzle = sanitize_for_puzfile(puzzle, - preserve_html=self.settings.get( - 'preserve_html', - False)) + def find_by_date(self, dt: datetime) -> str: + """Get the outlet's crossword for the specified date, if any. - return puzzle + This method is implemented in subclasses and should return a string + representing the URL to the puzzle.""" + raise NotImplementedError + + @classmethod + def matches_url(cls, url_components: urllib.parse.ParseResult) -> bool: + """Returns whether this plugin can download the provided URL.""" + raise NotImplementedError + + @classmethod + def matches_embed_url(cls, src: str) -> str | None: + """Returns a URL to a puzzle, given an embedded link this plugin can parse.""" + raise NotImplementedError + + @classmethod + def authenticate(cls, username: str | None, password: str | None) -> None: + """Authenticate the puzzle source with a username and password. + + This method is implemented in subclasses, and should save a login token to the + program's settings when it is possible to do so. 
Authenticate is a class method + and is usually not called on an instance of the class.""" + raise NotImplementedError diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index f7622e0..e39580d 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -26,8 +26,8 @@ def _fetch_data(solver_url, js_encoded=False, headers=None): return res.text - @staticmethod - def matches_embed_url(src): + @classmethod + def matches_embed_url(cls, src): res = requests.get(src) if not res.ok: return None diff --git a/xword_dl/downloader/crosswordclubdownloader.py b/xword_dl/downloader/crosswordclubdownloader.py index 834b887..47629c2 100644 --- a/xword_dl/downloader/crosswordclubdownloader.py +++ b/xword_dl/downloader/crosswordclubdownloader.py @@ -18,8 +18,8 @@ def __init__(self, **kwargs): self.url_from_id = 'https://cdn2.amuselabs.com/pmm/crossword?id={puzzle_id}&set=pardon-crossword' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('crosswordclub.com' in url_components.netloc and '/puzzles' in url_components.path) diff --git a/xword_dl/downloader/derstandarddownloader.py b/xword_dl/downloader/derstandarddownloader.py index f53cb2a..70d052c 100644 --- a/xword_dl/downloader/derstandarddownloader.py +++ b/xword_dl/downloader/derstandarddownloader.py @@ -22,8 +22,8 @@ def __init__(self, **kwargs): 'DNT':'1' } - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('derstandard.at' in url_components.netloc and '/kreuzwortraetsel' in url_components.path) def find_latest(self): diff --git a/xword_dl/downloader/globeandmaildownloader.py b/xword_dl/downloader/globeandmaildownloader.py index 95e35bc..648e866 100644 --- a/xword_dl/downloader/globeandmaildownloader.py +++ b/xword_dl/downloader/globeandmaildownloader.py @@ -21,8 +21,8 @@ def __init__(self, **kwargs): self.url_format = 'https://www.theglobeandmail.com/puzzles-and-crosswords/cryptic-crossword/?date={url_encoded_date}' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return 'theglobeandmail.com' in url_components.netloc def parse_date_from_url(self, url): diff --git a/xword_dl/downloader/guardiandownloader.py b/xword_dl/downloader/guardiandownloader.py index 6be0cb7..403ac64 100644 --- a/xword_dl/downloader/guardiandownloader.py +++ b/xword_dl/downloader/guardiandownloader.py @@ -107,8 +107,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/cryptic' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/cryptic' in url_components.path) @@ -123,8 +123,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/everyman' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/everyman' in url_components.path) @@ -139,8 +139,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/speedy' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/speedy' in url_components.path) @@ -154,8 +154,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/quick' - @staticmethod - def matches_url(url_components): + 
@classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/quick' in url_components.path) @@ -169,8 +169,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/prize' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/prize' in url_components.path) @@ -184,8 +184,8 @@ def __init__(self, **kwargs): self.landing_page += '/series/weekend-crossword' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/weekend' in url_components.path) @@ -199,7 +199,7 @@ def __init__(self, **kwargs): self.landing_page += '/series/quiptic' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('theguardian.com' in url_components.netloc and '/crosswords/quiptic' in url_components.path) diff --git a/xword_dl/downloader/mckinseydownloader.py b/xword_dl/downloader/mckinseydownloader.py index 6e7116e..dfc222f 100644 --- a/xword_dl/downloader/mckinseydownloader.py +++ b/xword_dl/downloader/mckinseydownloader.py @@ -18,8 +18,8 @@ def __init__(self, **kwargs): self.url_from_id = 'https://cdn2.amuselabs.com/pmm/crossword?id={puzzle_id}&set=mckinsey' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('mckinsey.com' in url_components.netloc and '/featured-insights/the-mckinsey-crossword' in url_components.path) def find_by_date(self, dt): diff --git a/xword_dl/downloader/newyorkerdownloader.py b/xword_dl/downloader/newyorkerdownloader.py index 35eebe3..3587c6a 100644 --- a/xword_dl/downloader/newyorkerdownloader.py +++ b/xword_dl/downloader/newyorkerdownloader.py @@ -21,8 +21,8 @@ def __init__(self, **kwargs): self.theme_title = '' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('newyorker.com' in url_components.netloc and '/puzzles-and-games-dept/crossword' in url_components.path) def find_by_date(self, dt): diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index ffd4488..385c21f 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -37,12 +37,13 @@ def __init__(self, **kwargs): else: self.cookies.update({'NYT-S': nyts_token}) - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('nytimes.com' in url_components.netloc and 'crosswords/game/daily' in url_components.path) - def authenticate(self, username, password): + @classmethod + def authenticate(cls, username, password): """Given a NYT username and password, returns the NYT-S cookie value""" username = username or input("New York Times username: ") @@ -217,8 +218,8 @@ def __init__(self, **kwargs): self.url_from_date = 'https://www.nytimes.com/svc/crosswords/v6/puzzle/mini/{}.json' - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return ('nytimes.com' in url_components.netloc and 'mini' in url_components.path) diff --git a/xword_dl/downloader/puzzlesocietydownloader.py b/xword_dl/downloader/puzzlesocietydownloader.py index d7d8a37..5e9ea5c 100644 --- a/xword_dl/downloader/puzzlesocietydownloader.py +++ 
b/xword_dl/downloader/puzzlesocietydownloader.py @@ -20,8 +20,8 @@ class TheModernDownloader(CrosswordCompilerDownloader): def __init__(self, **kwargs): super().__init__(**kwargs) - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return 'puzzlesociety.com' in url_components.netloc and 'modern-crossword' in url_components.path def find_by_date(self, dt): diff --git a/xword_dl/downloader/wsjdownloader.py b/xword_dl/downloader/wsjdownloader.py index a96714a..91b2da3 100644 --- a/xword_dl/downloader/wsjdownloader.py +++ b/xword_dl/downloader/wsjdownloader.py @@ -19,8 +19,8 @@ class WSJDownloader(BaseDownloader): def __init__(self, **kwargs): super().__init__(headers={'User-Agent': 'xword-dl'}, **kwargs) - @staticmethod - def matches_url(url_components): + @classmethod + def matches_url(cls, url_components): return False # disabling, see above # 'wsj.com' in url_components.netloc def find_latest(self): @@ -54,6 +54,9 @@ def find_solver(self, url): puzzle_link = iframe['src'] + if isinstance(puzzle_link, list): + puzzle_link = puzzle_link[0] + return self.find_solver(puzzle_link) def fetch_data(self, solver_url): diff --git a/xword_dl/util/utils.py b/xword_dl/util/utils.py index 9393785..9c2fbb5 100644 --- a/xword_dl/util/utils.py +++ b/xword_dl/util/utils.py @@ -5,6 +5,7 @@ import emoji import unidecode as _unidecode import yaml +from puz import Puzzle from unidecode import unidecode from html2text import html2text @@ -27,7 +28,7 @@ class XWordDLException(Exception): pass -def save_puzzle(puzzle, filename): +def save_puzzle(puzzle: Puzzle, filename: str): if not os.path.exists(filename): puzzle.save(filename) msg = ("Puzzle downloaded and saved as {}.".format(filename) @@ -38,10 +39,10 @@ def save_puzzle(puzzle, filename): print("Not saving: a file named {} already exists.".format(filename), file=sys.stderr) -def join_bylines(l, and_word="&"): +def join_bylines(l: list[str], and_word="&"): return ', '.join(l[:-1]) + f', {and_word} ' + l[-1] if len(l) > 2 else f' {and_word} '.join(l) -def remove_invalid_chars_from_filename(filename): +def remove_invalid_chars_from_filename(filename: str): invalid_chars = r'<>:"/\|?*' for char in invalid_chars: @@ -50,7 +51,7 @@ def remove_invalid_chars_from_filename(filename): return filename -def cleanup(field, preserve_html=False): +def cleanup(field: str, preserve_html=False): if preserve_html: field = unidecode(emoji.demojize(field)).strip() else: @@ -59,7 +60,7 @@ def cleanup(field, preserve_html=False): return field -def sanitize_for_puzfile(puzzle, preserve_html=False): +def sanitize_for_puzfile(puzzle: Puzzle, preserve_html=False) -> Puzzle: puzzle.title = cleanup(puzzle.title, preserve_html) puzzle.author = cleanup(puzzle.author, preserve_html) puzzle.copyright = cleanup(puzzle.copyright, preserve_html) @@ -71,11 +72,11 @@ def sanitize_for_puzfile(puzzle, preserve_html=False): return puzzle -def parse_date(entered_date): +def parse_date(entered_date: str): return dateparser.parse(entered_date, settings={'PREFER_DATES_FROM':'past'}) -def parse_date_or_exit(entered_date): +def parse_date_or_exit(entered_date: str): guessed_dt = parse_date(entered_date) if not guessed_dt: @@ -85,7 +86,7 @@ def parse_date_or_exit(entered_date): return guessed_dt -def update_config_file(heading, new_values_dict): +def update_config_file(heading: str, new_values_dict: dict): with open(CONFIG_PATH, 'r') as f: config = yaml.safe_load(f) or {} @@ -98,7 +99,7 @@ def update_config_file(heading, new_values_dict): 
yaml.dump(config, f) -def read_config_values(heading): +def read_config_values(heading: str): with open(CONFIG_PATH, 'r') as f: config = yaml.safe_load(f) or {} diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index 19a10d1..00dba90 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -10,6 +10,7 @@ import requests from bs4 import BeautifulSoup +from puz import Puzzle from .downloader import get_plugins from .util import XWordDLException, parse_date_or_exit, save_puzzle @@ -20,7 +21,7 @@ plugins = get_plugins() -def by_keyword(keyword, **kwargs): +def by_keyword(keyword: str, **kwargs) -> tuple[Puzzle, str]: selected_downloader = next( (d for d in get_supported_outlets() if d.command == keyword), None ) @@ -48,7 +49,7 @@ def by_keyword(keyword, **kwargs): return puzzle, filename -def by_url(url, **kwargs): +def by_url(url: str, **kwargs) -> tuple[Puzzle, str]: supported_downloaders = [d for d in get_supported_outlets(command_only=False) if hasattr(d, 'matches_url')] @@ -65,7 +66,7 @@ def by_url(url, **kwargs): else: dl, puzzle_url = parse_for_embedded_puzzle(url, **kwargs) - if dl: + if dl and puzzle_url: puzzle = dl.download(puzzle_url) else: raise XWordDLException('Unable to find a puzzle at {}.'.format(url)) @@ -75,7 +76,7 @@ def by_url(url, **kwargs): return puzzle, filename -def parse_for_embedded_puzzle(url, **kwargs): +def parse_for_embedded_puzzle(url: str, **kwargs): supported_downloaders = [ d for d in get_supported_outlets(command_only=False) @@ -98,6 +99,8 @@ def parse_for_embedded_puzzle(url, **kwargs): for src in sources: for dlr in supported_downloaders: puzzle_url = dlr.matches_embed_url(src) + # TODO: would it be better to just return a URL and have controller + # request this from the plugin via normal methods? if puzzle_url is not None: return (dlr(), puzzle_url) From 458bcb6b0e3eaf7b11473107b9d0d7f7121c3c0b Mon Sep 17 00:00:00 2001 From: Adam Fontenot Date: Sun, 11 Aug 2024 22:40:11 -0400 Subject: [PATCH 20/20] Make __get_subclasses generic More appropriate as this function isn't specific to the downloader types. --- xword_dl/downloader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xword_dl/downloader/__init__.py b/xword_dl/downloader/__init__.py index 8d5451a..260b800 100644 --- a/xword_dl/downloader/__init__.py +++ b/xword_dl/downloader/__init__.py @@ -4,7 +4,7 @@ from .basedownloader import BaseDownloader as __bd -def __get_subclasses(cls: Type[__bd]): +def __get_subclasses[T](cls: Type[T]) -> list[Type[T]]: """Recursively returns a list of subclasses of `cls` in imported namespaces.""" return [cls] + [ r_cls