From 24047db6f6cb27790a15d1a8b44637a0b3383e7f Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:16:22 -0500 Subject: [PATCH 01/13] Updated developer notes and fixed unit test --- DEVELOPERNOTES.rst | 42 +++++++++++++++++++++++++++++++++- ppa/archive/tests/test_gale.py | 2 +- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/DEVELOPERNOTES.rst b/DEVELOPERNOTES.rst index 9f39ab40..3236017f 100644 --- a/DEVELOPERNOTES.rst +++ b/DEVELOPERNOTES.rst @@ -1,6 +1,46 @@ Troubleshooting =============== +Local Solr setup +---------------- +Install Solr via `brew `:: + + brew install solr + +Copy the Solr config files in as a configset named `ppa`:: + + cp -r solr_conf /opt/homebrew/opt/solr/server/solr/configsets/ppa + +Create symbolic link to configsets in the Solr home directory:: + + ln -s /opt/homebrew/opt/solr/server/solr/configsets /opt/homebrew/var/lib/solr/ + +Create a new core with the `ppa` configset (Solr must be running):: + + curl "http://localhost:8983/solr/admin/cores?action=CREATE&name=ppa&configSet=ppa" + +When the configset has changed, copy in the updated Solr config files:: + + cp solr_conf/* /opt/homewbrew/var/lib/solr/configsets/ppa/ + +Start Solr by running the following command:: + + /opt/homebrew/opt/solr/bin/solr start -f + + +Local PostgreSQL +---------------- +Install PostgreSQL via `brew `:: + + brew install postgresql@15 + +Start PostgreSQL (or restart after an ugrade):: + + brew services start postgresql@15 + +Add PostgreSQL to your PATH:: + + echo 'export PATH="/opt/homebrew/opt/postgresql@15/bin:$PATH"' >> ~/.zshrc Solr setup with Docker @@ -92,7 +132,7 @@ To replace a local development database with a dump of production data:: psql -d postgres -c "DROP DATABASE cdh_ppa;" psql -d postgres -c "CREATE DATABASE cdh_ppa;" - psql -d postgres -U cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql + psql cdh_ppa < 
data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql Updating Wagtail test fixture diff --git a/ppa/archive/tests/test_gale.py b/ppa/archive/tests/test_gale.py index c15f730f..b6dd9a0d 100644 --- a/ppa/archive/tests/test_gale.py +++ b/ppa/archive/tests/test_gale.py @@ -272,7 +272,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests): # Set up get_local_ocr so that only the 3rd page's text is found mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"] page_data = list(gale_api.get_item_pages(item_id)) - mock_get_item.called_once() + mock_get_item.assert_called_once() assert mock_get_local_ocr.call_count == 3 assert len(page_data) == 3 assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"] From 0ff502eaec0c4077cd01a2dcdb8e0ecca7315a75 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:17:25 -0500 Subject: [PATCH 02/13] Add ruff to dev requirements --- dev-requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index dffb9f5e..61d2e22b 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,4 +4,5 @@ pytest-django>=4.5.2 pytest-cov django-debug-toolbar sphinx>=7.2 -pre-commit \ No newline at end of file +pre-commit +ruff From 1a64204d9665d29ecbe1a53cf8e2dfbfc11ac5fb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:18:00 -0500 Subject: [PATCH 03/13] Initial command for fetching hathi images. 
Note this currently relies on a feature branch of ppa-nlp --- .../management/commands/hathi_images.py | 165 ++++++++++++++++++ requirements.txt | 4 +- 2 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 ppa/archive/management/commands/hathi_images.py diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py new file mode 100644 index 00000000..ba18881c --- /dev/null +++ b/ppa/archive/management/commands/hathi_images.py @@ -0,0 +1,165 @@ +import requests +from pathlib import Path +from time import sleep + +import progressbar +from django.core.management.base import BaseCommand, CommandError +from django.template.defaultfilters import pluralize +from corppa.utils.path_utils import encode_htid, get_vol_dir + +from ppa.archive.models import DigitizedWork +from ppa.archive.templatetags.ppa_tags import page_image_url + + +class Command(BaseCommand): + """ + Download HathiTrust page image data via image server + + Note: Excerpts cannot be specified individually, only by source (collectively) + """ + help = __doc__ + #: normal verbosity level + v_normal = 1 + verbosity = v_normal + #: crawl delay (in seconds) + crawl_delay=1 + def add_arguments(self, parser): + parser.add_argument( + "out", + type=Path, + help="Top-level output directory") + parser.add_argument( + "--htids", + nargs="*", + help="Optional list HathiTrust ids to download", + ) + parser.add_argument( + "--progress", + action="store_true", + help="Display progress bars to track download progress", + default=True, + ) + + def download_image(self, page_url: str, out_file: Path) -> None: + response = requests.get(page_url) + if response.status_code == requests.codes.ok: + with out_file.open(mode="wb") as writer: + writer.write(response.content) + # Apply crawl delay after request + sleep(self.crawl_delay) + + + def handle(self, *args, **kwargs): + self.verbosity = kwargs.get("verbosity", self.v_normal) + self.options = kwargs + + # validate 
output directory + if not kwargs["out"]: + raise CommandError("An output directory must be specified") + output_dir = kwargs["out"] + if not output_dir.is_dir(): + raise CommandError( + f"Output directory '{output_dir}' does not exist or is not a directory" + ) + + # use ids specified via command line when present + htids = kwargs.get("htids", []) + + # by default, sync data for all non-suppressed hathi source ids + digworks = DigitizedWork.objects.filter( + status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI + ) + + # if htids are specified via parameter, use them to filter + # the queryset, to ensure we only sync records that are + # in the database and not suppressed + if htids: + digworks = digworks.filter(source_id__in=htids) + + # bail out if there's nothing to do + # (e.g., explicit htids only and none valid) + if not digworks.exists(): + self.stdout.write("No records to download; stopping") + return + + # setup main progress bar + overall_progress = None + if self.options["progress"]: + overall_progress = progressbar.ProgressBar( + redirect_stdout=True, max_value=digworks.count(), max_error=False + ) + overall_progress.start() + + self.stdout.write( + f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + ) + + for digwork in digworks: + vol_id = digwork.source_id + + # Determine output volume & thumbnail directories (create as needed) + vol_dir = output_dir / get_vol_dir(vol_id) + vol_dir.mkdir(parents=True, exist_ok=True) + thumbnail_dir = vol_dir / "thumbnails" + thumbnail_dir.mkdir(exist_ok=True) + + # Get filename-friendly version of htid + clean_htid = encode_htid(vol_id) + + # Determine page range + if digwork.item_type == DigitizedWork.FULL: + page_range = range(1, digwork.page_count+1) + else: + page_range = digwork.page_span + + # Setup volume-level progress bar + volume_progress = None + if self.options["progress"]: + volume_progress = progressbar.ProgressBar( + redirect_stdout=True, max_value=len(page_range), 
max_error=False + ) + volume_progress.start() + + # Fetch images + stats = { + "image": {"fetch": 0, "skip": 0}, + "thumbnail": {"fetch": 0, "skip": 0} + } + for page_num in page_range: + image_name = f"{clean_htid}.{page_num:08d}.jpg" + + # Fetch thumbnail if file does not exist + page_thumbnail = thumbnail_dir / image_name + if not page_thumbnail.is_file(): + thumbnail_url = page_image_url(vol_id, page_num, 250) + self.download_image(thumbnail_url, page_thumbnail) + stats["thumbnail"]["fetch"] += 1 + else: + stats["thumbnail"]["skip"] += 1 + + # Fetch "full" image if file does not exist + page_image = vol_dir / image_name + if not page_image.is_file(): + image_url = page_image_url(vol_id, page_num, 800) + #self.download_image(image_url, page_image) + stats["image"]["fetch"] += 1 + else: + stats["image"]["skip"] += 1 + + # Update volume-specific progress bar + if volume_progress: + volume_progress.increment() + # Finish volume-specific progress bar + if volume_progress: + volume_progress.finish() + self.stdout.write( + f"{vol_id}: Fetched {stats['image']['fetch']} images & " + f"{stats['thumbnail']['fetch']} thumbnails; " + f"Skipped {stats['image']['skip']} images & " + f"{stats['thumbnail']['skip']} thumbnails" + ) + # Update overall progress bar + if overall_progress: + overall_progress.increment() + if overall_progress: + overall_progress.finish() diff --git a/requirements.txt b/requirements.txt index f612502f..ef2a33b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,6 @@ psycopg2-binary multiprocess django-split-settings # only needed for the 'generate_textcorpus' manage command -orjsonl \ No newline at end of file +orjsonl +# TODO: Switch to develop once feature branch is merged +git+https://github.com/Princeton-CDH/ppa-nlp@feature/hathi-paths#egg=corppa From 9c185e2cc445402786fce4fdb244f1fb200e8897 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:43:58 -0500 Subject: [PATCH 
04/13] Update ppa/archive/management/commands/hathi_images.py Co-authored-by: Rebecca Sutton Koeser --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index ba18881c..117f6413 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -31,7 +31,7 @@ def add_arguments(self, parser): parser.add_argument( "--htids", nargs="*", - help="Optional list HathiTrust ids to download", + help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)", ) parser.add_argument( "--progress", From a617d9439a708461905e07cef9a63c549f989ae9 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:44:11 -0500 Subject: [PATCH 05/13] Update ppa/archive/management/commands/hathi_images.py Co-authored-by: Rebecca Sutton Koeser --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 117f6413..06588aab 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -65,7 +65,7 @@ def handle(self, *args, **kwargs): # use ids specified via command line when present htids = kwargs.get("htids", []) - # by default, sync data for all non-suppressed hathi source ids + # by default, download images for all non-suppressed hathi source ids digworks = DigitizedWork.objects.filter( status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI ) From 9106cd57384944293ef8895a8877f1327beed7bb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:55:28 -0500 Subject: [PATCH 06/13] Update corppa dependency to develop branch 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ef2a33b3..63f24f85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,4 @@ django-split-settings # only needed for the 'generate_textcorpus' manage command orjsonl # TODO: Switch to develop once feature branch is merged -git+https://github.com/Princeton-CDH/ppa-nlp@feature/hathi-paths#egg=corppa +git+https://github.com/Princeton-CDH/ppa-nlp@develop#egg=corppa From 3776d4444eeb4426555cdc306f4f722548a9a81f Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:30:31 -0500 Subject: [PATCH 07/13] Updated models.py to pass unit tests locally Appears to be some issue where None is being return instead of 0 --- ppa/archive/models.py | 50 ++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 12cb091b..e2484b92 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -180,8 +180,7 @@ class ProtectedWorkField(models.Field): :class:`ProtectedWorkFieldFlags` object and stores as integer.""" description = ( - "A field that stores an instance of :class:`ProtectedWorkFieldFlags` " - "as an integer." + "A field that stores an instance of :class:`ProtectedWorkFieldFlags` as an integer." 
) def __init__(self, verbose_name=None, name=None, **kwargs): @@ -221,9 +220,7 @@ def collection_save(sender, instance, **kwargs): # if the collection has any works associated works = instance.digitizedwork_set.all() if works.exists(): - logger.debug( - "collection save, reindexing %d related works", works.count() - ) + logger.debug(f"collection save, reindexing {works.count()} related works") DigitizedWork.index_items(works) @staticmethod @@ -234,7 +231,7 @@ def collection_delete(sender, instance, **kwargs): digwork_ids = instance.digitizedwork_set.values_list("id", flat=True) # find the items based on the list of ids to reindex digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids)) - logger.debug("collection delete, reindexing %d works" % len(digworks)) + logger.debug(f"collection delete, reindexing {len(digworks)} works") # NOTE: this sends pre/post clear signal, but it's not obvious # how to take advantage of that @@ -252,11 +249,11 @@ def cluster_save(sender, instance, **kwargs): works = instance.digitizedwork_set.all() if works.exists(): # get a total of page count for affected works - page_count = works.aggregate(page_count=models.Sum("page_count")) + page_count = works.aggregate(page_count=models.Sum("page_count", default=0)) logger.debug( "cluster id has changed, reindexing %d works and %d pages", works.count(), - page_count.get("page_count", 0), + page_count["page_count"], ) DigitizedWork.index_items(works) # reindex pages (this may be slow...) 
@@ -272,11 +269,11 @@ def cluster_delete(sender, instance, **kwargs): # find the items based on the list of ids to reindex digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids)) # get a total of page count for affected works - page_count = digworks.aggregate(page_count=models.Sum("page_count")) + page_count = digworks.aggregate(page_count=models.Sum("page_count", default=0)) logger.debug( "cluster delete, reindexing %d works and %d pages", digworks.count(), - page_count.get("page_count", 0), + page_count["page_count"], ) # NOTE: this sends pre/post clear signal, but it's not obvious @@ -295,7 +292,7 @@ def handle_digwork_cluster_change(sender, instance, **kwargs): logger.debug( "Cluster changed for %s; indexing %d pages", instance, - instance.page_count, + instance.page_count or 0, ) instance.index_items(Page.page_index_data(instance)) @@ -425,9 +422,7 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): collections = models.ManyToManyField(Collection, blank=True) #: optional cluster for aggregating works - cluster = models.ForeignKey( - Cluster, blank=True, null=True, on_delete=models.SET_NULL - ) + cluster = models.ForeignKey(Cluster, blank=True, null=True, on_delete=models.SET_NULL) #: date added to the archive added = models.DateTimeField(auto_now_add=True) @@ -629,9 +624,7 @@ def save(self, *args, **kwargs): # if excerpt page range has changed # OR this is a new record with a page range - if self.has_changed("pages_digital") or ( - self.pk is None and self.pages_digital - ): + if self.has_changed("pages_digital") or (self.pk is None and self.pages_digital): # update the page count if possible (i.e., not a Gale record) self.page_count = self.count_pages() # if page range changed on existing record, clear out old index @@ -765,9 +758,7 @@ def metadata_from_marc(self, marc_record, populate=True): field_data["sort_title"] = marc_record.title()[non_sort:].strip(' "[') field_data["author"] = marc_record.author() or "" # remove a note present on some 
records and strip whitespace - field_data["author"] = ( - field_data["author"].replace("[from old catalog]", "").strip() - ) + field_data["author"] = field_data["author"].replace("[from old catalog]", "").strip() # removing trailing period, except when it is part of an # initial or known abbreviation (i.e, Esq.) # Look for single initial, but support initials with no spaces @@ -815,9 +806,7 @@ def metadata_from_marc(self, marc_record, populate=True): # *only* if they wrap the whole text for field in ["publisher", "pub_place"]: if field in field_data: - field_data[field] = re.sub( - r"^\[(.*)\]$", r"\1", field_data[field] - ).strip() + field_data[field] = re.sub(r"^\[(.*)\]$", r"\1", field_data[field]).strip() if populate: # conditionally update fields that are protected (or not) @@ -874,9 +863,7 @@ def populate_from_bibdata(self, bibdata): "post_save": SignalHandlers.cluster_save, "pre_delete": SignalHandlers.cluster_delete, }, - "archive.DigitizedWork": { - "post_save": SignalHandlers.handle_digwork_cluster_change - }, + "archive.DigitizedWork": {"post_save": SignalHandlers.handle_digwork_cluster_change}, } def first_page(self): @@ -1027,11 +1014,7 @@ def count_pages(self, ptree_client=None): # some aggregate packages retrieved from Data API # include jp2 and xml files as well as txt; only count text page_count = len( - [ - filename - for filename in ht_zip.namelist() - if filename.endswith(".txt") - ] + [filename for filename in ht_zip.namelist() if filename.endswith(".txt")] ) logger.debug( "Counted %d pages in zipfile in %f sec", page_count, time.time() - start @@ -1072,8 +1055,7 @@ def get_metadata(self, metadata_format): return record.as_marc() except MARCRecordNotFound: logger.warning( - "MARC record for %s/%s not found" - % (self.source_id, self.record_id) + f"MARC record for {self.source_id}/{self.record_id} not found" ) return "" @@ -1088,7 +1070,7 @@ def get_source_link_label(self): return "View on Gale Primary Sources" if self.source == 
DigitizedWork.OTHER: return "View external record" - return "View on %s" % self.get_source_display() + return f"View on {self.get_source_display()}" @staticmethod def add_from_hathi(htid, bib_api=None, update=False, log_msg_src=None, user=None): From 85bc6b14ca3a06ebc20272919dbd864a8629ce30 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:55:51 -0500 Subject: [PATCH 08/13] Refactored hathi_images command & added unit tests --- .../management/commands/hathi_images.py | 234 ++++++++++++------ ppa/archive/tests/test_hathi_images.py | 98 ++++++++ 2 files changed, 253 insertions(+), 79 deletions(-) create mode 100644 ppa/archive/tests/test_hathi_images.py diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 06588aab..785b858c 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -1,6 +1,14 @@ +""" +**hathi_images** is a custom manage command for downloading both full-size +and thumbnail page images for a list of HathiTrust volumes. 
+""" +import argparse +from collections import Counter +from collections.abc import Iterable import requests from pathlib import Path from time import sleep +from typing import Self import progressbar from django.core.management.base import BaseCommand, CommandError @@ -11,6 +19,43 @@ from ppa.archive.templatetags.ppa_tags import page_image_url +class DownloadStats: + ACTION_TYPES = {"fetch", "skip"} + def __init__(self): + # Stats for full size images + self.full = Counter() + # Stats for thumbnail images + self.thumbnail = Counter() + + def _log_action(self, image_type: str, action: str) -> None: + if action not in self.ACTION_TYPES: + raise ValueError(f"Unknown action type '{action}'") + if image_type == "full": + self.full[action] += 1 + elif image_type == "thumbnail": + self.thumbnail[action] += 1 + else: + raise ValueError(f"Unknown image type '{image_type}'") + + def log_download(self, image_type: str) -> None: + self._log_action(image_type, "fetch") + + def log_skip(self, image_type: str) -> None: + self._log_action(image_type, "skip") + + def update(self, other: Self) -> None: + self.full.update(other.full) + self.thumbnail.update(other.thumbnail) + + def get_report(self) -> str: + return ( + f"Fetched {self.full['fetch']} images & " + f"{self.thumbnail['fetch']} thumbnails; " + f"Skipped {self.full['skip']} images & " + f"{self.thumbnail['skip']} thumbnails" + ) + + class Command(BaseCommand): """ Download HathiTrust page image data via image server @@ -21,46 +66,128 @@ class Command(BaseCommand): #: normal verbosity level v_normal = 1 verbosity = v_normal - #: crawl delay (in seconds) - crawl_delay=1 + + # Argument parsing def add_arguments(self, parser): + """ + Configure additional CLI arguments + """ parser.add_argument( - "out", + "output_dir", type=Path, - help="Top-level output directory") + help="Top-level output directory" + ) parser.add_argument( "--htids", - nargs="*", + nargs="+", help="Optional list of HathiTrust ids (by default, downloads 
images for all public HathiTrust volumes)", ) + parser.add_argument( + "--crawl-delay", + type=int, + help="Delay to be applied between each download in seconds. Default: 1", + default=1, + ) + parser.add_argument( + "--image-width", + type=int, + help="Width for full-size images in pixels. Default: 800", + default=800, + ) + parser.add_argument( + "--thumbnail-width", + type=int, + help="Width for thumbnail images in pixels. Must be at most 250 pixels. Default: 250", + default=250, + ) parser.add_argument( "--progress", - action="store_true", + action=argparse.BooleanOptionalAction, help="Display progress bars to track download progress", default=True, ) - - def download_image(self, page_url: str, out_file: Path) -> None: + + def download_image(self, page_url: str, out_file: Path) -> bool: response = requests.get(page_url) + success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) + success = True + else: + if self.verbosity > self.v_normal: + self.stdout(f"Warning: Failed to fetch image {out_file.name}") # Apply crawl delay after request sleep(self.crawl_delay) + return success + + + def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadStats: + # Determine output volume & thumbnail directories (create as needed) + vol_dir = self.output_dir / get_vol_dir(vol_id) + vol_dir.mkdir(parents=True, exist_ok=True) + thumbnail_dir = vol_dir / "thumbnails" + thumbnail_dir.mkdir(exist_ok=True) + + # Get filename-friendly version of htid + clean_htid = encode_htid(vol_id) + + # Setup volume-level progress bar + volume_progress = None + if self.show_progress: + volume_progress = progressbar.ProgressBar( + line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False + ) + volume_progress.start() + + # Fetch images + stats = DownloadStats() + for page_num in page_range: + image_name = f"{clean_htid}.{page_num:08d}.jpg" + + # Fetch thumbnail if file does not 
exist + page_thumbnail = thumbnail_dir / image_name + if not page_thumbnail.is_file(): + thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width) + success = self.download_image(thumbnail_url, page_thumbnail) + # TODO: Should we log something different if the download fails? + stats.log_download("thumbnail") + else: + stats.log_skip("thumbnail") + + # Fetch "full" image if file does not exist + page_image = vol_dir / image_name + if not page_image.is_file(): + image_url = page_image_url(vol_id, page_num, self.full_width) + success = self.download_image(image_url, page_image) + stats.log_download("full") + else: + stats.log_skip("full") + + # Update volume-specific progress bar + if volume_progress: + volume_progress.increment() + # Finish volume-specific progress bar + if volume_progress: + volume_progress.finish() + return stats def handle(self, *args, **kwargs): - self.verbosity = kwargs.get("verbosity", self.v_normal) - self.options = kwargs - - # validate output directory - if not kwargs["out"]: - raise CommandError("An output directory must be specified") - output_dir = kwargs["out"] - if not output_dir.is_dir(): + self.output_dir = kwargs["output_dir"] + self.crawl_delay = kwargs["crawl_delay"] + self.full_width = kwargs["image_width"] + self.thumbnail_width = kwargs["thumbnail_width"] + self.verbosity = kwargs.get("verbosity", self.verbosity) + self.show_progress = kwargs["progress"] + + # Validate input arguments + if not self.output_dir.is_dir(): raise CommandError( - f"Output directory '{output_dir}' does not exist or is not a directory" + f"Output directory '{self.output_dir}' does not exist or is not a directory" ) + if self.thumbnail_width > 250: + raise CommandError(f"Thumbnail width cannot be more than 250 pixels") # use ids specified via command line when present htids = kwargs.get("htids", []) @@ -82,84 +209,33 @@ def handle(self, *args, **kwargs): self.stdout.write("No records to download; stopping") return + self.stdout.write( + 
f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + ) + # setup main progress bar overall_progress = None - if self.options["progress"]: + if self.show_progress: overall_progress = progressbar.ProgressBar( - redirect_stdout=True, max_value=digworks.count(), max_error=False + line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False ) overall_progress.start() - self.stdout.write( - f"Downloading images for {digworks.count()} record{pluralize(digworks)}" - ) - + overall_stats = DownloadStats() for digwork in digworks: vol_id = digwork.source_id - - # Determine output volume & thumbnail directories (create as needed) - vol_dir = output_dir / get_vol_dir(vol_id) - vol_dir.mkdir(parents=True, exist_ok=True) - thumbnail_dir = vol_dir / "thumbnails" - thumbnail_dir.mkdir(exist_ok=True) - - # Get filename-friendly version of htid - clean_htid = encode_htid(vol_id) - # Determine page range if digwork.item_type == DigitizedWork.FULL: page_range = range(1, digwork.page_count+1) else: page_range = digwork.page_span - # Setup volume-level progress bar - volume_progress = None - if self.options["progress"]: - volume_progress = progressbar.ProgressBar( - redirect_stdout=True, max_value=len(page_range), max_error=False - ) - volume_progress.start() - - # Fetch images - stats = { - "image": {"fetch": 0, "skip": 0}, - "thumbnail": {"fetch": 0, "skip": 0} - } - for page_num in page_range: - image_name = f"{clean_htid}.{page_num:08d}.jpg" - - # Fetch thumbnail if file does not exist - page_thumbnail = thumbnail_dir / image_name - if not page_thumbnail.is_file(): - thumbnail_url = page_image_url(vol_id, page_num, 250) - self.download_image(thumbnail_url, page_thumbnail) - stats["thumbnail"]["fetch"] += 1 - else: - stats["thumbnail"]["skip"] += 1 - - # Fetch "full" image if file does not exist - page_image = vol_dir / image_name - if not page_image.is_file(): - image_url = page_image_url(vol_id, page_num, 800) - 
#self.download_image(image_url, page_image) - stats["image"]["fetch"] += 1 - else: - stats["image"]["skip"] += 1 - - # Update volume-specific progress bar - if volume_progress: - volume_progress.increment() - # Finish volume-specific progress bar - if volume_progress: - volume_progress.finish() - self.stdout.write( - f"{vol_id}: Fetched {stats['image']['fetch']} images & " - f"{stats['thumbnail']['fetch']} thumbnails; " - f"Skipped {stats['image']['skip']} images & " - f"{stats['thumbnail']['skip']} thumbnails" - ) + vol_stats = self.download_volume_images(vol_id, page_range) + overall_stats.update(vol_stats) # Update overall progress bar if overall_progress: overall_progress.increment() if overall_progress: overall_progress.finish() + self.stdout.write("\n\n") # To avoid overwriting progress bars + self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py new file mode 100644 index 00000000..1755f9c5 --- /dev/null +++ b/ppa/archive/tests/test_hathi_images.py @@ -0,0 +1,98 @@ +from unittest.mock import call, patch + +import pytest +import requests + + +from ppa.archive.templatetags.ppa_tags import page_image_url +from ppa.archive.management.commands.hathi_images import ( + DownloadStats, +) + + +class TestDownloadStats: + def check_stats( + self, + stats: DownloadStats, + full_fetch: int, + full_skip: int, + thumbnail_fetch: int, + thumbnail_skip: int, + ) -> None: + """Helper function to check stats""" + assert stats.full["fetch"] == full_fetch + assert stats.full["skip"] == full_skip + assert stats.thumbnail["fetch"] == thumbnail_fetch + assert stats.thumbnail["skip"] == thumbnail_skip + + def test_init(self): + stats = DownloadStats() + self.check_stats(stats, 0, 0, 0, 0) + + def test_log_action(self): + stats = DownloadStats() + # unknown action type + with pytest.raises(ValueError, match="Unknown action type 'bad_action'"): + stats._log_action("image_type", 
"bad_action") + + # unknown image type + with pytest.raises(ValueError, match="Unknown image type 'image_type'"): + stats._log_action("image_type", "fetch") + + # Add one to each image type & action + stats._log_action("full", "fetch") + self.check_stats(stats, 1, 0, 0, 0) + stats._log_action("full", "skip") + self.check_stats(stats, 1, 1, 0, 0) + stats._log_action("thumbnail", "fetch") + self.check_stats(stats, 1, 1, 1, 0) + stats._log_action("thumbnail", "skip") + self.check_stats(stats, 1, 1, 1, 1) + + # Add another one to each image type & action + stats._log_action("thumbnail", "skip") + self.check_stats(stats, 1, 1, 1, 2) + stats._log_action("full", "skip") + self.check_stats(stats, 1, 2, 1, 2) + stats._log_action("full", "fetch") + self.check_stats(stats, 2, 2, 1, 2) + stats._log_action("thumbnail", "fetch") + self.check_stats(stats, 2, 2, 2, 2) + + @patch.object(DownloadStats, "_log_action") + def test_log_download(self, mock_log_action): + stats = DownloadStats() + stats.log_download("image_type") + mock_log_action.called_once_with("image_type", "fetch") + + @patch.object(DownloadStats, "_log_action") + def test_log_skip(self, mock_log_action): + stats = DownloadStats() + stats.log_download("image_type") + mock_log_action.called_once_with("image_type", "skip") + + def test_update(self): + stats_a = DownloadStats() + stats_b = DownloadStats() + stats_b.full.update({"fetch": 5, "skip": 1}) + stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + stats_a.update(stats_b) + self.check_stats(stats_a, 5, 1, 3, 2) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + stats_a.update(stats_b) + self.check_stats(stats_a, 10, 2, 6, 4) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + def test_report(self): + stats_a = DownloadStats() + report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails" + assert stats_a.get_report() == report_a + + stats_b = DownloadStats() + stats_b.full.update({"fetch": 5, "skip": 1}) + 
stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails" + assert stats_b.get_report() == report_b From 0f3880a730403d9045492c977bba7f18530dd273 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:50:16 -0500 Subject: [PATCH 09/13] Update local settings for parasolr --- ppa/settings/local_settings.py.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample index 0ebf225c..28d86dc0 100644 --- a/ppa/settings/local_settings.py.sample +++ b/ppa/settings/local_settings.py.sample @@ -105,7 +105,7 @@ LOGGING = { 'handlers': ['console'], 'level': 'DEBUG' }, - 'SolrClient': { + 'parasolr': { 'handlers': ['console'], 'level': 'WARN' }, From 24c02f95d95c24b4aa74bd34aa6ed00b955012e0 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 11:19:23 -0500 Subject: [PATCH 10/13] Updated hathi images command * Added keyboard interrupt handling * Modified progress bar * Added logging --- .../management/commands/hathi_images.py | 174 +++++++++++------- ppa/archive/tests/test_hathi_images.py | 4 +- 2 files changed, 111 insertions(+), 67 deletions(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 785b858c..93e97989 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -5,12 +5,14 @@ import argparse from collections import Counter from collections.abc import Iterable +import logging import requests from pathlib import Path +import signal from time import sleep from typing import Self -import progressbar +from tqdm import tqdm from django.core.management.base import BaseCommand, CommandError from django.template.defaultfilters import pluralize from corppa.utils.path_utils 
import encode_htid, get_vol_dir @@ -18,9 +20,19 @@ from ppa.archive.models import DigitizedWork from ppa.archive.templatetags.ppa_tags import page_image_url +logger = logging.getLogger(__name__) + class DownloadStats: - ACTION_TYPES = {"fetch", "skip"} + # Support actions + ACTION_TYPES = {"fetch", "skip", "error"} + # Associated strings used for reporting + ACTION_STRS = { + "fetch": "Fetched", + "skip": "Skipped", + "error": "Missed", + } + def __init__(self): # Stats for full size images self.full = Counter() @@ -43,17 +55,25 @@ def log_download(self, image_type: str) -> None: def log_skip(self, image_type: str) -> None: self._log_action(image_type, "skip") + def log_error(self, image_type: str) -> None: + self._log_action(image_type, "error") + def update(self, other: Self) -> None: self.full.update(other.full) self.thumbnail.update(other.thumbnail) def get_report(self) -> str: - return ( - f"Fetched {self.full['fetch']} images & " - f"{self.thumbnail['fetch']} thumbnails; " - f"Skipped {self.full['skip']} images & " - f"{self.thumbnail['skip']} thumbnails" - ) + report = "" + for action in ["fetch", "skip", "error"]: + if action == "error": + # Only report errors when an error is present + if not self.full[action] and not self.thumbnail[action]: + continue + action_str = self.ACTION_STRS[action] + if report: + report += "\n" + report += f"{action_str}: {self.full[action]} images & {self.thumbnail[action]} thumbnails" + return report class Command(BaseCommand): @@ -63,9 +83,9 @@ class Command(BaseCommand): Note: Excerpts cannot be specified individually, only by source (collectively) """ help = __doc__ - #: normal verbosity level - v_normal = 1 - verbosity = v_normal + + # Interrupt flag to exit gracefully (i.e. 
between volumes) when a signal is caught + interrupted = False # Argument parsing def add_arguments(self, parser): @@ -106,17 +126,45 @@ def add_arguments(self, parser): help="Display progress bars to track download progress", default=True, ) - + + def interrupt_handler(self, signum, frame): + """ + For handling of SIGINT, as possible. For the first SIGINT, a flag is set + so that the command will exit after the current volume's image download + is complete. Additionally, the default signal handler is restored so a + second SIGINT will cause the command to immediately exit. + """ + if signum == signal.SIGINT: + # Restore default signal handler + signal.signal(signal.SIGINT, signal.SIG_DFL) + # Set interrupt flag + self.interrupted = True + self.stdout.write(self.style.WARNING( + "Command will exit once this volume's image download is " + "complete.\n Ctrl-C / Interrupt to quit immediately" + ) + ) + def download_image(self, page_url: str, out_file: Path) -> bool: response = requests.get(page_url) + # log response time + logger.debug(f"Response time: {response.elapsed.total_seconds()}") + self.stdout.write(str(response.headers)) success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) success = True - else: - if self.verbosity > self.v_normal: - self.stdout(f"Warning: Failed to fetch image {out_file.name}") + # For checking throttling rates + # TODO: Consider removing once crawl delays are determined + choke_str = "x-choke info:" + for choke_sfx in ['allowed', 'credit', 'delta', 'max', 'rate']: + header = f"x-choke-{choke_sfx}" + if header in response.headers: + choke_str += f"\n {header}: {response.headers[header]}" + logger.debug(choke_str) + elif response.status_code == 503: + logger.debug("WARNING: Received 503 status code. 
Throttling may have occurred") # Apply crawl delay after request sleep(self.crawl_delay) return success @@ -132,44 +180,31 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Get filename-friendly version of htid clean_htid = encode_htid(vol_id) - # Setup volume-level progress bar - volume_progress = None - if self.show_progress: - volume_progress = progressbar.ProgressBar( - line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False - ) - volume_progress.start() - # Fetch images stats = DownloadStats() for page_num in page_range: image_name = f"{clean_htid}.{page_num:08d}.jpg" - # Fetch thumbnail if file does not exist - page_thumbnail = thumbnail_dir / image_name - if not page_thumbnail.is_file(): - thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width) - success = self.download_image(thumbnail_url, page_thumbnail) - # TODO: Should we log something different if the download fails? - stats.log_download("thumbnail") - else: - stats.log_skip("thumbnail") - - # Fetch "full" image if file does not exist - page_image = vol_dir / image_name - if not page_image.is_file(): - image_url = page_image_url(vol_id, page_num, self.full_width) - success = self.download_image(image_url, page_image) - stats.log_download("full") - else: - stats.log_skip("full") - - # Update volume-specific progress bar - if volume_progress: - volume_progress.increment() - # Finish volume-specific progress bar - if volume_progress: - volume_progress.finish() + for image_type in ["full", "thumbnail"]: + image_dir = vol_dir if image_type == "full" else thumbnail_dir + image = image_dir / image_name + image_width = getattr(self, f"{image_type}_width") + + # Fetch image if file does not exist + if not image.is_file(): + image_url = page_image_url(vol_id, page_num, image_width) + success = self.download_image(image_url, image) + if success: + stats.log_download(image_type) + else: + stats.log_error(image_type) + logger.debug(f"Failed to 
download {image_type} image {image_name}") + else: + stats.log_skip(image_type) + + # Update progress bar + if self.show_progress: + self.progress_bar.update() return stats @@ -178,7 +213,6 @@ def handle(self, *args, **kwargs): self.crawl_delay = kwargs["crawl_delay"] self.full_width = kwargs["image_width"] self.thumbnail_width = kwargs["thumbnail_width"] - self.verbosity = kwargs.get("verbosity", self.verbosity) self.show_progress = kwargs["progress"] # Validate input arguments @@ -187,7 +221,7 @@ def handle(self, *args, **kwargs): f"Output directory '{self.output_dir}' does not exist or is not a directory" ) if self.thumbnail_width > 250: - raise CommandError(f"Thumbnail width cannot be more than 250 pixels") + raise CommandError("Thumbnail width cannot be more than 250 pixels") # use ids specified via command line when present htids = kwargs.get("htids", []) @@ -208,34 +242,44 @@ def handle(self, *args, **kwargs): if not digworks.exists(): self.stdout.write("No records to download; stopping") return + + # Bind handler for interrupt signal + signal.signal(signal.SIGINT, self.interrupt_handler) + n_vols = digworks.count() self.stdout.write( - f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + f"Downloading images for {n_vols} record{pluralize(digworks)}", ) - # setup main progress bar - overall_progress = None + # Initialize progress bar if self.show_progress: - overall_progress = progressbar.ProgressBar( - line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False - ) - overall_progress.start() - + self.progress_bar = tqdm() + overall_stats = DownloadStats() - for digwork in digworks: + for i, digwork in enumerate(digworks): vol_id = digwork.source_id # Determine page range if digwork.item_type == DigitizedWork.FULL: page_range = range(1, digwork.page_count+1) else: page_range = digwork.page_span - + + # Update progress bar + if self.show_progress: + self.progress_bar.reset(total=len(page_range)) + 
self.progress_bar.set_description( + f"{vol_id} ({i+1}/{n_vols})" + ) + vol_stats = self.download_volume_images(vol_id, page_range) overall_stats.update(vol_stats) # Update overall progress bar - if overall_progress: - overall_progress.increment() - if overall_progress: - overall_progress.finish() - self.stdout.write("\n\n") # To avoid overwriting progress bars + # Check if we need to exit early + if self.interrupted: + break + # Close progres bar + if self.show_progress: + self.progress_bar.close() + if self.interrupted: + self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed.")) self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py index 1755f9c5..cea33e21 100644 --- a/ppa/archive/tests/test_hathi_images.py +++ b/ppa/archive/tests/test_hathi_images.py @@ -88,11 +88,11 @@ def test_update(self): def test_report(self): stats_a = DownloadStats() - report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails" + report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails" assert stats_a.get_report() == report_a stats_b = DownloadStats() stats_b.full.update({"fetch": 5, "skip": 1}) stats_b.thumbnail.update({"fetch": 3, "skip": 2}) - report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails" + report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails" assert stats_b.get_report() == report_b From e18f2ce0e2f0693c25ea75bc8a96dbd2ce0ccdec Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:31:07 -0500 Subject: [PATCH 11/13] Update local settings to prevent log duplication --- ppa/settings/local_settings.py.sample | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample index 28d86dc0..3405b8db 100644 --- 
a/ppa/settings/local_settings.py.sample +++ b/ppa/settings/local_settings.py.sample @@ -103,11 +103,13 @@ LOGGING = { }, 'ppa': { 'handlers': ['console'], - 'level': 'DEBUG' + 'level': 'DEBUG', + 'propagate': False, }, 'parasolr': { 'handlers': ['console'], - 'level': 'WARN' + 'level': 'WARN', + 'propagate': False, }, } } From 51d1591549c74d825dfab3a89ecefa95860c446e Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:54:54 -0500 Subject: [PATCH 12/13] Removed crawl delay and added a few unit tests. --- .../management/commands/hathi_images.py | 69 +++++++------- ppa/archive/tests/test_hathi_images.py | 92 ++++++++++++++----- 2 files changed, 102 insertions(+), 59 deletions(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 93e97989..0553efa7 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -9,7 +9,7 @@ import requests from pathlib import Path import signal -from time import sleep +import time from typing import Self from tqdm import tqdm @@ -63,16 +63,22 @@ def update(self, other: Self) -> None: self.thumbnail.update(other.thumbnail) def get_report(self) -> str: + # No actions logged + if not self.full and not self.thumbnail: + return "No actions taken" + + # Report actions taken report = "" for action in ["fetch", "skip", "error"]: - if action == "error": - # Only report errors when an error is present - if not self.full[action] and not self.thumbnail[action]: - continue + # Only report action when it has been taken + if not self.full[action] and not self.thumbnail[action]: + continue action_str = self.ACTION_STRS[action] + n_full = self.full[action] + n_thumbnail = self.thumbnail[action] if report: report += "\n" - report += f"{action_str}: {self.full[action]} images & {self.thumbnail[action]} thumbnails" + report += f"{action_str}: {n_full} images & {n_thumbnail} 
thumbnails" return report @@ -102,12 +108,6 @@ def add_arguments(self, parser): nargs="+", help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)", ) - parser.add_argument( - "--crawl-delay", - type=int, - help="Delay to be applied between each download in seconds. Default: 1", - default=1, - ) parser.add_argument( "--image-width", type=int, @@ -141,39 +141,34 @@ def interrupt_handler(self, signum, frame): self.interrupted = True self.stdout.write(self.style.WARNING( "Command will exit once this volume's image download is " - "complete.\n Ctrl-C / Interrupt to quit immediately" + "complete.\nCtrl-C / Interrupt to quit immediately" ) ) def download_image(self, page_url: str, out_file: Path) -> bool: + """ + Attempts to download and save an image from the specified URL. + Returns a boolean corresponding to whether the download was successful + """ response = requests.get(page_url) - # log response time - logger.debug(f"Response time: {response.elapsed.total_seconds()}") - self.stdout.write(str(response.headers)) success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) success = True - # For checking throttling rates - # TODO: Consider removing once crawl delays are determined - choke_str = "x-choke info:" - for choke_sfx in ['allowed', 'credit', 'delta', 'max', 'rate']: - header = f"x-choke-{choke_sfx}" - if header in response.headers: - choke_str += f"\n {header}: {response.headers[header]}" - logger.debug(choke_str) elif response.status_code == 503: - logger.debug("WARNING: Received 503 status code. Throttling may have occurred") - # Apply crawl delay after request - sleep(self.crawl_delay) + logger.debug("Received 503 status code. 
Throttling may have occurred") return success def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadStats: - # Determine output volume & thumbnail directories (create as needed) + """ + For a given volume, download the pages corresponding to the provided page range. + """ + # Get volume directory vol_dir = self.output_dir / get_vol_dir(vol_id) vol_dir.mkdir(parents=True, exist_ok=True) + # Get volume's thumbnail directory thumbnail_dir = vol_dir / "thumbnails" thumbnail_dir.mkdir(exist_ok=True) @@ -182,6 +177,7 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Fetch images stats = DownloadStats() + start_time = time.time() for page_num in page_range: image_name = f"{clean_htid}.{page_num:08d}.jpg" @@ -205,12 +201,15 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Update progress bar if self.show_progress: self.progress_bar.update() + # Log volume page completion rates + duration = time.time() - start_time + page_rate = duration / len(page_range) + logger.debug(f"{vol_id}: Compelted in {duration:.2f}s ({page_rate:.2f} sec/page)") return stats def handle(self, *args, **kwargs): self.output_dir = kwargs["output_dir"] - self.crawl_delay = kwargs["crawl_delay"] self.full_width = kwargs["image_width"] self.thumbnail_width = kwargs["thumbnail_width"] self.show_progress = kwargs["progress"] @@ -257,6 +256,10 @@ def handle(self, *args, **kwargs): overall_stats = DownloadStats() for i, digwork in enumerate(digworks): + # Check if we need to exit early + if self.interrupted: + break + vol_id = digwork.source_id # Determine page range if digwork.item_type == DigitizedWork.FULL: @@ -271,15 +274,13 @@ def handle(self, *args, **kwargs): f"{vol_id} ({i+1}/{n_vols})" ) + # Download volume images & update overall stats vol_stats = self.download_volume_images(vol_id, page_range) overall_stats.update(vol_stats) - # Update overall progress bar - # Check if we need to exit early - if 
self.interrupted: - break - # Close progres bar + # Close progress bar if self.show_progress: self.progress_bar.close() + # If interrupted, report the number of volumes completed. if self.interrupted: self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed.")) self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py index cea33e21..3e79cfe8 100644 --- a/ppa/archive/tests/test_hathi_images.py +++ b/ppa/archive/tests/test_hathi_images.py @@ -1,19 +1,18 @@ -from unittest.mock import call, patch +from io import StringIO +from unittest.mock import Mock, call, patch import pytest import requests - +import signal from ppa.archive.templatetags.ppa_tags import page_image_url -from ppa.archive.management.commands.hathi_images import ( - DownloadStats, -) +from ppa.archive.management.commands import hathi_images class TestDownloadStats: def check_stats( self, - stats: DownloadStats, + stats: hathi_images.DownloadStats, full_fetch: int, full_skip: int, thumbnail_fetch: int, @@ -26,11 +25,11 @@ def check_stats( assert stats.thumbnail["skip"] == thumbnail_skip def test_init(self): - stats = DownloadStats() + stats = hathi_images.DownloadStats() self.check_stats(stats, 0, 0, 0, 0) def test_log_action(self): - stats = DownloadStats() + stats = hathi_images.DownloadStats() # unknown action type with pytest.raises(ValueError, match="Unknown action type 'bad_action'"): stats._log_action("image_type", "bad_action") @@ -59,23 +58,25 @@ def test_log_action(self): stats._log_action("thumbnail", "fetch") self.check_stats(stats, 2, 2, 2, 2) - @patch.object(DownloadStats, "_log_action") + @patch.object(hathi_images.DownloadStats, "_log_action") def test_log_download(self, mock_log_action): - stats = DownloadStats() + stats = hathi_images.DownloadStats() stats.log_download("image_type") mock_log_action.called_once_with("image_type", "fetch") - 
@patch.object(DownloadStats, "_log_action") + @patch.object(hathi_images.DownloadStats, "_log_action") def test_log_skip(self, mock_log_action): - stats = DownloadStats() + stats = hathi_images.DownloadStats() stats.log_download("image_type") mock_log_action.called_once_with("image_type", "skip") def test_update(self): - stats_a = DownloadStats() - stats_b = DownloadStats() - stats_b.full.update({"fetch": 5, "skip": 1}) - stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + stats_a = hathi_images.DownloadStats() + stats_b = hathi_images.DownloadStats() + stats_b.full["fetch"] = 5 + stats_b.full["skip"] = 1 + stats_b.thumbnail["fetch"] = 3 + stats_b.thumbnail["skip"] = 2 self.check_stats(stats_b, 5, 1, 3, 2 ) stats_a.update(stats_b) @@ -87,12 +88,53 @@ def test_update(self): self.check_stats(stats_b, 5, 1, 3, 2 ) def test_report(self): - stats_a = DownloadStats() - report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails" - assert stats_a.get_report() == report_a - - stats_b = DownloadStats() - stats_b.full.update({"fetch": 5, "skip": 1}) - stats_b.thumbnail.update({"fetch": 3, "skip": 2}) - report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails" - assert stats_b.get_report() == report_b + stats = hathi_images.DownloadStats() + assert stats.get_report() == "No actions taken" + + # Only actions that have occurred are reported + stats.full["fetch"] = 5 + report = "Fetched: 5 images & 0 thumbnails" + assert stats.get_report() == report + + stats.thumbnail["skip"] = 3 + report += "\nSkipped: 0 images & 3 thumbnails" + assert stats.get_report() == report + + stats.full["error"] = 1 + stats.thumbnail["error"] = 2 + report += "\nMissed: 1 images & 2 thumbnails" + assert stats.get_report() == report + + +class TestHathiImagesCommand: + @patch("signal.signal") + def test_interrupt_handler(self, mock_signal): + stdout = StringIO() + cmd = hathi_images.Command(stdout=stdout) + + cmd.interrupt_handler(signal.SIGINT, "frame") + 
mock_signal.assert_called_once_with(signal.SIGINT, signal.SIG_DFL) + assert cmd.interrupted + assert stdout.getvalue() == ( + "Command will exit once this volume's image download is complete.\n" + "Ctrl-C / Interrupt to quit immediately\n" + ) + + @patch("requests.get") + def test_download_image(self, mock_get, tmp_path): + cmd = hathi_images.Command() + + # Not ok status + mock_get.return_value = Mock(status_code=503) + result = cmd.download_image("page_url", "out_file") + mock_get.assert_called_once_with("page_url") + assert result is False + + # Ok status + out_file = tmp_path / "test.jpg" + mock_get.reset_mock() + mock_get.return_value = Mock(status_code=200, content=b"image content") + result = cmd.download_image("page_url", out_file) + mock_get.assert_called_once_with("page_url") + assert result is True + assert out_file.read_text() == "image content" From 4df4513ad65f2f89233550248c85ddb304a99fb1 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:33:05 -0500 Subject: [PATCH 13/13] Corrected typo --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 0553efa7..990dbea0 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -204,7 +204,7 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Log volume page completion rates duration = time.time() - start_time page_rate = duration / len(page_range) - logger.debug(f"{vol_id}: Compelted in {duration:.2f}s ({page_rate:.2f} sec/page)") + logger.debug(f"{vol_id}: Completed in {duration:.2f}s ({page_rate:.2f} sec/page)") return stats