diff --git a/DEVELOPERNOTES.rst b/DEVELOPERNOTES.rst
index 9f39ab40..3236017f 100644
--- a/DEVELOPERNOTES.rst
+++ b/DEVELOPERNOTES.rst
@@ -1,6 +1,46 @@
 Troubleshooting
 ===============
 
+Local Solr setup
+----------------
+Install Solr via `brew`::
+
+    brew install solr
+
+Copy the Solr config files in as a configset named `ppa`::
+
+    cp -r solr_conf /opt/homebrew/opt/solr/server/solr/configsets/ppa
+
+Create a symbolic link to the configsets directory in the Solr home directory::
+
+    ln -s /opt/homebrew/opt/solr/server/solr/configsets /opt/homebrew/var/lib/solr/
+
+Create a new core with the `ppa` configset (Solr must be running)::
+
+    curl "http://localhost:8983/solr/admin/cores?action=CREATE&name=ppa&configSet=ppa"
+
+When the configset has changed, copy in the updated Solr config files::
+
+    cp solr_conf/* /opt/homebrew/var/lib/solr/configsets/ppa/
+
+Start Solr by running the following command::
+
+    /opt/homebrew/opt/solr/bin/solr start -f
+
+
+Local PostgreSQL
+----------------
+Install PostgreSQL via `brew`::
+
+    brew install postgresql@15
+
+Start PostgreSQL (or restart after an upgrade)::
+
+    brew services start postgresql@15
+
+Add PostgreSQL to your PATH::
+
+    echo 'export PATH="/opt/homebrew/opt/postgresql@15/bin:$PATH"' >> ~/.zshrc
 
 
 Solr setup with Docker
@@ -92,7 +132,7 @@ To replace a local development database with a dump of production data::
 
     psql -d postgres -c "DROP DATABASE cdh_ppa;"
     psql -d postgres -c "CREATE DATABASE cdh_ppa;"
-    psql -d postgres -U cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql
+    psql cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql
 
 
 Updating Wagtail test fixture
diff --git a/dev-requirements.txt b/dev-requirements.txt
index dffb9f5e..61d2e22b 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -4,4 +4,5 @@ pytest-django>=4.5.2
 pytest-cov
 django-debug-toolbar
 sphinx>=7.2
-pre-commit
\ No newline at end of file
+pre-commit
+ruff
diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py
new file mode 100644
index 00000000..990dbea0
--- /dev/null
+++ b/ppa/archive/management/commands/hathi_images.py
@@ -0,0 +1,286 @@
+"""
+**hathi_images** is a custom manage command for downloading both full-size
+and thumbnail page images for a list of HathiTrust volumes.
+""" +import argparse +from collections import Counter +from collections.abc import Iterable +import logging +import requests +from pathlib import Path +import signal +import time +from typing import Self + +from tqdm import tqdm +from django.core.management.base import BaseCommand, CommandError +from django.template.defaultfilters import pluralize +from corppa.utils.path_utils import encode_htid, get_vol_dir + +from ppa.archive.models import DigitizedWork +from ppa.archive.templatetags.ppa_tags import page_image_url + +logger = logging.getLogger(__name__) + + +class DownloadStats: + # Support actions + ACTION_TYPES = {"fetch", "skip", "error"} + # Associated strings used for reporting + ACTION_STRS = { + "fetch": "Fetched", + "skip": "Skipped", + "error": "Missed", + } + + def __init__(self): + # Stats for full size images + self.full = Counter() + # Stats for thumbnail images + self.thumbnail = Counter() + + def _log_action(self, image_type: str, action: str) -> None: + if action not in self.ACTION_TYPES: + raise ValueError(f"Unknown action type '{action}'") + if image_type == "full": + self.full[action] += 1 + elif image_type == "thumbnail": + self.thumbnail[action] += 1 + else: + raise ValueError(f"Unknown image type '{image_type}'") + + def log_download(self, image_type: str) -> None: + self._log_action(image_type, "fetch") + + def log_skip(self, image_type: str) -> None: + self._log_action(image_type, "skip") + + def log_error(self, image_type: str) -> None: + self._log_action(image_type, "error") + + def update(self, other: Self) -> None: + self.full.update(other.full) + self.thumbnail.update(other.thumbnail) + + def get_report(self) -> str: + # No actions logged + if not self.full and not self.thumbnail: + return "No actions taken" + + # Report actions taken + report = "" + for action in ["fetch", "skip", "error"]: + # Only report action when it has been taken + if not self.full[action] and not self.thumbnail[action]: + continue + action_str = self.ACTION_STRS[action] + n_full = self.full[action] + n_thumbnail = self.thumbnail[action] + if report: + report += "\n" + report += f"{action_str}: {n_full} images & {n_thumbnail} thumbnails" + return report + + +class Command(BaseCommand): + """ + Download HathiTrust page image data via image server + + Note: Excerpts cannot be specified individually, only by source (collectively) + """ + help = __doc__ + + # Interrupt flag to exit gracefully (i.e. between volumes) when a signal is caught + interrupted = False + + # Argument parsing + def add_arguments(self, parser): + """ + Configure additional CLI arguments + """ + parser.add_argument( + "output_dir", + type=Path, + help="Top-level output directory" + ) + parser.add_argument( + "--htids", + nargs="+", + help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)", + ) + parser.add_argument( + "--image-width", + type=int, + help="Width for full-size images in pixels. Default: 800", + default=800, + ) + parser.add_argument( + "--thumbnail-width", + type=int, + help="Width for thumbnail images in pixels. Must be at most 250 pixels. Default: 250", + default=250, + ) + parser.add_argument( + "--progress", + action=argparse.BooleanOptionalAction, + help="Display progress bars to track download progress", + default=True, + ) + + def interrupt_handler(self, signum, frame): + """ + For handling of SIGINT, as possible. For the first SIGINT, a flag is set + so that the command will exit after the current volume's image download + is complete. 
+        Additionally, the default signal handler is restored so a second
+        SIGINT will cause the command to exit immediately.
+        """
+        if signum == signal.SIGINT:
+            # Restore default signal handler
+            signal.signal(signal.SIGINT, signal.SIG_DFL)
+            # Set interrupt flag
+            self.interrupted = True
+            self.stdout.write(self.style.WARNING(
+                "Command will exit once this volume's image download is "
+                "complete.\nCtrl-C / Interrupt to quit immediately"
+            )
+            )
+
+    def download_image(self, page_url: str, out_file: Path) -> bool:
+        """
+        Attempt to download and save an image from the specified URL.
+        Returns a boolean indicating whether the download was successful.
+        """
+        response = requests.get(page_url)
+        success = False
+        if response.status_code == requests.codes.ok:
+            with out_file.open(mode="wb") as writer:
+                writer.write(response.content)
+            success = True
+        elif response.status_code == 503:
+            logger.debug("Received 503 status code. Throttling may have occurred")
+        return success
+
+
+    def download_volume_images(self, vol_id: str, page_range: Iterable) -> DownloadStats:
+        """
+        For a given volume, download the pages corresponding to the provided page range.
+        """
+        # Get volume directory
+        vol_dir = self.output_dir / get_vol_dir(vol_id)
+        vol_dir.mkdir(parents=True, exist_ok=True)
+        # Get volume's thumbnail directory
+        thumbnail_dir = vol_dir / "thumbnails"
+        thumbnail_dir.mkdir(exist_ok=True)
+
+        # Get filename-friendly version of htid
+        clean_htid = encode_htid(vol_id)
+
+        # Fetch images
+        stats = DownloadStats()
+        start_time = time.time()
+        for page_num in page_range:
+            image_name = f"{clean_htid}.{page_num:08d}.jpg"
+
+            for image_type in ["full", "thumbnail"]:
+                image_dir = vol_dir if image_type == "full" else thumbnail_dir
+                image = image_dir / image_name
+                image_width = getattr(self, f"{image_type}_width")
+
+                # Fetch the image if it does not already exist
+                if not image.is_file():
+                    image_url = page_image_url(vol_id, page_num, image_width)
+                    success = self.download_image(image_url, image)
+                    if success:
+                        stats.log_download(image_type)
+                    else:
+                        stats.log_error(image_type)
+                        logger.debug(f"Failed to download {image_type} image {image_name}")
+                else:
+                    stats.log_skip(image_type)
+
+            # Update progress bar
+            if self.show_progress:
+                self.progress_bar.update()
+        # Log volume page completion rate
+        duration = time.time() - start_time
+        page_rate = duration / len(page_range)
+        logger.debug(f"{vol_id}: Completed in {duration:.2f}s ({page_rate:.2f} sec/page)")
+        return stats
+
+
+    def handle(self, *args, **kwargs):
+        self.output_dir = kwargs["output_dir"]
+        self.full_width = kwargs["image_width"]
+        self.thumbnail_width = kwargs["thumbnail_width"]
+        self.show_progress = kwargs["progress"]
+
+        # Validate input arguments
+        if not self.output_dir.is_dir():
+            raise CommandError(
+                f"Output directory '{self.output_dir}' does not exist or is not a directory"
+            )
+        if self.thumbnail_width > 250:
+            raise CommandError("Thumbnail width cannot be more than 250 pixels")
+
+        # use ids specified via command line when present
+        htids = kwargs.get("htids", [])
+
+        # by default, download images for all non-suppressed hathi source ids
+        digworks = DigitizedWork.objects.filter(
+            status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI
+        )
+
+        # if htids are specified via parameter, use them to filter
+        # the queryset, to ensure we only sync records that are
+        # in the database and not suppressed
+        if htids:
+            digworks = digworks.filter(source_id__in=htids)
+
+        # bail out if there's nothing to do
+        # (e.g., explicit htids only and none valid)
+        if not digworks.exists():
+            self.stdout.write("No records to download; stopping")
+            return
+
+        # Bind handler for interrupt signal
+        signal.signal(signal.SIGINT, self.interrupt_handler)
+
+        n_vols = digworks.count()
+        self.stdout.write(
+            f"Downloading images for {n_vols} record{pluralize(digworks)}",
+        )
+
+        # Initialize progress bar
+        if self.show_progress:
+            self.progress_bar = tqdm()
+
+        overall_stats = DownloadStats()
+        for i, digwork in enumerate(digworks):
+            # Check if we need to exit early
+            if self.interrupted:
+                break
+
+            vol_id = digwork.source_id
+            # Determine page range
+            if digwork.item_type == DigitizedWork.FULL:
+                page_range = range(1, digwork.page_count + 1)
+            else:
+                page_range = digwork.page_span
+
+            # Update progress bar
+            if self.show_progress:
+                self.progress_bar.reset(total=len(page_range))
+                self.progress_bar.set_description(
+                    f"{vol_id} ({i+1}/{n_vols})"
+                )
+
+            # Download volume images & update overall stats
+            vol_stats = self.download_volume_images(vol_id, page_range)
+            overall_stats.update(vol_stats)
+        # Close progress bar
+        if self.show_progress:
+            self.progress_bar.close()
+        # If interrupted, report the number of volumes completed.
+        if self.interrupted:
+            self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed."))
+        self.stdout.write(self.style.SUCCESS(overall_stats.get_report()))
diff --git a/ppa/archive/models.py b/ppa/archive/models.py
index 12cb091b..e2484b92 100644
--- a/ppa/archive/models.py
+++ b/ppa/archive/models.py
@@ -180,8 +180,7 @@ class ProtectedWorkField(models.Field):
     :class:`ProtectedWorkFieldFlags` object and stores as integer."""
 
     description = (
-        "A field that stores an instance of :class:`ProtectedWorkFieldFlags` "
-        "as an integer."
+        "A field that stores an instance of :class:`ProtectedWorkFieldFlags` as an integer."
     )
 
     def __init__(self, verbose_name=None, name=None, **kwargs):
@@ -221,9 +220,7 @@ def collection_save(sender, instance, **kwargs):
         # if the collection has any works associated
         works = instance.digitizedwork_set.all()
         if works.exists():
-            logger.debug(
-                "collection save, reindexing %d related works", works.count()
-            )
+            logger.debug(f"collection save, reindexing {works.count()} related works")
             DigitizedWork.index_items(works)
 
     @staticmethod
@@ -234,7 +231,7 @@ def collection_delete(sender, instance, **kwargs):
         digwork_ids = instance.digitizedwork_set.values_list("id", flat=True)
         # find the items based on the list of ids to reindex
         digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids))
-        logger.debug("collection delete, reindexing %d works" % len(digworks))
+        logger.debug(f"collection delete, reindexing {len(digworks)} works")
 
         # NOTE: this sends pre/post clear signal, but it's not obvious
         # how to take advantage of that
@@ -252,11 +249,11 @@ def cluster_save(sender, instance, **kwargs):
         works = instance.digitizedwork_set.all()
         if works.exists():
             # get a total of page count for affected works
-            page_count = works.aggregate(page_count=models.Sum("page_count"))
+            page_count = works.aggregate(page_count=models.Sum("page_count", default=0))
             logger.debug(
                 "cluster id has changed, reindexing %d works and %d pages",
                 works.count(),
-                page_count.get("page_count", 0),
+                page_count["page_count"],
             )
             DigitizedWork.index_items(works)
             # reindex pages (this may be slow...)
@@ -272,11 +269,11 @@ def cluster_delete(sender, instance, **kwargs):
         # find the items based on the list of ids to reindex
         digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids))
         # get a total of page count for affected works
-        page_count = digworks.aggregate(page_count=models.Sum("page_count"))
+        page_count = digworks.aggregate(page_count=models.Sum("page_count", default=0))
         logger.debug(
             "cluster delete, reindexing %d works and %d pages",
             digworks.count(),
-            page_count.get("page_count", 0),
+            page_count["page_count"],
         )
 
         # NOTE: this sends pre/post clear signal, but it's not obvious
@@ -295,7 +292,7 @@ def handle_digwork_cluster_change(sender, instance, **kwargs):
         logger.debug(
             "Cluster changed for %s; indexing %d pages",
             instance,
-            instance.page_count,
+            instance.page_count or 0,
         )
         instance.index_items(Page.page_index_data(instance))
 
@@ -425,9 +422,7 @@ class DigitizedWork(ModelIndexable, TrackChangesModel):
     collections = models.ManyToManyField(Collection, blank=True)
 
     #: optional cluster for aggregating works
-    cluster = models.ForeignKey(
-        Cluster, blank=True, null=True, on_delete=models.SET_NULL
-    )
+    cluster = models.ForeignKey(Cluster, blank=True, null=True, on_delete=models.SET_NULL)
 
     #: date added to the archive
     added = models.DateTimeField(auto_now_add=True)
@@ -629,9 +624,7 @@ def save(self, *args, **kwargs):
 
         # if excerpt page range has changed
        # OR this is a new record with a page range
-        if self.has_changed("pages_digital") or (
-            self.pk is None and self.pages_digital
-        ):
+        if self.has_changed("pages_digital") or (self.pk is None and self.pages_digital):
             # update the page count if possible (i.e., not a Gale record)
             self.page_count = self.count_pages()
             # if page range changed on existing record, clear out old index
@@ -765,9 +758,7 @@ def metadata_from_marc(self, marc_record, populate=True):
         field_data["sort_title"] = marc_record.title()[non_sort:].strip(' "[')
         field_data["author"] = marc_record.author() or ""
         # remove a note present on some records and strip whitespace
-        field_data["author"] = (
-            field_data["author"].replace("[from old catalog]", "").strip()
-        )
+        field_data["author"] = field_data["author"].replace("[from old catalog]", "").strip()
         # removing trailing period, except when it is part of an
         # initial or known abbreviation (i.e, Esq.)
         # Look for single initial, but support initials with no spaces
@@ -815,9 +806,7 @@ def metadata_from_marc(self, marc_record, populate=True):
         # *only* if they wrap the whole text
         for field in ["publisher", "pub_place"]:
             if field in field_data:
-                field_data[field] = re.sub(
-                    r"^\[(.*)\]$", r"\1", field_data[field]
-                ).strip()
+                field_data[field] = re.sub(r"^\[(.*)\]$", r"\1", field_data[field]).strip()
 
         if populate:
             # conditionally update fields that are protected (or not)
@@ -874,9 +863,7 @@ def populate_from_bibdata(self, bibdata):
             "post_save": SignalHandlers.cluster_save,
             "pre_delete": SignalHandlers.cluster_delete,
         },
-        "archive.DigitizedWork": {
-            "post_save": SignalHandlers.handle_digwork_cluster_change
-        },
+        "archive.DigitizedWork": {"post_save": SignalHandlers.handle_digwork_cluster_change},
     }
 
     def first_page(self):
@@ -1027,11 +1014,7 @@ def count_pages(self, ptree_client=None):
                 # some aggregate packages retrieved from Data API
                 # include jp2 and xml files as well as txt; only count text
                 page_count = len(
-                    [
-                        filename
-                        for filename in ht_zip.namelist()
-                        if filename.endswith(".txt")
-                    ]
+                    [filename for filename in ht_zip.namelist() if filename.endswith(".txt")]
                 )
                 logger.debug(
                     "Counted %d pages in zipfile in %f sec", page_count, time.time() - start
@@ -1072,8 +1055,7 @@ def get_metadata(self, metadata_format):
                 return record.as_marc()
             except MARCRecordNotFound:
                 logger.warning(
-                    "MARC record for %s/%s not found"
-                    % (self.source_id, self.record_id)
+                    f"MARC record for {self.source_id}/{self.record_id} not found"
                 )
                 return ""
 
@@ -1088,7 +1070,7 @@ def get_source_link_label(self):
             return "View on Gale Primary Sources"
         if self.source == DigitizedWork.OTHER:
             return "View external record"
-        return "View on %s" % self.get_source_display()
+        return f"View on {self.get_source_display()}"
 
     @staticmethod
     def add_from_hathi(htid, bib_api=None, update=False, log_msg_src=None, user=None):
diff --git a/ppa/archive/tests/test_gale.py b/ppa/archive/tests/test_gale.py
index 388a36ca..9acc2b5f 100644
--- a/ppa/archive/tests/test_gale.py
+++ b/ppa/archive/tests/test_gale.py
@@ -283,7 +283,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
         # Set up get_local_ocr so that only the 3rd page's text is found
         mock_get_local_ocr.return_value = {"0003": "local ocr text"}
         page_data = list(gale_api.get_item_pages(item_id))
-        mock_get_item.called_once()
+        mock_get_item.assert_called_once()  # called once per volume
         assert mock_get_local_ocr.call_count == 1
         assert len(page_data) == 3
 
diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py
new file mode 100644
index 00000000..3e79cfe8
--- /dev/null
+++ b/ppa/archive/tests/test_hathi_images.py
@@ -0,0 +1,140 @@
+from io import StringIO
+from unittest.mock import Mock, call, patch
+
+import pytest
+import requests
+import signal
+
+from ppa.archive.templatetags.ppa_tags import page_image_url
+from ppa.archive.management.commands import hathi_images
+
+
+class TestDownloadStats:
+    def check_stats(
+        self,
+        stats: hathi_images.DownloadStats,
+        full_fetch: int,
+        full_skip: int,
+        thumbnail_fetch: int,
+        thumbnail_skip: int,
+    ) -> None:
+        """Helper function to check stats"""
+        assert stats.full["fetch"] == full_fetch
+        assert stats.full["skip"] == full_skip
+        assert stats.thumbnail["fetch"] == thumbnail_fetch
+        assert stats.thumbnail["skip"] == thumbnail_skip
+
+    def test_init(self):
+        stats = hathi_images.DownloadStats()
+        self.check_stats(stats, 0, 0, 0, 0)
+
+    def test_log_action(self):
+        stats = hathi_images.DownloadStats()
+        # unknown action type
+        with pytest.raises(ValueError, match="Unknown action type 'bad_action'"):
+            stats._log_action("image_type", "bad_action")
+
+        # unknown image type
+        with pytest.raises(ValueError, match="Unknown image type 'image_type'"):
+            stats._log_action("image_type", "fetch")
+
+        # Add one to each image type & action
+        stats._log_action("full", "fetch")
+        self.check_stats(stats, 1, 0, 0, 0)
+        stats._log_action("full", "skip")
+        self.check_stats(stats, 1, 1, 0, 0)
+        stats._log_action("thumbnail", "fetch")
+        self.check_stats(stats, 1, 1, 1, 0)
+        stats._log_action("thumbnail", "skip")
+        self.check_stats(stats, 1, 1, 1, 1)
+
+        # Add another one to each image type & action
+        stats._log_action("thumbnail", "skip")
+        self.check_stats(stats, 1, 1, 1, 2)
+        stats._log_action("full", "skip")
+        self.check_stats(stats, 1, 2, 1, 2)
+        stats._log_action("full", "fetch")
+        self.check_stats(stats, 2, 2, 1, 2)
+        stats._log_action("thumbnail", "fetch")
+        self.check_stats(stats, 2, 2, 2, 2)
+
+    @patch.object(hathi_images.DownloadStats, "_log_action")
+    def test_log_download(self, mock_log_action):
+        stats = hathi_images.DownloadStats()
+        stats.log_download("image_type")
+        mock_log_action.assert_called_once_with("image_type", "fetch")
+
+    @patch.object(hathi_images.DownloadStats, "_log_action")
+    def test_log_skip(self, mock_log_action):
+        stats = hathi_images.DownloadStats()
+        stats.log_skip("image_type")
+        mock_log_action.assert_called_once_with("image_type", "skip")
+
+    def test_update(self):
+        stats_a = hathi_images.DownloadStats()
+        stats_b = hathi_images.DownloadStats()
+        stats_b.full["fetch"] = 5
+        stats_b.full["skip"] = 1
+        stats_b.thumbnail["fetch"] = 3
+        stats_b.thumbnail["skip"] = 2
+        self.check_stats(stats_b, 5, 1, 3, 2)
+
+        stats_a.update(stats_b)
+        self.check_stats(stats_a, 5, 1, 3, 2)
+        self.check_stats(stats_b, 5, 1, 3, 2)
+
+        stats_a.update(stats_b)
+        self.check_stats(stats_a, 10, 2, 6, 4)
+        self.check_stats(stats_b, 5, 1, 3, 2)
+
+    def test_report(self):
+        stats = hathi_images.DownloadStats()
+        assert stats.get_report() == "No actions taken"
+
+        # Only actions that have occurred are reported
+        stats.full["fetch"] = 5
+        report = "Fetched: 5 images & 0 thumbnails"
+        assert stats.get_report() == report
+
+        stats.thumbnail["skip"] = 3
+        report += "\nSkipped: 0 images & 3 thumbnails"
+        assert stats.get_report() == report
+
+        stats.full["error"] = 1
+        stats.thumbnail["error"] = 2
+        report += "\nMissed: 1 images & 2 thumbnails"
+        assert stats.get_report() == report
+
+
+class TestHathiImagesCommand:
+    @patch("signal.signal")
+    def test_interrupt_handler(self, mock_signal):
+        stdout = StringIO()
+        cmd = hathi_images.Command(stdout=stdout)
+
+        cmd.interrupt_handler(signal.SIGINT, "frame")
+        mock_signal.assert_called_once_with(signal.SIGINT, signal.SIG_DFL)
+        assert cmd.interrupted
+        assert stdout.getvalue() == (
+            "Command will exit once this volume's image download is complete.\n"
+            "Ctrl-C / Interrupt to quit immediately\n"
+        )
+
+    @patch("requests.get")
+    def test_download_image(self, mock_get, tmp_path):
+        cmd = hathi_images.Command()
+
+        # Not ok status
+        mock_get.return_value = Mock(status_code=503)
+        result = cmd.download_image("page_url", "out_file")
+        mock_get.assert_called_once_with("page_url")
+        assert result is False
+
+        # Ok status
+        out_file = tmp_path / "test.jpg"
+        mock_get.reset_mock()
+        mock_get.return_value = Mock(status_code=200, content=b"image content")
+        result = cmd.download_image("page_url", out_file)
+        mock_get.assert_called_once_with("page_url")
+        assert result is True
+        assert out_file.read_text() == "image content"
diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample
index 0ebf225c..3405b8db 100644
--- a/ppa/settings/local_settings.py.sample
+++ b/ppa/settings/local_settings.py.sample
@@ -103,11 +103,13 @@ LOGGING = {
         },
         'ppa': {
             'handlers': ['console'],
-            'level': 'DEBUG'
+            'level': 'DEBUG',
+            'propagate': False,
         },
-        'SolrClient': {
+        'parasolr': {
             'handlers': ['console'],
-            'level': 'WARN'
+            'level': 'WARN',
+            'propagate': False,
         },
     }
 }
diff --git a/requirements.txt b/requirements.txt
index f612502f..63f24f85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,6 @@ psycopg2-binary
 multiprocess
 django-split-settings
 # only needed for the 'generate_textcorpus' manage command
-orjsonl
\ No newline at end of file
+orjsonl
+# TODO: Switch to develop once feature branch is merged
+git+https://github.com/Princeton-CDH/ppa-nlp@develop#egg=corppa