From 24047db6f6cb27790a15d1a8b44637a0b3383e7f Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:16:22 -0500 Subject: [PATCH 01/13] Updated developer notes and fixed unit test --- DEVELOPERNOTES.rst | 42 +++++++++++++++++++++++++++++++++- ppa/archive/tests/test_gale.py | 2 +- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/DEVELOPERNOTES.rst b/DEVELOPERNOTES.rst index 9f39ab40..3236017f 100644 --- a/DEVELOPERNOTES.rst +++ b/DEVELOPERNOTES.rst @@ -1,6 +1,46 @@ Troubleshooting =============== +Local Solr setup +---------------- +Install Solr via `brew `:: + + brew install solr + +Copy the Solr config files in as a configset named `ppa`:: + + cp -r solr_conf /opt/homebrew/opt/solr/server/solr/configsets/ppa + +Create symbolic link to configsets in the Solr home directory:: + + ln -s /opt/homebrew/opt/solr/server/solr/configsets /opt/homebrew/var/lib/solr/ + +Create a new core with the `ppa` configset (Solr must be running):: + + curl "http://localhost:8983/solr/admin/cores?action=CREATE&name=ppa&configSet=ppa" + +When the configset has changed, copy in the updated Solr config files:: + + cp solr_conf/* /opt/homewbrew/var/lib/solr/configsets/ppa/ + +Start Solr by running the following command:: + + /opt/homebrew/opt/solr/bin/solr start -f + + +Local PostgreSQL +---------------- +Install PostgreSQL via `brew `:: + + brew install postgresql@15 + +Start PostgreSQL (or restart after an ugrade):: + + brew services start postgresql@15 + +Add PostgreSQL to your PATH:: + + echo 'export PATH="/opt/homebrew/opt/postgresql@15/bin:$PATH"' >> ~/.zshrc Solr setup with Docker @@ -92,7 +132,7 @@ To replace a local development database with a dump of production data:: psql -d postgres -c "DROP DATABASE cdh_ppa;" psql -d postgres -c "CREATE DATABASE cdh_ppa;" - psql -d postgres -U cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql + psql cdh_ppa < 
data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql Updating Wagtail test fixture diff --git a/ppa/archive/tests/test_gale.py b/ppa/archive/tests/test_gale.py index c15f730f..b6dd9a0d 100644 --- a/ppa/archive/tests/test_gale.py +++ b/ppa/archive/tests/test_gale.py @@ -272,7 +272,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests): # Set up get_local_ocr so that only the 3rd page's text is found mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"] page_data = list(gale_api.get_item_pages(item_id)) - mock_get_item.called_once() + mock_get_item.assert_called_once() assert mock_get_local_ocr.call_count == 3 assert len(page_data) == 3 assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"] From 0ff502eaec0c4077cd01a2dcdb8e0ecca7315a75 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:17:25 -0500 Subject: [PATCH 02/13] Add ruff to dev requirements --- dev-requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index dffb9f5e..61d2e22b 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,4 +4,5 @@ pytest-django>=4.5.2 pytest-cov django-debug-toolbar sphinx>=7.2 -pre-commit \ No newline at end of file +pre-commit +ruff From 1a64204d9665d29ecbe1a53cf8e2dfbfc11ac5fb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:18:00 -0500 Subject: [PATCH 03/13] Initial command for fetching hathi images. 
Note this currently relies on a feature branch of ppa-nlp --- .../management/commands/hathi_images.py | 165 ++++++++++++++++++ requirements.txt | 4 +- 2 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 ppa/archive/management/commands/hathi_images.py diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py new file mode 100644 index 00000000..ba18881c --- /dev/null +++ b/ppa/archive/management/commands/hathi_images.py @@ -0,0 +1,165 @@ +import requests +from pathlib import Path +from time import sleep + +import progressbar +from django.core.management.base import BaseCommand, CommandError +from django.template.defaultfilters import pluralize +from corppa.utils.path_utils import encode_htid, get_vol_dir + +from ppa.archive.models import DigitizedWork +from ppa.archive.templatetags.ppa_tags import page_image_url + + +class Command(BaseCommand): + """ + Download HathiTrust page image data via image server + + Note: Excerpts cannot be specified individually, only by source (collectively) + """ + help = __doc__ + #: normal verbosity level + v_normal = 1 + verbosity = v_normal + #: crawl delay (in seconds) + crawl_delay=1 + def add_arguments(self, parser): + parser.add_argument( + "out", + type=Path, + help="Top-level output directory") + parser.add_argument( + "--htids", + nargs="*", + help="Optional list HathiTrust ids to download", + ) + parser.add_argument( + "--progress", + action="store_true", + help="Display progress bars to track download progress", + default=True, + ) + + def download_image(self, page_url: str, out_file: Path) -> None: + response = requests.get(page_url) + if response.status_code == requests.codes.ok: + with out_file.open(mode="wb") as writer: + writer.write(response.content) + # Apply crawl delay after request + sleep(self.crawl_delay) + + + def handle(self, *args, **kwargs): + self.verbosity = kwargs.get("verbosity", self.v_normal) + self.options = kwargs + + # validate 
output directory + if not kwargs["out"]: + raise CommandError("An output directory must be specified") + output_dir = kwargs["out"] + if not output_dir.is_dir(): + raise CommandError( + f"Output directory '{output_dir}' does not exist or is not a directory" + ) + + # use ids specified via command line when present + htids = kwargs.get("htids", []) + + # by default, sync data for all non-suppressed hathi source ids + digworks = DigitizedWork.objects.filter( + status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI + ) + + # if htids are specified via parameter, use them to filter + # the queryset, to ensure we only sync records that are + # in the database and not suppressed + if htids: + digworks = digworks.filter(source_id__in=htids) + + # bail out if there's nothing to do + # (e.g., explicit htids only and none valid) + if not digworks.exists(): + self.stdout.write("No records to download; stopping") + return + + # setup main progress bar + overall_progress = None + if self.options["progress"]: + overall_progress = progressbar.ProgressBar( + redirect_stdout=True, max_value=digworks.count(), max_error=False + ) + overall_progress.start() + + self.stdout.write( + f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + ) + + for digwork in digworks: + vol_id = digwork.source_id + + # Determine output volume & thumbnail directories (create as needed) + vol_dir = output_dir / get_vol_dir(vol_id) + vol_dir.mkdir(parents=True, exist_ok=True) + thumbnail_dir = vol_dir / "thumbnails" + thumbnail_dir.mkdir(exist_ok=True) + + # Get filename-friendly version of htid + clean_htid = encode_htid(vol_id) + + # Determine page range + if digwork.item_type == DigitizedWork.FULL: + page_range = range(1, digwork.page_count+1) + else: + page_range = digwork.page_span + + # Setup volume-level progress bar + volume_progress = None + if self.options["progress"]: + volume_progress = progressbar.ProgressBar( + redirect_stdout=True, max_value=len(page_range), 
max_error=False + ) + volume_progress.start() + + # Fetch images + stats = { + "image": {"fetch": 0, "skip": 0}, + "thumbnail": {"fetch": 0, "skip": 0} + } + for page_num in page_range: + image_name = f"{clean_htid}.{page_num:08d}.jpg" + + # Fetch thumbnail if file does not exist + page_thumbnail = thumbnail_dir / image_name + if not page_thumbnail.is_file(): + thumbnail_url = page_image_url(vol_id, page_num, 250) + self.download_image(thumbnail_url, page_thumbnail) + stats["thumbnail"]["fetch"] += 1 + else: + stats["thumbnail"]["skip"] += 1 + + # Fetch "full" image if file does not exist + page_image = vol_dir / image_name + if not page_image.is_file(): + image_url = page_image_url(vol_id, page_num, 800) + #self.download_image(image_url, page_image) + stats["image"]["fetch"] += 1 + else: + stats["image"]["skip"] += 1 + + # Update volume-specific progress bar + if volume_progress: + volume_progress.increment() + # Finish volume-specific progress bar + if volume_progress: + volume_progress.finish() + self.stdout.write( + f"{vol_id}: Fetched {stats['image']['fetch']} images & " + f"{stats['thumbnail']['fetch']} thumbnails; " + f"Skipped {stats['image']['skip']} images & " + f"{stats['thumbnail']['skip']} thumbnails" + ) + # Update overall progress bar + if overall_progress: + overall_progress.increment() + if overall_progress: + overall_progress.finish() diff --git a/requirements.txt b/requirements.txt index f612502f..ef2a33b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,6 @@ psycopg2-binary multiprocess django-split-settings # only needed for the 'generate_textcorpus' manage command -orjsonl \ No newline at end of file +orjsonl +# TODO: Switch to develop once feature branch is merged +git+https://github.com/Princeton-CDH/ppa-nlp@feature/hathi-paths#egg=corppa From 9c185e2cc445402786fce4fdb244f1fb200e8897 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:43:58 -0500 Subject: [PATCH 
04/13] Update ppa/archive/management/commands/hathi_images.py Co-authored-by: Rebecca Sutton Koeser --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index ba18881c..117f6413 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -31,7 +31,7 @@ def add_arguments(self, parser): parser.add_argument( "--htids", nargs="*", - help="Optional list HathiTrust ids to download", + help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)", ) parser.add_argument( "--progress", From a617d9439a708461905e07cef9a63c549f989ae9 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:44:11 -0500 Subject: [PATCH 05/13] Update ppa/archive/management/commands/hathi_images.py Co-authored-by: Rebecca Sutton Koeser --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 117f6413..06588aab 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -65,7 +65,7 @@ def handle(self, *args, **kwargs): # use ids specified via command line when present htids = kwargs.get("htids", []) - # by default, sync data for all non-suppressed hathi source ids + # by default, download images for all non-suppressed hathi source ids digworks = DigitizedWork.objects.filter( status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI ) From 9106cd57384944293ef8895a8877f1327beed7bb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:55:28 -0500 Subject: [PATCH 06/13] Update corppa dependency to develop branch 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ef2a33b3..63f24f85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,4 @@ django-split-settings # only needed for the 'generate_textcorpus' manage command orjsonl # TODO: Switch to develop once feature branch is merged -git+https://github.com/Princeton-CDH/ppa-nlp@feature/hathi-paths#egg=corppa +git+https://github.com/Princeton-CDH/ppa-nlp@develop#egg=corppa From 3776d4444eeb4426555cdc306f4f722548a9a81f Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:30:31 -0500 Subject: [PATCH 07/13] Updated models.py to pass unit tests locally Appears to be some issue where None is being return instead of 0 --- ppa/archive/models.py | 50 ++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/ppa/archive/models.py b/ppa/archive/models.py index 12cb091b..e2484b92 100644 --- a/ppa/archive/models.py +++ b/ppa/archive/models.py @@ -180,8 +180,7 @@ class ProtectedWorkField(models.Field): :class:`ProtectedWorkFieldFlags` object and stores as integer.""" description = ( - "A field that stores an instance of :class:`ProtectedWorkFieldFlags` " - "as an integer." + "A field that stores an instance of :class:`ProtectedWorkFieldFlags` as an integer." 
) def __init__(self, verbose_name=None, name=None, **kwargs): @@ -221,9 +220,7 @@ def collection_save(sender, instance, **kwargs): # if the collection has any works associated works = instance.digitizedwork_set.all() if works.exists(): - logger.debug( - "collection save, reindexing %d related works", works.count() - ) + logger.debug(f"collection save, reindexing {works.count()} related works") DigitizedWork.index_items(works) @staticmethod @@ -234,7 +231,7 @@ def collection_delete(sender, instance, **kwargs): digwork_ids = instance.digitizedwork_set.values_list("id", flat=True) # find the items based on the list of ids to reindex digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids)) - logger.debug("collection delete, reindexing %d works" % len(digworks)) + logger.debug(f"collection delete, reindexing {len(digworks)} works") # NOTE: this sends pre/post clear signal, but it's not obvious # how to take advantage of that @@ -252,11 +249,11 @@ def cluster_save(sender, instance, **kwargs): works = instance.digitizedwork_set.all() if works.exists(): # get a total of page count for affected works - page_count = works.aggregate(page_count=models.Sum("page_count")) + page_count = works.aggregate(page_count=models.Sum("page_count", default=0)) logger.debug( "cluster id has changed, reindexing %d works and %d pages", works.count(), - page_count.get("page_count", 0), + page_count["page_count"], ) DigitizedWork.index_items(works) # reindex pages (this may be slow...) 
@@ -272,11 +269,11 @@ def cluster_delete(sender, instance, **kwargs): # find the items based on the list of ids to reindex digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids)) # get a total of page count for affected works - page_count = digworks.aggregate(page_count=models.Sum("page_count")) + page_count = digworks.aggregate(page_count=models.Sum("page_count", default=0)) logger.debug( "cluster delete, reindexing %d works and %d pages", digworks.count(), - page_count.get("page_count", 0), + page_count["page_count"], ) # NOTE: this sends pre/post clear signal, but it's not obvious @@ -295,7 +292,7 @@ def handle_digwork_cluster_change(sender, instance, **kwargs): logger.debug( "Cluster changed for %s; indexing %d pages", instance, - instance.page_count, + instance.page_count or 0, ) instance.index_items(Page.page_index_data(instance)) @@ -425,9 +422,7 @@ class DigitizedWork(ModelIndexable, TrackChangesModel): collections = models.ManyToManyField(Collection, blank=True) #: optional cluster for aggregating works - cluster = models.ForeignKey( - Cluster, blank=True, null=True, on_delete=models.SET_NULL - ) + cluster = models.ForeignKey(Cluster, blank=True, null=True, on_delete=models.SET_NULL) #: date added to the archive added = models.DateTimeField(auto_now_add=True) @@ -629,9 +624,7 @@ def save(self, *args, **kwargs): # if excerpt page range has changed # OR this is a new record with a page range - if self.has_changed("pages_digital") or ( - self.pk is None and self.pages_digital - ): + if self.has_changed("pages_digital") or (self.pk is None and self.pages_digital): # update the page count if possible (i.e., not a Gale record) self.page_count = self.count_pages() # if page range changed on existing record, clear out old index @@ -765,9 +758,7 @@ def metadata_from_marc(self, marc_record, populate=True): field_data["sort_title"] = marc_record.title()[non_sort:].strip(' "[') field_data["author"] = marc_record.author() or "" # remove a note present on some 
records and strip whitespace - field_data["author"] = ( - field_data["author"].replace("[from old catalog]", "").strip() - ) + field_data["author"] = field_data["author"].replace("[from old catalog]", "").strip() # removing trailing period, except when it is part of an # initial or known abbreviation (i.e, Esq.) # Look for single initial, but support initials with no spaces @@ -815,9 +806,7 @@ def metadata_from_marc(self, marc_record, populate=True): # *only* if they wrap the whole text for field in ["publisher", "pub_place"]: if field in field_data: - field_data[field] = re.sub( - r"^\[(.*)\]$", r"\1", field_data[field] - ).strip() + field_data[field] = re.sub(r"^\[(.*)\]$", r"\1", field_data[field]).strip() if populate: # conditionally update fields that are protected (or not) @@ -874,9 +863,7 @@ def populate_from_bibdata(self, bibdata): "post_save": SignalHandlers.cluster_save, "pre_delete": SignalHandlers.cluster_delete, }, - "archive.DigitizedWork": { - "post_save": SignalHandlers.handle_digwork_cluster_change - }, + "archive.DigitizedWork": {"post_save": SignalHandlers.handle_digwork_cluster_change}, } def first_page(self): @@ -1027,11 +1014,7 @@ def count_pages(self, ptree_client=None): # some aggregate packages retrieved from Data API # include jp2 and xml files as well as txt; only count text page_count = len( - [ - filename - for filename in ht_zip.namelist() - if filename.endswith(".txt") - ] + [filename for filename in ht_zip.namelist() if filename.endswith(".txt")] ) logger.debug( "Counted %d pages in zipfile in %f sec", page_count, time.time() - start @@ -1072,8 +1055,7 @@ def get_metadata(self, metadata_format): return record.as_marc() except MARCRecordNotFound: logger.warning( - "MARC record for %s/%s not found" - % (self.source_id, self.record_id) + f"MARC record for {self.source_id}/{self.record_id} not found" ) return "" @@ -1088,7 +1070,7 @@ def get_source_link_label(self): return "View on Gale Primary Sources" if self.source == 
DigitizedWork.OTHER: return "View external record" - return "View on %s" % self.get_source_display() + return f"View on {self.get_source_display()}" @staticmethod def add_from_hathi(htid, bib_api=None, update=False, log_msg_src=None, user=None): From 85bc6b14ca3a06ebc20272919dbd864a8629ce30 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:55:51 -0500 Subject: [PATCH 08/13] Refactored hathi_images command & added unit tests --- .../management/commands/hathi_images.py | 234 ++++++++++++------ ppa/archive/tests/test_hathi_images.py | 98 ++++++++ 2 files changed, 253 insertions(+), 79 deletions(-) create mode 100644 ppa/archive/tests/test_hathi_images.py diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 06588aab..785b858c 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -1,6 +1,14 @@ +""" +**hathi_images** is a custom manage command for downloading both full-size +and thumbnail page images for a list of HathiTrust volumes. 
+""" +import argparse +from collections import Counter +from collections.abc import Iterable import requests from pathlib import Path from time import sleep +from typing import Self import progressbar from django.core.management.base import BaseCommand, CommandError @@ -11,6 +19,43 @@ from ppa.archive.templatetags.ppa_tags import page_image_url +class DownloadStats: + ACTION_TYPES = {"fetch", "skip"} + def __init__(self): + # Stats for full size images + self.full = Counter() + # Stats for thumbnail images + self.thumbnail = Counter() + + def _log_action(self, image_type: str, action: str) -> None: + if action not in self.ACTION_TYPES: + raise ValueError(f"Unknown action type '{action}'") + if image_type == "full": + self.full[action] += 1 + elif image_type == "thumbnail": + self.thumbnail[action] += 1 + else: + raise ValueError(f"Unknown image type '{image_type}'") + + def log_download(self, image_type: str) -> None: + self._log_action(image_type, "fetch") + + def log_skip(self, image_type: str) -> None: + self._log_action(image_type, "skip") + + def update(self, other: Self) -> None: + self.full.update(other.full) + self.thumbnail.update(other.thumbnail) + + def get_report(self) -> str: + return ( + f"Fetched {self.full['fetch']} images & " + f"{self.thumbnail['fetch']} thumbnails; " + f"Skipped {self.full['skip']} images & " + f"{self.thumbnail['skip']} thumbnails" + ) + + class Command(BaseCommand): """ Download HathiTrust page image data via image server @@ -21,46 +66,128 @@ class Command(BaseCommand): #: normal verbosity level v_normal = 1 verbosity = v_normal - #: crawl delay (in seconds) - crawl_delay=1 + + # Argument parsing def add_arguments(self, parser): + """ + Configure additional CLI arguments + """ parser.add_argument( - "out", + "output_dir", type=Path, - help="Top-level output directory") + help="Top-level output directory" + ) parser.add_argument( "--htids", - nargs="*", + nargs="+", help="Optional list of HathiTrust ids (by default, downloads 
images for all public HathiTrust volumes)", ) + parser.add_argument( + "--crawl-delay", + type=int, + help="Delay to be applied between each download in seconds. Default: 1", + default=1, + ) + parser.add_argument( + "--image-width", + type=int, + help="Width for full-size images in pixels. Default: 800", + default=800, + ) + parser.add_argument( + "--thumbnail-width", + type=int, + help="Width for thumbnail images in pixels. Must be at most 250 pixels. Default: 250", + default=250, + ) parser.add_argument( "--progress", - action="store_true", + action=argparse.BooleanOptionalAction, help="Display progress bars to track download progress", default=True, ) - - def download_image(self, page_url: str, out_file: Path) -> None: + + def download_image(self, page_url: str, out_file: Path) -> bool: response = requests.get(page_url) + success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) + success = True + else: + if self.verbosity > self.v_normal: + self.stdout(f"Warning: Failed to fetch image {out_file.name}") # Apply crawl delay after request sleep(self.crawl_delay) + return success + + + def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadStats: + # Determine output volume & thumbnail directories (create as needed) + vol_dir = self.output_dir / get_vol_dir(vol_id) + vol_dir.mkdir(parents=True, exist_ok=True) + thumbnail_dir = vol_dir / "thumbnails" + thumbnail_dir.mkdir(exist_ok=True) + + # Get filename-friendly version of htid + clean_htid = encode_htid(vol_id) + + # Setup volume-level progress bar + volume_progress = None + if self.show_progress: + volume_progress = progressbar.ProgressBar( + line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False + ) + volume_progress.start() + + # Fetch images + stats = DownloadStats() + for page_num in page_range: + image_name = f"{clean_htid}.{page_num:08d}.jpg" + + # Fetch thumbnail if file does not 
exist + page_thumbnail = thumbnail_dir / image_name + if not page_thumbnail.is_file(): + thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width) + success = self.download_image(thumbnail_url, page_thumbnail) + # TODO: Should we log something different if the download fails? + stats.log_download("thumbnail") + else: + stats.log_skip("thumbnail") + + # Fetch "full" image if file does not exist + page_image = vol_dir / image_name + if not page_image.is_file(): + image_url = page_image_url(vol_id, page_num, self.full_width) + success = self.download_image(image_url, page_image) + stats.log_download("full") + else: + stats.log_skip("full") + + # Update volume-specific progress bar + if volume_progress: + volume_progress.increment() + # Finish volume-specific progress bar + if volume_progress: + volume_progress.finish() + return stats def handle(self, *args, **kwargs): - self.verbosity = kwargs.get("verbosity", self.v_normal) - self.options = kwargs - - # validate output directory - if not kwargs["out"]: - raise CommandError("An output directory must be specified") - output_dir = kwargs["out"] - if not output_dir.is_dir(): + self.output_dir = kwargs["output_dir"] + self.crawl_delay = kwargs["crawl_delay"] + self.full_width = kwargs["image_width"] + self.thumbnail_width = kwargs["thumbnail_width"] + self.verbosity = kwargs.get("verbosity", self.verbosity) + self.show_progress = kwargs["progress"] + + # Validate input arguments + if not self.output_dir.is_dir(): raise CommandError( - f"Output directory '{output_dir}' does not exist or is not a directory" + f"Output directory '{self.output_dir}' does not exist or is not a directory" ) + if self.thumbnail_width > 250: + raise CommandError(f"Thumbnail width cannot be more than 250 pixels") # use ids specified via command line when present htids = kwargs.get("htids", []) @@ -82,84 +209,33 @@ def handle(self, *args, **kwargs): self.stdout.write("No records to download; stopping") return + self.stdout.write( + 
f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + ) + # setup main progress bar overall_progress = None - if self.options["progress"]: + if self.show_progress: overall_progress = progressbar.ProgressBar( - redirect_stdout=True, max_value=digworks.count(), max_error=False + line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False ) overall_progress.start() - self.stdout.write( - f"Downloading images for {digworks.count()} record{pluralize(digworks)}" - ) - + overall_stats = DownloadStats() for digwork in digworks: vol_id = digwork.source_id - - # Determine output volume & thumbnail directories (create as needed) - vol_dir = output_dir / get_vol_dir(vol_id) - vol_dir.mkdir(parents=True, exist_ok=True) - thumbnail_dir = vol_dir / "thumbnails" - thumbnail_dir.mkdir(exist_ok=True) - - # Get filename-friendly version of htid - clean_htid = encode_htid(vol_id) - # Determine page range if digwork.item_type == DigitizedWork.FULL: page_range = range(1, digwork.page_count+1) else: page_range = digwork.page_span - # Setup volume-level progress bar - volume_progress = None - if self.options["progress"]: - volume_progress = progressbar.ProgressBar( - redirect_stdout=True, max_value=len(page_range), max_error=False - ) - volume_progress.start() - - # Fetch images - stats = { - "image": {"fetch": 0, "skip": 0}, - "thumbnail": {"fetch": 0, "skip": 0} - } - for page_num in page_range: - image_name = f"{clean_htid}.{page_num:08d}.jpg" - - # Fetch thumbnail if file does not exist - page_thumbnail = thumbnail_dir / image_name - if not page_thumbnail.is_file(): - thumbnail_url = page_image_url(vol_id, page_num, 250) - self.download_image(thumbnail_url, page_thumbnail) - stats["thumbnail"]["fetch"] += 1 - else: - stats["thumbnail"]["skip"] += 1 - - # Fetch "full" image if file does not exist - page_image = vol_dir / image_name - if not page_image.is_file(): - image_url = page_image_url(vol_id, page_num, 800) - 
#self.download_image(image_url, page_image) - stats["image"]["fetch"] += 1 - else: - stats["image"]["skip"] += 1 - - # Update volume-specific progress bar - if volume_progress: - volume_progress.increment() - # Finish volume-specific progress bar - if volume_progress: - volume_progress.finish() - self.stdout.write( - f"{vol_id}: Fetched {stats['image']['fetch']} images & " - f"{stats['thumbnail']['fetch']} thumbnails; " - f"Skipped {stats['image']['skip']} images & " - f"{stats['thumbnail']['skip']} thumbnails" - ) + vol_stats = self.download_volume_images(vol_id, page_range) + overall_stats.update(vol_stats) # Update overall progress bar if overall_progress: overall_progress.increment() if overall_progress: overall_progress.finish() + self.stdout.write("\n\n") # To avoid overwriting progress bars + self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py new file mode 100644 index 00000000..1755f9c5 --- /dev/null +++ b/ppa/archive/tests/test_hathi_images.py @@ -0,0 +1,98 @@ +from unittest.mock import call, patch + +import pytest +import requests + + +from ppa.archive.templatetags.ppa_tags import page_image_url +from ppa.archive.management.commands.hathi_images import ( + DownloadStats, +) + + +class TestDownloadStats: + def check_stats( + self, + stats: DownloadStats, + full_fetch: int, + full_skip: int, + thumbnail_fetch: int, + thumbnail_skip: int, + ) -> None: + """Helper function to check stats""" + assert stats.full["fetch"] == full_fetch + assert stats.full["skip"] == full_skip + assert stats.thumbnail["fetch"] == thumbnail_fetch + assert stats.thumbnail["skip"] == thumbnail_skip + + def test_init(self): + stats = DownloadStats() + self.check_stats(stats, 0, 0, 0, 0) + + def test_log_action(self): + stats = DownloadStats() + # unknown action type + with pytest.raises(ValueError, match="Unknown action type 'bad_action'"): + stats._log_action("image_type", 
"bad_action") + + # unknown image type + with pytest.raises(ValueError, match="Unknown image type 'image_type'"): + stats._log_action("image_type", "fetch") + + # Add one to each image type & action + stats._log_action("full", "fetch") + self.check_stats(stats, 1, 0, 0, 0) + stats._log_action("full", "skip") + self.check_stats(stats, 1, 1, 0, 0) + stats._log_action("thumbnail", "fetch") + self.check_stats(stats, 1, 1, 1, 0) + stats._log_action("thumbnail", "skip") + self.check_stats(stats, 1, 1, 1, 1) + + # Add another one to each image type & action + stats._log_action("thumbnail", "skip") + self.check_stats(stats, 1, 1, 1, 2) + stats._log_action("full", "skip") + self.check_stats(stats, 1, 2, 1, 2) + stats._log_action("full", "fetch") + self.check_stats(stats, 2, 2, 1, 2) + stats._log_action("thumbnail", "fetch") + self.check_stats(stats, 2, 2, 2, 2) + + @patch.object(DownloadStats, "_log_action") + def test_log_download(self, mock_log_action): + stats = DownloadStats() + stats.log_download("image_type") + mock_log_action.called_once_with("image_type", "fetch") + + @patch.object(DownloadStats, "_log_action") + def test_log_skip(self, mock_log_action): + stats = DownloadStats() + stats.log_download("image_type") + mock_log_action.called_once_with("image_type", "skip") + + def test_update(self): + stats_a = DownloadStats() + stats_b = DownloadStats() + stats_b.full.update({"fetch": 5, "skip": 1}) + stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + stats_a.update(stats_b) + self.check_stats(stats_a, 5, 1, 3, 2) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + stats_a.update(stats_b) + self.check_stats(stats_a, 10, 2, 6, 4) + self.check_stats(stats_b, 5, 1, 3, 2 ) + + def test_report(self): + stats_a = DownloadStats() + report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails" + assert stats_a.get_report() == report_a + + stats_b = DownloadStats() + stats_b.full.update({"fetch": 5, "skip": 1}) + 
stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails" + assert stats_b.get_report() == report_b From 0f3880a730403d9045492c977bba7f18530dd273 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:50:16 -0500 Subject: [PATCH 09/13] Update local settings for parasolr --- ppa/settings/local_settings.py.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample index 0ebf225c..28d86dc0 100644 --- a/ppa/settings/local_settings.py.sample +++ b/ppa/settings/local_settings.py.sample @@ -105,7 +105,7 @@ LOGGING = { 'handlers': ['console'], 'level': 'DEBUG' }, - 'SolrClient': { + 'parasolr': { 'handlers': ['console'], 'level': 'WARN' }, From 24c02f95d95c24b4aa74bd34aa6ed00b955012e0 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 11:19:23 -0500 Subject: [PATCH 10/13] Updated hathi images command * Added keyboard interrupt handling * Modified progress bar * Added logging --- .../management/commands/hathi_images.py | 174 +++++++++++------- ppa/archive/tests/test_hathi_images.py | 4 +- 2 files changed, 111 insertions(+), 67 deletions(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 785b858c..93e97989 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -5,12 +5,14 @@ import argparse from collections import Counter from collections.abc import Iterable +import logging import requests from pathlib import Path +import signal from time import sleep from typing import Self -import progressbar +from tqdm import tqdm from django.core.management.base import BaseCommand, CommandError from django.template.defaultfilters import pluralize from corppa.utils.path_utils 
import encode_htid, get_vol_dir @@ -18,9 +20,19 @@ from ppa.archive.models import DigitizedWork from ppa.archive.templatetags.ppa_tags import page_image_url +logger = logging.getLogger(__name__) + class DownloadStats: - ACTION_TYPES = {"fetch", "skip"} + # Support actions + ACTION_TYPES = {"fetch", "skip", "error"} + # Associated strings used for reporting + ACTION_STRS = { + "fetch": "Fetched", + "skip": "Skipped", + "error": "Missed", + } + def __init__(self): # Stats for full size images self.full = Counter() @@ -43,17 +55,25 @@ def log_download(self, image_type: str) -> None: def log_skip(self, image_type: str) -> None: self._log_action(image_type, "skip") + def log_error(self, image_type: str) -> None: + self._log_action(image_type, "error") + def update(self, other: Self) -> None: self.full.update(other.full) self.thumbnail.update(other.thumbnail) def get_report(self) -> str: - return ( - f"Fetched {self.full['fetch']} images & " - f"{self.thumbnail['fetch']} thumbnails; " - f"Skipped {self.full['skip']} images & " - f"{self.thumbnail['skip']} thumbnails" - ) + report = "" + for action in ["fetch", "skip", "error"]: + if action == "error": + # Only report errors when an error is present + if not self.full[action] and not self.thumbnail[action]: + continue + action_str = self.ACTION_STRS[action] + if report: + report += "\n" + report += f"{action_str}: {self.full[action]} images & {self.thumbnail[action]} thumbnails" + return report class Command(BaseCommand): @@ -63,9 +83,9 @@ class Command(BaseCommand): Note: Excerpts cannot be specified individually, only by source (collectively) """ help = __doc__ - #: normal verbosity level - v_normal = 1 - verbosity = v_normal + + # Interrupt flag to exit gracefully (i.e. 
between volumes) when a signal is caught + interrupted = False # Argument parsing def add_arguments(self, parser): @@ -106,17 +126,45 @@ def add_arguments(self, parser): help="Display progress bars to track download progress", default=True, ) - + + def interrupt_handler(self, signum, frame): + """ + For handling of SIGINT, as possible. For the first SIGINT, a flag is set + so that the command will exit after the current volume's image download + is complete. Additionally, the default signal handler is restored so a + second SIGINT will cause the command to immediately exit. + """ + if signum == signal.SIGINT: + # Restore default signal handler + signal.signal(signal.SIGINT, signal.SIG_DFL) + # Set interrupt flag + self.interrupted = True + self.stdout.write(self.style.WARNING( + "Command will exit once this volume's image download is " + "complete.\n Ctrl-C / Interrupt to quit immediately" + ) + ) + def download_image(self, page_url: str, out_file: Path) -> bool: response = requests.get(page_url) + # log response time + logger.debug(f"Response time: {response.elapsed.total_seconds()}") + self.stdout.write(str(response.headers)) success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) success = True - else: - if self.verbosity > self.v_normal: - self.stdout(f"Warning: Failed to fetch image {out_file.name}") + # For checking throttling rates + # TODO: Consider removing once crawl delays are determined + choke_str = "x-choke info:" + for choke_sfx in ['allowed', 'credit', 'delta', 'max', 'rate']: + header = f"x-choke-{choke_sfx}" + if header in response.headers: + choke_str += f"\n {header}: {response.headers[header]}" + logger.debug(choke_str) + elif response.status_code == 503: + logger.debug("WARNING: Received 503 status code. 
Throttling may have occurred") # Apply crawl delay after request sleep(self.crawl_delay) return success @@ -132,44 +180,31 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Get filename-friendly version of htid clean_htid = encode_htid(vol_id) - # Setup volume-level progress bar - volume_progress = None - if self.show_progress: - volume_progress = progressbar.ProgressBar( - line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False - ) - volume_progress.start() - # Fetch images stats = DownloadStats() for page_num in page_range: image_name = f"{clean_htid}.{page_num:08d}.jpg" - # Fetch thumbnail if file does not exist - page_thumbnail = thumbnail_dir / image_name - if not page_thumbnail.is_file(): - thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width) - success = self.download_image(thumbnail_url, page_thumbnail) - # TODO: Should we log something different if the download fails? - stats.log_download("thumbnail") - else: - stats.log_skip("thumbnail") - - # Fetch "full" image if file does not exist - page_image = vol_dir / image_name - if not page_image.is_file(): - image_url = page_image_url(vol_id, page_num, self.full_width) - success = self.download_image(image_url, page_image) - stats.log_download("full") - else: - stats.log_skip("full") - - # Update volume-specific progress bar - if volume_progress: - volume_progress.increment() - # Finish volume-specific progress bar - if volume_progress: - volume_progress.finish() + for image_type in ["full", "thumbnail"]: + image_dir = vol_dir if image_type == "full" else thumbnail_dir + image = image_dir / image_name + image_width = getattr(self, f"{image_type}_width") + + # Fetch image if file does not exist + if not image.is_file(): + image_url = page_image_url(vol_id, page_num, image_width) + success = self.download_image(image_url, image) + if success: + stats.log_download(image_type) + else: + stats.log_error(image_type) + logger.debug(f"Failed to 
download {image_type} image {image_name}") + else: + stats.log_skip(image_type) + + # Update progress bar + if self.show_progress: + self.progress_bar.update() return stats @@ -178,7 +213,6 @@ def handle(self, *args, **kwargs): self.crawl_delay = kwargs["crawl_delay"] self.full_width = kwargs["image_width"] self.thumbnail_width = kwargs["thumbnail_width"] - self.verbosity = kwargs.get("verbosity", self.verbosity) self.show_progress = kwargs["progress"] # Validate input arguments @@ -187,7 +221,7 @@ def handle(self, *args, **kwargs): f"Output directory '{self.output_dir}' does not exist or is not a directory" ) if self.thumbnail_width > 250: - raise CommandError(f"Thumbnail width cannot be more than 250 pixels") + raise CommandError("Thumbnail width cannot be more than 250 pixels") # use ids specified via command line when present htids = kwargs.get("htids", []) @@ -208,34 +242,44 @@ def handle(self, *args, **kwargs): if not digworks.exists(): self.stdout.write("No records to download; stopping") return + + # Bind handler for interrupt signal + signal.signal(signal.SIGINT, self.interrupt_handler) + n_vols = digworks.count() self.stdout.write( - f"Downloading images for {digworks.count()} record{pluralize(digworks)}" + f"Downloading images for {n_vols} record{pluralize(digworks)}", ) - # setup main progress bar - overall_progress = None + # Initialize progress bar if self.show_progress: - overall_progress = progressbar.ProgressBar( - line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False - ) - overall_progress.start() - + self.progress_bar = tqdm() + overall_stats = DownloadStats() - for digwork in digworks: + for i, digwork in enumerate(digworks): vol_id = digwork.source_id # Determine page range if digwork.item_type == DigitizedWork.FULL: page_range = range(1, digwork.page_count+1) else: page_range = digwork.page_span - + + # Update progress bar + if self.show_progress: + self.progress_bar.reset(total=len(page_range)) + 
self.progress_bar.set_description( + f"{vol_id} ({i+1}/{n_vols})" + ) + vol_stats = self.download_volume_images(vol_id, page_range) overall_stats.update(vol_stats) # Update overall progress bar - if overall_progress: - overall_progress.increment() - if overall_progress: - overall_progress.finish() - self.stdout.write("\n\n") # To avoid overwriting progress bars + # Check if we need to exit early + if self.interrupted: + break + # Close progres bar + if self.show_progress: + self.progress_bar.close() + if self.interrupted: + self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed.")) self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py index 1755f9c5..cea33e21 100644 --- a/ppa/archive/tests/test_hathi_images.py +++ b/ppa/archive/tests/test_hathi_images.py @@ -88,11 +88,11 @@ def test_update(self): def test_report(self): stats_a = DownloadStats() - report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails" + report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails" assert stats_a.get_report() == report_a stats_b = DownloadStats() stats_b.full.update({"fetch": 5, "skip": 1}) stats_b.thumbnail.update({"fetch": 3, "skip": 2}) - report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails" + report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails" assert stats_b.get_report() == report_b From e18f2ce0e2f0693c25ea75bc8a96dbd2ce0ccdec Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:31:07 -0500 Subject: [PATCH 11/13] Update local settings to prevent log duplication --- ppa/settings/local_settings.py.sample | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ppa/settings/local_settings.py.sample b/ppa/settings/local_settings.py.sample index 28d86dc0..3405b8db 100644 --- 
a/ppa/settings/local_settings.py.sample +++ b/ppa/settings/local_settings.py.sample @@ -103,11 +103,13 @@ LOGGING = { }, 'ppa': { 'handlers': ['console'], - 'level': 'DEBUG' + 'level': 'DEBUG', + 'propagate': False, }, 'parasolr': { 'handlers': ['console'], - 'level': 'WARN' + 'level': 'WARN', + 'propagate': False, }, } } From 51d1591549c74d825dfab3a89ecefa95860c446e Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:54:54 -0500 Subject: [PATCH 12/13] Removed crawl delay and added a few unit tests. --- .../management/commands/hathi_images.py | 69 +++++++------- ppa/archive/tests/test_hathi_images.py | 92 ++++++++++++++----- 2 files changed, 102 insertions(+), 59 deletions(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 93e97989..0553efa7 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -9,7 +9,7 @@ import requests from pathlib import Path import signal -from time import sleep +import time from typing import Self from tqdm import tqdm @@ -63,16 +63,22 @@ def update(self, other: Self) -> None: self.thumbnail.update(other.thumbnail) def get_report(self) -> str: + # No actions logged + if not self.full and not self.thumbnail: + return "No actions taken" + + # Report actions taken report = "" for action in ["fetch", "skip", "error"]: - if action == "error": - # Only report errors when an error is present - if not self.full[action] and not self.thumbnail[action]: - continue + # Only report action when it has been taken + if not self.full[action] and not self.thumbnail[action]: + continue action_str = self.ACTION_STRS[action] + n_full = self.full[action] + n_thumbnail = self.thumbnail[action] if report: report += "\n" - report += f"{action_str}: {self.full[action]} images & {self.thumbnail[action]} thumbnails" + report += f"{action_str}: {n_full} images & {n_thumbnail} 
thumbnails" return report @@ -102,12 +108,6 @@ def add_arguments(self, parser): nargs="+", help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)", ) - parser.add_argument( - "--crawl-delay", - type=int, - help="Delay to be applied between each download in seconds. Default: 1", - default=1, - ) parser.add_argument( "--image-width", type=int, @@ -141,39 +141,34 @@ def interrupt_handler(self, signum, frame): self.interrupted = True self.stdout.write(self.style.WARNING( "Command will exit once this volume's image download is " - "complete.\n Ctrl-C / Interrupt to quit immediately" + "complete.\nCtrl-C / Interrupt to quit immediately" ) ) def download_image(self, page_url: str, out_file: Path) -> bool: + """ + Attempts to download and save an image from the specified URL. + Returns a boolean corresponding to whether the download was successful + """ response = requests.get(page_url) - # log response time - logger.debug(f"Response time: {response.elapsed.total_seconds()}") - self.stdout.write(str(response.headers)) success = False if response.status_code == requests.codes.ok: with out_file.open(mode="wb") as writer: writer.write(response.content) success = True - # For checking throttling rates - # TODO: Consider removing once crawl delays are determined - choke_str = "x-choke info:" - for choke_sfx in ['allowed', 'credit', 'delta', 'max', 'rate']: - header = f"x-choke-{choke_sfx}" - if header in response.headers: - choke_str += f"\n {header}: {response.headers[header]}" - logger.debug(choke_str) elif response.status_code == 503: - logger.debug("WARNING: Received 503 status code. Throttling may have occurred") - # Apply crawl delay after request - sleep(self.crawl_delay) + logger.debug("Received 503 status code. 
Throttling may have occurred") return success def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadStats: - # Determine output volume & thumbnail directories (create as needed) + """ + For a given volume, download the pages corresponding to the provided page range. + """ + # Get volume directory vol_dir = self.output_dir / get_vol_dir(vol_id) vol_dir.mkdir(parents=True, exist_ok=True) + # Get volume's thumbnail directory thumbnail_dir = vol_dir / "thumbnails" thumbnail_dir.mkdir(exist_ok=True) @@ -182,6 +177,7 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Fetch images stats = DownloadStats() + start_time = time.time() for page_num in page_range: image_name = f"{clean_htid}.{page_num:08d}.jpg" @@ -205,12 +201,15 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Update progress bar if self.show_progress: self.progress_bar.update() + # Log volume page completion rates + duration = time.time() - start_time + page_rate = duration / len(page_range) + logger.debug(f"{vol_id}: Compelted in {duration:.2f}s ({page_rate:.2f} sec/page)") return stats def handle(self, *args, **kwargs): self.output_dir = kwargs["output_dir"] - self.crawl_delay = kwargs["crawl_delay"] self.full_width = kwargs["image_width"] self.thumbnail_width = kwargs["thumbnail_width"] self.show_progress = kwargs["progress"] @@ -257,6 +256,10 @@ def handle(self, *args, **kwargs): overall_stats = DownloadStats() for i, digwork in enumerate(digworks): + # Check if we need to exit early + if self.interrupted: + break + vol_id = digwork.source_id # Determine page range if digwork.item_type == DigitizedWork.FULL: @@ -271,15 +274,13 @@ def handle(self, *args, **kwargs): f"{vol_id} ({i+1}/{n_vols})" ) + # Download volume images & update overall stats vol_stats = self.download_volume_images(vol_id, page_range) overall_stats.update(vol_stats) - # Update overall progress bar - # Check if we need to exit early - if 
self.interrupted: - break - # Close progres bar + # Close progress bar if self.show_progress: self.progress_bar.close() + # If interrupted, report the number of volumes completed. if self.interrupted: self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed.")) self.stdout.write(self.style.SUCCESS(overall_stats.get_report())) diff --git a/ppa/archive/tests/test_hathi_images.py b/ppa/archive/tests/test_hathi_images.py index cea33e21..3e79cfe8 100644 --- a/ppa/archive/tests/test_hathi_images.py +++ b/ppa/archive/tests/test_hathi_images.py @@ -1,19 +1,18 @@ -from unittest.mock import call, patch +from io import StringIO +from unittest.mock import Mock, call, patch import pytest import requests - +import signal from ppa.archive.templatetags.ppa_tags import page_image_url -from ppa.archive.management.commands.hathi_images import ( - DownloadStats, -) +from ppa.archive.management.commands import hathi_images class TestDownloadStats: def check_stats( self, - stats: DownloadStats, + stats: hathi_images.DownloadStats, full_fetch: int, full_skip: int, thumbnail_fetch: int, @@ -26,11 +25,11 @@ def check_stats( assert stats.thumbnail["skip"] == thumbnail_skip def test_init(self): - stats = DownloadStats() + stats = hathi_images.DownloadStats() self.check_stats(stats, 0, 0, 0, 0) def test_log_action(self): - stats = DownloadStats() + stats = hathi_images.DownloadStats() # unknown action type with pytest.raises(ValueError, match="Unknown action type 'bad_action'"): stats._log_action("image_type", "bad_action") @@ -59,23 +58,25 @@ def test_log_action(self): stats._log_action("thumbnail", "fetch") self.check_stats(stats, 2, 2, 2, 2) - @patch.object(DownloadStats, "_log_action") + @patch.object(hathi_images.DownloadStats, "_log_action") def test_log_download(self, mock_log_action): - stats = DownloadStats() + stats = hathi_images.DownloadStats() stats.log_download("image_type") mock_log_action.called_once_with("image_type", "fetch") - 
@patch.object(DownloadStats, "_log_action") + @patch.object(hathi_images.DownloadStats, "_log_action") def test_log_skip(self, mock_log_action): - stats = DownloadStats() + stats = hathi_images.DownloadStats() stats.log_download("image_type") mock_log_action.called_once_with("image_type", "skip") def test_update(self): - stats_a = DownloadStats() - stats_b = DownloadStats() - stats_b.full.update({"fetch": 5, "skip": 1}) - stats_b.thumbnail.update({"fetch": 3, "skip": 2}) + stats_a = hathi_images.DownloadStats() + stats_b = hathi_images.DownloadStats() + stats_b.full["fetch"] = 5 + stats_b.full["skip"] = 1 + stats_b.thumbnail["fetch"] = 3 + stats_b.thumbnail["skip"] = 2 self.check_stats(stats_b, 5, 1, 3, 2 ) stats_a.update(stats_b) @@ -87,12 +88,53 @@ def test_update(self): self.check_stats(stats_b, 5, 1, 3, 2 ) def test_report(self): - stats_a = DownloadStats() - report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails" - assert stats_a.get_report() == report_a - - stats_b = DownloadStats() - stats_b.full.update({"fetch": 5, "skip": 1}) - stats_b.thumbnail.update({"fetch": 3, "skip": 2}) - report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails" - assert stats_b.get_report() == report_b + stats = hathi_images.DownloadStats() + assert stats.get_report() == "No actions taken" + + # Only actions that have occurred are reported + stats.full["fetch"] = 5 + report = "Fetched: 5 images & 0 thumbnails" + assert stats.get_report() == report + + stats.thumbnail["skip"] = 3 + report += "\nSkipped: 0 images & 3 thumbnails" + assert stats.get_report() == report + + stats.full["error"] = 1 + stats.thumbnail["error"] = 2 + report += "\nMissed: 1 images & 2 thumbnails" + assert stats.get_report() == report + + +class TestHathiImagesCommand: + @patch("signal.signal") + def test_interrupt_handler(self, mock_signal): + stdout = StringIO() + cmd = hathi_images.Command(stdout=stdout) + + cmd.interrupt_handler(signal.SIGINT, "frame") + 
mock_signal.assert_called_once_with(signal.SIGINT, signal.SIG_DFL) + assert cmd.interrupted + assert stdout.getvalue() == ( + "Command will exit once this volume's image download is complete.\n" + "Ctrl-C / Interrupt to quit immediately\n" + ) + + @patch("requests.get") + def test_download_image(self, mock_get, tmp_path): + cmd = hathi_images.Command() + + # Not ok status + mock_get.return_value = Mock(status_code=503) + result = cmd.download_image("page_url", "out_file") + mock_get.assert_called_once_with("page_url") + assert result is False + + # Ok status + out_file = tmp_path / "test.jpg" + mock_get.reset_mock() + mock_get.return_value = Mock(status_code=200, content=b"image content") + result = cmd.download_image("page_url", out_file) + mock_get.assert_called_once_with("page_url") + assert result is True + assert out_file.read_text() == "image content" From 4df4513ad65f2f89233550248c85ddb304a99fb1 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:33:05 -0500 Subject: [PATCH 13/13] Corrected typo --- ppa/archive/management/commands/hathi_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/hathi_images.py b/ppa/archive/management/commands/hathi_images.py index 0553efa7..990dbea0 100644 --- a/ppa/archive/management/commands/hathi_images.py +++ b/ppa/archive/management/commands/hathi_images.py @@ -204,7 +204,7 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt # Log volume page completion rates duration = time.time() - start_time page_rate = duration / len(page_range) - logger.debug(f"{vol_id}: Compelted in {duration:.2f}s ({page_rate:.2f} sec/page)") + logger.debug(f"{vol_id}: Completed in {duration:.2f}s ({page_rate:.2f} sec/page)") return stats