
Command for fetching HathiTrust Images #694

Merged Nov 15, 2024 (14 commits)

Changes from 3 commits
42 changes: 41 additions & 1 deletion DEVELOPERNOTES.rst
@@ -1,6 +1,46 @@
Troubleshooting
===============

Local Solr setup
----------------
Install Solr via `brew <https://formulae.brew.sh/formula/solr>`_::

brew install solr

Copy the Solr config files in as a configset named `ppa`::

cp -r solr_conf /opt/homebrew/opt/solr/server/solr/configsets/ppa

Create symbolic link to configsets in the Solr home directory::

ln -s /opt/homebrew/opt/solr/server/solr/configsets /opt/homebrew/var/lib/solr/

Create a new core with the `ppa` configset (Solr must be running)::

curl "http://localhost:8983/solr/admin/cores?action=CREATE&name=ppa&configSet=ppa"

When the configset has changed, copy in the updated Solr config files::

    cp solr_conf/* /opt/homebrew/var/lib/solr/configsets/ppa/

Start Solr by running the following command::

/opt/homebrew/opt/solr/bin/solr start -f
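Once Solr is running, the new `ppa` core can be sanity-checked from Python. This is an illustrative sketch rather than project code; it assumes the default local Solr port (8983) and uses the CoreAdmin STATUS API via the `requests` library (the STATUS response contains an empty dict for cores Solr does not know about):

```python
import requests

SOLR_BASE = "http://localhost:8983/solr"  # default local Solr port


def core_status(name: str, base_url: str = SOLR_BASE) -> dict:
    """Fetch CoreAdmin STATUS information for a single core."""
    resp = requests.get(
        f"{base_url}/admin/cores",
        params={"action": "STATUS", "core": name},
    )
    resp.raise_for_status()
    return resp.json()


def core_exists(status_response: dict, name: str) -> bool:
    # an unknown core appears in the STATUS response as an empty dict
    return bool(status_response.get("status", {}).get(name))
```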


Local PostgreSQL
----------------
Install PostgreSQL via `brew <https://formulae.brew.sh/formula/postgresql@15>`_::

brew install postgresql@15

Start PostgreSQL (or restart after an upgrade)::

brew services start postgresql@15

Add PostgreSQL to your PATH::

echo 'export PATH="/opt/homebrew/opt/postgresql@15/bin:$PATH"' >> ~/.zshrc


Solr setup with Docker
@@ -92,7 +132,7 @@ To replace a local development database with a dump of production data::

psql -d postgres -c "DROP DATABASE cdh_ppa;"
psql -d postgres -c "CREATE DATABASE cdh_ppa;"
psql -d postgres -U cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql
psql cdh_ppa < data/13_daily_cdh_ppa_cdh_ppa_2023-01-11.Wednesday.sql


Updating Wagtail test fixture
3 changes: 2 additions & 1 deletion dev-requirements.txt
@@ -4,4 +4,5 @@ pytest-django>=4.5.2
pytest-cov
django-debug-toolbar
sphinx>=7.2
pre-commit
pre-commit
ruff
165 changes: 165 additions & 0 deletions ppa/archive/management/commands/hathi_images.py
@@ -0,0 +1,165 @@
import requests
from pathlib import Path
from time import sleep

import progressbar
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import pluralize
from corppa.utils.path_utils import encode_htid, get_vol_dir

from ppa.archive.models import DigitizedWork
from ppa.archive.templatetags.ppa_tags import page_image_url


class Command(BaseCommand):
"""
Download HathiTrust page image data via image server

Note: Excerpts cannot be specified individually, only by source (collectively)
"""
help = __doc__
#: normal verbosity level
v_normal = 1
verbosity = v_normal
#: crawl delay (in seconds)
crawl_delay = 1

def add_arguments(self, parser):
parser.add_argument(
"out",
type=Path,
help="Top-level output directory")
parser.add_argument(
"--htids",
nargs="*",
help="Optional list of HathiTrust ids to download",
)
parser.add_argument(
"--progress",
action="store_true",
help="Display progress bars to track download progress",
default=True,
)

def download_image(self, page_url: str, out_file: Path) -> None:
response = requests.get(page_url)
if response.status_code == requests.codes.ok:
with out_file.open(mode="wb") as writer:
writer.write(response.content)
# Apply crawl delay after request
sleep(self.crawl_delay)


def handle(self, *args, **kwargs):
self.verbosity = kwargs.get("verbosity", self.v_normal)
self.options = kwargs

# validate output directory
if not kwargs["out"]:
raise CommandError("An output directory must be specified")
output_dir = kwargs["out"]
if not output_dir.is_dir():
raise CommandError(
f"Output directory '{output_dir}' does not exist or is not a directory"
)

# use ids specified via command line when present
htids = kwargs.get("htids", [])

# by default, sync data for all non-suppressed hathi source ids
digworks = DigitizedWork.objects.filter(
status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI
)

# if htids are specified via parameter, use them to filter
# the queryset, to ensure we only sync records that are
# in the database and not suppressed
if htids:
digworks = digworks.filter(source_id__in=htids)

# bail out if there's nothing to do
# (e.g., explicit htids only and none valid)
if not digworks.exists():
self.stdout.write("No records to download; stopping")
return

# setup main progress bar
overall_progress = None
if self.options["progress"]:
overall_progress = progressbar.ProgressBar(
redirect_stdout=True, max_value=digworks.count(), max_error=False
)
overall_progress.start()

self.stdout.write(
f"Downloading images for {digworks.count()} record{pluralize(digworks)}"
)

for digwork in digworks:
vol_id = digwork.source_id

# Determine output volume & thumbnail directories (create as needed)
vol_dir = output_dir / get_vol_dir(vol_id)
vol_dir.mkdir(parents=True, exist_ok=True)
thumbnail_dir = vol_dir / "thumbnails"
thumbnail_dir.mkdir(exist_ok=True)

# Get filename-friendly version of htid
clean_htid = encode_htid(vol_id)

# Determine page range
if digwork.item_type == DigitizedWork.FULL:
page_range = range(1, digwork.page_count + 1)
else:
page_range = digwork.page_span

# Setup volume-level progress bar
volume_progress = None
if self.options["progress"]:
volume_progress = progressbar.ProgressBar(
redirect_stdout=True, max_value=len(page_range), max_error=False
)
volume_progress.start()

# Fetch images
stats = {
"image": {"fetch": 0, "skip": 0},
"thumbnail": {"fetch": 0, "skip": 0}
}
for page_num in page_range:
image_name = f"{clean_htid}.{page_num:08d}.jpg"

# Fetch thumbnail if file does not exist
page_thumbnail = thumbnail_dir / image_name
if not page_thumbnail.is_file():
Contributor comment: I wondered about a volume-level check to skip an entire volume if the expected number of images are present, but glad to see you have this check.

thumbnail_url = page_image_url(vol_id, page_num, 250)
self.download_image(thumbnail_url, page_thumbnail)
stats["thumbnail"]["fetch"] += 1
else:
stats["thumbnail"]["skip"] += 1

# Fetch "full" image if file does not exist
page_image = vol_dir / image_name
if not page_image.is_file():
image_url = page_image_url(vol_id, page_num, 800)
self.download_image(image_url, page_image)
stats["image"]["fetch"] += 1
else:
stats["image"]["skip"] += 1

# Update volume-specific progress bar
if volume_progress:
volume_progress.increment()
# Finish volume-specific progress bar
if volume_progress:
volume_progress.finish()
self.stdout.write(
f"{vol_id}: Fetched {stats['image']['fetch']} images & "
f"{stats['thumbnail']['fetch']} thumbnails; "
f"Skipped {stats['image']['skip']} images & "
f"{stats['thumbnail']['skip']} thumbnails"
)
# Update overall progress bar
if overall_progress:
overall_progress.increment()
if overall_progress:
overall_progress.finish()

Check notice on line 165 in ppa/archive/management/commands/hathi_images.py (codefactor.io / CodeFactor): Complex Method, hathi_images.py#L52-L165
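The CodeFactor complexity notice points at `handle` doing everything inline. One way to reduce that complexity is extracting the per-page skip-or-fetch loop into a small, independently testable helper. This is only an illustrative sketch, not code from the PR; the names `fetch_page_images` and `download` are hypothetical, and it reproduces just the skip-if-exists and crawl-delay behavior:

```python
import time
from pathlib import Path


def fetch_page_images(page_nums, out_dir: Path, download, crawl_delay: float = 1) -> dict:
    """Download one image per page into out_dir, skipping files that already exist."""
    stats = {"fetch": 0, "skip": 0}
    for num in page_nums:
        out_file = out_dir / f"{num:08d}.jpg"
        if out_file.is_file():
            # idempotent re-runs: never re-download an existing image
            stats["skip"] += 1
            continue
        download(num, out_file)  # caller supplies the actual HTTP fetch
        time.sleep(crawl_delay)  # apply crawl delay only after a real request
        stats["fetch"] += 1
    return stats
```

`handle` could then call this helper twice per volume (thumbnails and full images), keeping the command method focused on queryset filtering and progress reporting.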
2 changes: 1 addition & 1 deletion ppa/archive/tests/test_gale.py
@@ -272,7 +272,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
# Set up get_local_ocr so that only the 3rd page's text is found
mock_get_local_ocr.side_effect = [FileNotFoundError, FileNotFoundError, "local ocr text"]
page_data = list(gale_api.get_item_pages(item_id))
mock_get_item.called_once()
mock_get_item.assert_called_once()
assert mock_get_local_ocr.call_count == 3
assert len(page_data) == 3
assert [ p["page_id"] for p in page_data ] == ["0001", "0002", "0003"]
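The one-character-looking fix in this diff is worth spelling out: `Mock` auto-creates attributes on access, so `mock_get_item.called_once()` silently returns a child mock instead of asserting anything, and the test passes no matter what. A standalone illustration using only `unittest.mock`:

```python
from unittest.mock import Mock

m = Mock()

# Typo-style call: Mock invents a `called_once` attribute on the fly,
# so this line "succeeds" even though m itself was never called.
result = m.called_once()

# The real assertion method raises unless the mock was called exactly once.
try:
    m.assert_called_once()
    outcome = "passed"
except AssertionError:
    outcome = "failed"  # correct: m was never called directly
```

Note that `Mock` does guard against misspellings that start with `assert` (raising `AttributeError`), but `called_once` does not match that prefix check, which is why the bug goes unnoticed.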
4 changes: 3 additions & 1 deletion requirements.txt
@@ -29,4 +29,6 @@ psycopg2-binary
multiprocess
django-split-settings
# only needed for the 'generate_textcorpus' manage command
orjsonl
orjsonl
# TODO: Switch to develop once feature branch is merged
git+https://github.com/Princeton-CDH/ppa-nlp@feature/hathi-paths#egg=corppa