Skip to content

Commit

Permalink
Updated hathi images command
Browse files Browse the repository at this point in the history
* Added keyboard interrupt handling
* Modified progress bar
* Added logging
  • Loading branch information
laurejt committed Nov 14, 2024
1 parent 0f3880a commit 24c02f9
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 67 deletions.
174 changes: 109 additions & 65 deletions ppa/archive/management/commands/hathi_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,34 @@
import argparse
from collections import Counter
from collections.abc import Iterable
import logging
import requests
from pathlib import Path
import signal
from time import sleep
from typing import Self

import progressbar
from tqdm import tqdm
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import pluralize
from corppa.utils.path_utils import encode_htid, get_vol_dir

from ppa.archive.models import DigitizedWork
from ppa.archive.templatetags.ppa_tags import page_image_url

logger = logging.getLogger(__name__)


class DownloadStats:
ACTION_TYPES = {"fetch", "skip"}
# Supported actions
ACTION_TYPES = {"fetch", "skip", "error"}
# Associated strings used for reporting
ACTION_STRS = {
"fetch": "Fetched",
"skip": "Skipped",
"error": "Missed",
}

def __init__(self):
# Stats for full size images
self.full = Counter()
Expand All @@ -43,17 +55,25 @@ def log_download(self, image_type: str) -> None:
def log_skip(self, image_type: str) -> None:
    """Record a "skip" action for the given image type via ``_log_action``."""
    self._log_action(image_type, "skip")

Check warning on line 56 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L56

Added line #L56 was not covered by tests

def log_error(self, image_type: str) -> None:
    """Record an "error" action for the given image type via ``_log_action``."""
    self._log_action(image_type, "error")

Check warning on line 59 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L59

Added line #L59 was not covered by tests

def update(self, other: Self) -> None:
self.full.update(other.full)
self.thumbnail.update(other.thumbnail)

def get_report(self) -> str:
    """
    Build a multi-line summary of the recorded actions.

    One line is produced per action, in the order fetch, skip, error,
    each labelled with the matching entry of ``ACTION_STRS``. The error
    line is only included when at least one full-size image or thumbnail
    error has been recorded.

    :returns: the report lines joined by newlines
    """
    lines = []
    for action in ("fetch", "skip", "error"):
        n_full = self.full[action]
        n_thumbnail = self.thumbnail[action]
        # Only report errors when an error is present
        if action == "error" and not (n_full or n_thumbnail):
            continue
        lines.append(
            f"{self.ACTION_STRS[action]}: {n_full} images & {n_thumbnail} thumbnails"
        )
    return "\n".join(lines)


class Command(BaseCommand):
Expand All @@ -63,9 +83,9 @@ class Command(BaseCommand):
Note: Excerpts cannot be specified individually, only by source (collectively)
"""
help = __doc__
#: normal verbosity level
v_normal = 1
verbosity = v_normal

# Interrupt flag to exit gracefully (i.e. between volumes) when a signal is caught
interrupted = False

# Argument parsing
def add_arguments(self, parser):
Expand Down Expand Up @@ -106,17 +126,45 @@ def add_arguments(self, parser):
help="Display progress bars to track download progress",
default=True,
)


def interrupt_handler(self, signum, frame):
    """
    Signal handler that requests a graceful stop on SIGINT.

    When called for SIGINT, the default SIGINT handler is reinstated (so
    a further interrupt is no longer routed through this method), the
    ``interrupted`` flag is set, and a warning explaining how to quit
    immediately is written to stdout. Calls for any other signal number
    do nothing.
    """
    if signum != signal.SIGINT:
        return
    # Hand SIGINT back to the default handler for any further interrupts
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # Raise the interrupt flag
    self.interrupted = True
    notice = (
        "Command will exit once this volume's image download is "
        "complete.\n Ctrl-C / Interrupt to quit immediately"
    )
    self.stdout.write(self.style.WARNING(notice))

def download_image(self, page_url: str, out_file: Path) -> bool:
    """
    Download a single image and save it to disk.

    Requests ``page_url`` and, when the response status is OK, writes the
    response body to ``out_file``. The crawl delay (``self.crawl_delay``)
    is applied after every request, whether or not it succeeded.

    :param page_url: URL of the image to fetch
    :param out_file: path the image bytes are written to
    :returns: True if the image was fetched and saved, False otherwise
    """
    response = requests.get(page_url)
    # Log response time
    logger.debug("Response time: %s", response.elapsed.total_seconds())
    # Response headers are diagnostic detail: send them to the debug log
    # instead of stdout, where one dump per request garbles the progress bar
    logger.debug("Response headers: %s", response.headers)

    success = False
    if response.status_code == requests.codes.ok:
        out_file.write_bytes(response.content)
        success = True
        # For checking throttling rates
        # TODO: Consider removing once crawl delays are determined
        choke_lines = ["x-choke info:"]
        for choke_sfx in ("allowed", "credit", "delta", "max", "rate"):
            header = f"x-choke-{choke_sfx}"
            if header in response.headers:
                choke_lines.append(f" {header}: {response.headers[header]}")
        logger.debug("\n".join(choke_lines))
    elif response.status_code == 503:
        logger.debug("WARNING: Received 503 status code. Throttling may have occurred")
    # Apply crawl delay after request
    sleep(self.crawl_delay)
    return success

Check warning on line 170 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L169-L170

Added lines #L169 - L170 were not covered by tests
Expand All @@ -132,44 +180,31 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt
# Get filename-friendly version of htid
clean_htid = encode_htid(vol_id)

Check warning on line 181 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L181

Added line #L181 was not covered by tests

# Setup volume-level progress bar
volume_progress = None
if self.show_progress:
volume_progress = progressbar.ProgressBar(
line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False
)
volume_progress.start()

# Fetch images
stats = DownloadStats()
for page_num in page_range:
image_name = f"{clean_htid}.{page_num:08d}.jpg"

Check warning on line 186 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L184-L186

Added lines #L184 - L186 were not covered by tests

# Fetch thumbnail if file does not exist
page_thumbnail = thumbnail_dir / image_name
if not page_thumbnail.is_file():
thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width)
success = self.download_image(thumbnail_url, page_thumbnail)
# TODO: Should we log something different if the download fails?
stats.log_download("thumbnail")
else:
stats.log_skip("thumbnail")

# Fetch "full" image if file does not exist
page_image = vol_dir / image_name
if not page_image.is_file():
image_url = page_image_url(vol_id, page_num, self.full_width)
success = self.download_image(image_url, page_image)
stats.log_download("full")
else:
stats.log_skip("full")

# Update volume-specific progress bar
if volume_progress:
volume_progress.increment()
# Finish volume-specific progress bar
if volume_progress:
volume_progress.finish()
for image_type in ["full", "thumbnail"]:
image_dir = vol_dir if image_type == "full" else thumbnail_dir
image = image_dir / image_name
image_width = getattr(self, f"{image_type}_width")

Check warning on line 191 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L188-L191

Added lines #L188 - L191 were not covered by tests

# Fetch image if it does not exist
if not image.is_file():
image_url = page_image_url(vol_id, page_num, image_width)
success = self.download_image(image_url, image)
if success:
stats.log_download(image_type)

Check warning on line 198 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L194-L198

Added lines #L194 - L198 were not covered by tests
else:
stats.log_error(image_type)
logger.debug(f"Failed to download {image_type} image {image_name}")

Check warning on line 201 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L200-L201

Added lines #L200 - L201 were not covered by tests
else:
stats.log_skip(image_type)

Check warning on line 203 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L203

Added line #L203 was not covered by tests

# Update progress bar
if self.show_progress:
self.progress_bar.update()
return stats

Check warning on line 208 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L206-L208

Added lines #L206 - L208 were not covered by tests


Expand All @@ -178,7 +213,6 @@ def handle(self, *args, **kwargs):
self.crawl_delay = kwargs["crawl_delay"]
self.full_width = kwargs["image_width"]
self.thumbnail_width = kwargs["thumbnail_width"]
self.verbosity = kwargs.get("verbosity", self.verbosity)
self.show_progress = kwargs["progress"]

Check warning on line 216 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L212-L216

Added lines #L212 - L216 were not covered by tests

# Validate input arguments
Expand All @@ -187,7 +221,7 @@ def handle(self, *args, **kwargs):
f"Output directory '{self.output_dir}' does not exist or is not a directory"
)
if self.thumbnail_width > 250:
raise CommandError(f"Thumbnail width cannot be more than 250 pixels")
raise CommandError("Thumbnail width cannot be more than 250 pixels")

Check warning on line 224 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L223-L224

Added lines #L223 - L224 were not covered by tests

# use ids specified via command line when present
htids = kwargs.get("htids", [])

Check warning on line 227 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L227

Added line #L227 was not covered by tests
Expand All @@ -208,34 +242,44 @@ def handle(self, *args, **kwargs):
if not digworks.exists():
self.stdout.write("No records to download; stopping")
return

Check warning on line 244 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L242-L244

Added lines #L242 - L244 were not covered by tests

# Bind handler for interrupt signal
signal.signal(signal.SIGINT, self.interrupt_handler)

Check warning on line 247 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L247

Added line #L247 was not covered by tests

n_vols = digworks.count()
self.stdout.write(

Check warning on line 250 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L249-L250

Added lines #L249 - L250 were not covered by tests
f"Downloading images for {digworks.count()} record{pluralize(digworks)}"
f"Downloading images for {n_vols} record{pluralize(digworks)}",
)

# setup main progress bar
overall_progress = None
# Initialize progress bar
if self.show_progress:
overall_progress = progressbar.ProgressBar(
line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False
)
overall_progress.start()

self.progress_bar = tqdm()

Check warning on line 256 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L255-L256

Added lines #L255 - L256 were not covered by tests

overall_stats = DownloadStats()
for digwork in digworks:
for i, digwork in enumerate(digworks):
vol_id = digwork.source_id

Check warning on line 260 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L258-L260

Added lines #L258 - L260 were not covered by tests
# Determine page range
if digwork.item_type == DigitizedWork.FULL:
page_range = range(1, digwork.page_count+1)

Check warning on line 263 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L262-L263

Added lines #L262 - L263 were not covered by tests
else:
page_range = digwork.page_span

Check warning on line 265 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L265

Added line #L265 was not covered by tests


# Update progress bar
if self.show_progress:
self.progress_bar.reset(total=len(page_range))
self.progress_bar.set_description(

Check warning on line 270 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L268-L270

Added lines #L268 - L270 were not covered by tests
f"{vol_id} ({i+1}/{n_vols})"
)

vol_stats = self.download_volume_images(vol_id, page_range)
overall_stats.update(vol_stats)

Check warning on line 275 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L274-L275

Added lines #L274 - L275 were not covered by tests
# Update overall progress bar
if overall_progress:
overall_progress.increment()
if overall_progress:
overall_progress.finish()
self.stdout.write("\n\n") # To avoid overwriting progress bars
# Check if we need to exit early
if self.interrupted:
break

Check warning on line 279 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L278-L279

Added lines #L278 - L279 were not covered by tests
# Close progress bar
if self.show_progress:
self.progress_bar.close()
if self.interrupted:
self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed."))
self.stdout.write(self.style.SUCCESS(overall_stats.get_report()))

Check warning on line 285 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L281-L285

Added lines #L281 - L285 were not covered by tests
4 changes: 2 additions & 2 deletions ppa/archive/tests/test_hathi_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,11 @@ def test_update(self):

def test_report(self):
    """Check the text report for empty, populated, and error-bearing stats."""
    # No recorded actions: fetch and skip lines only, no error line
    stats_a = DownloadStats()
    report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails"
    assert stats_a.get_report() == report_a

    # Fetches and skips recorded for both image types
    stats_b = DownloadStats()
    stats_b.full.update({"fetch": 5, "skip": 1})
    stats_b.thumbnail.update({"fetch": 3, "skip": 2})
    report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails"
    assert stats_b.get_report() == report_b

    # The error line appears once any error has been recorded
    stats_c = DownloadStats()
    stats_c.full.update({"fetch": 2, "error": 1})
    report_c = (
        "Fetched: 2 images & 0 thumbnails\n"
        "Skipped: 0 images & 0 thumbnails\n"
        "Missed: 1 images & 0 thumbnails"
    )
    assert stats_c.get_report() == report_c

0 comments on commit 24c02f9

Please sign in to comment.