Skip to content

Commit 24c02f9

Browse files
committed
Updated hathi images command
* Added keyboard interrupt handling
* Modified progress bar
* Added logging
1 parent 0f3880a commit 24c02f9

File tree

2 files changed

+111
-67
lines changed

2 files changed

+111
-67
lines changed

ppa/archive/management/commands/hathi_images.py

Lines changed: 109 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,34 @@
55
import argparse
66
from collections import Counter
77
from collections.abc import Iterable
8+
import logging
89
import requests
910
from pathlib import Path
11+
import signal
1012
from time import sleep
1113
from typing import Self
1214

13-
import progressbar
15+
from tqdm import tqdm
1416
from django.core.management.base import BaseCommand, CommandError
1517
from django.template.defaultfilters import pluralize
1618
from corppa.utils.path_utils import encode_htid, get_vol_dir
1719

1820
from ppa.archive.models import DigitizedWork
1921
from ppa.archive.templatetags.ppa_tags import page_image_url
2022

23+
logger = logging.getLogger(__name__)
24+
2125

2226
class DownloadStats:
23-
ACTION_TYPES = {"fetch", "skip"}
27+
# Supported actions
28+
ACTION_TYPES = {"fetch", "skip", "error"}
29+
# Associated strings used for reporting
30+
ACTION_STRS = {
31+
"fetch": "Fetched",
32+
"skip": "Skipped",
33+
"error": "Missed",
34+
}
35+
2436
def __init__(self):
2537
# Stats for full size images
2638
self.full = Counter()
@@ -43,17 +55,25 @@ def log_download(self, image_type: str) -> None:
4355
def log_skip(self, image_type: str) -> None:
4456
self._log_action(image_type, "skip")
4557

58+
def log_error(self, image_type: str) -> None:
59+
self._log_action(image_type, "error")
60+
4661
def update(self, other: Self) -> None:
4762
self.full.update(other.full)
4863
self.thumbnail.update(other.thumbnail)
4964

5065
def get_report(self) -> str:
51-
return (
52-
f"Fetched {self.full['fetch']} images & "
53-
f"{self.thumbnail['fetch']} thumbnails; "
54-
f"Skipped {self.full['skip']} images & "
55-
f"{self.thumbnail['skip']} thumbnails"
56-
)
66+
report = ""
67+
for action in ["fetch", "skip", "error"]:
68+
if action == "error":
69+
# Only report errors when an error is present
70+
if not self.full[action] and not self.thumbnail[action]:
71+
continue
72+
action_str = self.ACTION_STRS[action]
73+
if report:
74+
report += "\n"
75+
report += f"{action_str}: {self.full[action]} images & {self.thumbnail[action]} thumbnails"
76+
return report
5777

5878

5979
class Command(BaseCommand):
@@ -63,9 +83,9 @@ class Command(BaseCommand):
6383
Note: Excerpts cannot be specified individually, only by source (collectively)
6484
"""
6585
help = __doc__
66-
#: normal verbosity level
67-
v_normal = 1
68-
verbosity = v_normal
86+
87+
# Interrupt flag to exit gracefully (i.e. between volumes) when a signal is caught
88+
interrupted = False
6989

7090
# Argument parsing
7191
def add_arguments(self, parser):
@@ -106,17 +126,45 @@ def add_arguments(self, parser):
106126
help="Display progress bars to track download progress",
107127
default=True,
108128
)
109-
129+
130+
def interrupt_handler(self, signum, frame):
131+
"""
132+
Handle SIGINT as gracefully as possible. On the first SIGINT, a flag is set
133+
so that the command will exit after the current volume's image download
134+
is complete. Additionally, the default signal handler is restored so a
135+
second SIGINT will cause the command to immediately exit.
136+
"""
137+
if signum == signal.SIGINT:
138+
# Restore default signal handler
139+
signal.signal(signal.SIGINT, signal.SIG_DFL)
140+
# Set interrupt flag
141+
self.interrupted = True
142+
self.stdout.write(self.style.WARNING(
143+
"Command will exit once this volume's image download is "
144+
"complete.\n Ctrl-C / Interrupt to quit immediately"
145+
)
146+
)
147+
110148
def download_image(self, page_url: str, out_file: Path) -> bool:
111149
response = requests.get(page_url)
150+
# log response time
151+
logger.debug(f"Response time: {response.elapsed.total_seconds()}")
152+
self.stdout.write(str(response.headers))
112153
success = False
113154
if response.status_code == requests.codes.ok:
114155
with out_file.open(mode="wb") as writer:
115156
writer.write(response.content)
116157
success = True
117-
else:
118-
if self.verbosity > self.v_normal:
119-
self.stdout(f"Warning: Failed to fetch image {out_file.name}")
158+
# For checking throttling rates
159+
# TODO: Consider removing once crawl delays are determined
160+
choke_str = "x-choke info:"
161+
for choke_sfx in ['allowed', 'credit', 'delta', 'max', 'rate']:
162+
header = f"x-choke-{choke_sfx}"
163+
if header in response.headers:
164+
choke_str += f"\n {header}: {response.headers[header]}"
165+
logger.debug(choke_str)
166+
elif response.status_code == 503:
167+
logger.debug("WARNING: Received 503 status code. Throttling may have occurred")
120168
# Apply crawl delay after request
121169
sleep(self.crawl_delay)
122170
return success
@@ -132,44 +180,31 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt
132180
# Get filename-friendly version of htid
133181
clean_htid = encode_htid(vol_id)
134182

135-
# Setup volume-level progress bar
136-
volume_progress = None
137-
if self.show_progress:
138-
volume_progress = progressbar.ProgressBar(
139-
line_offset=1, redirect_stdout=True, max_value=len(page_range), max_error=False
140-
)
141-
volume_progress.start()
142-
143183
# Fetch images
144184
stats = DownloadStats()
145185
for page_num in page_range:
146186
image_name = f"{clean_htid}.{page_num:08d}.jpg"
147187

148-
# Fetch thumbnail if file does not exist
149-
page_thumbnail = thumbnail_dir / image_name
150-
if not page_thumbnail.is_file():
151-
thumbnail_url = page_image_url(vol_id, page_num, self.thumbnail_width)
152-
success = self.download_image(thumbnail_url, page_thumbnail)
153-
# TODO: Should we log something different if the download fails?
154-
stats.log_download("thumbnail")
155-
else:
156-
stats.log_skip("thumbnail")
157-
158-
# Fetch "full" image if file does not exist
159-
page_image = vol_dir / image_name
160-
if not page_image.is_file():
161-
image_url = page_image_url(vol_id, page_num, self.full_width)
162-
success = self.download_image(image_url, page_image)
163-
stats.log_download("full")
164-
else:
165-
stats.log_skip("full")
166-
167-
# Update volume-specific progress bar
168-
if volume_progress:
169-
volume_progress.increment()
170-
# Finish volume-specific progress bar
171-
if volume_progress:
172-
volume_progress.finish()
188+
for image_type in ["full", "thumbnail"]:
189+
image_dir = vol_dir if image_type == "full" else thumbnail_dir
190+
image = image_dir / image_name
191+
image_width = getattr(self, f"{image_type}_width")
192+
193+
# Fetch image if it does not exist
194+
if not image.is_file():
195+
image_url = page_image_url(vol_id, page_num, image_width)
196+
success = self.download_image(image_url, image)
197+
if success:
198+
stats.log_download(image_type)
199+
else:
200+
stats.log_error(image_type)
201+
logger.debug(f"Failed to download {image_type} image {image_name}")
202+
else:
203+
stats.log_skip(image_type)
204+
205+
# Update progress bar
206+
if self.show_progress:
207+
self.progress_bar.update()
173208
return stats
174209

175210

@@ -178,7 +213,6 @@ def handle(self, *args, **kwargs):
178213
self.crawl_delay = kwargs["crawl_delay"]
179214
self.full_width = kwargs["image_width"]
180215
self.thumbnail_width = kwargs["thumbnail_width"]
181-
self.verbosity = kwargs.get("verbosity", self.verbosity)
182216
self.show_progress = kwargs["progress"]
183217

184218
# Validate input arguments
@@ -187,7 +221,7 @@ def handle(self, *args, **kwargs):
187221
f"Output directory '{self.output_dir}' does not exist or is not a directory"
188222
)
189223
if self.thumbnail_width > 250:
190-
raise CommandError(f"Thumbnail width cannot be more than 250 pixels")
224+
raise CommandError("Thumbnail width cannot be more than 250 pixels")
191225

192226
# use ids specified via command line when present
193227
htids = kwargs.get("htids", [])
@@ -208,34 +242,44 @@ def handle(self, *args, **kwargs):
208242
if not digworks.exists():
209243
self.stdout.write("No records to download; stopping")
210244
return
245+
246+
# Bind handler for interrupt signal
247+
signal.signal(signal.SIGINT, self.interrupt_handler)
211248

249+
n_vols = digworks.count()
212250
self.stdout.write(
213-
f"Downloading images for {digworks.count()} record{pluralize(digworks)}"
251+
f"Downloading images for {n_vols} record{pluralize(digworks)}",
214252
)
215253

216-
# setup main progress bar
217-
overall_progress = None
254+
# Initialize progress bar
218255
if self.show_progress:
219-
overall_progress = progressbar.ProgressBar(
220-
line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False
221-
)
222-
overall_progress.start()
223-
256+
self.progress_bar = tqdm()
257+
224258
overall_stats = DownloadStats()
225-
for digwork in digworks:
259+
for i, digwork in enumerate(digworks):
226260
vol_id = digwork.source_id
227261
# Determine page range
228262
if digwork.item_type == DigitizedWork.FULL:
229263
page_range = range(1, digwork.page_count+1)
230264
else:
231265
page_range = digwork.page_span
232-
266+
267+
# Update progress bar
268+
if self.show_progress:
269+
self.progress_bar.reset(total=len(page_range))
270+
self.progress_bar.set_description(
271+
f"{vol_id} ({i+1}/{n_vols})"
272+
)
273+
233274
vol_stats = self.download_volume_images(vol_id, page_range)
234275
overall_stats.update(vol_stats)
235276
# Update overall progress bar
236-
if overall_progress:
237-
overall_progress.increment()
238-
if overall_progress:
239-
overall_progress.finish()
240-
self.stdout.write("\n\n") # To avoid overwriting progress bars
277+
# Check if we need to exit early
278+
if self.interrupted:
279+
break
280+
# Close progress bar
281+
if self.show_progress:
282+
self.progress_bar.close()
283+
if self.interrupted:
284+
self.stdout.write(self.style.WARNING(f"Exited early with {i} volumes completed."))
241285
self.stdout.write(self.style.SUCCESS(overall_stats.get_report()))

ppa/archive/tests/test_hathi_images.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,11 @@ def test_update(self):
8888

8989
def test_report(self):
9090
stats_a = DownloadStats()
91-
report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails"
91+
report_a = "Fetched: 0 images & 0 thumbnails\nSkipped: 0 images & 0 thumbnails"
9292
assert stats_a.get_report() == report_a
9393

9494
stats_b = DownloadStats()
9595
stats_b.full.update({"fetch": 5, "skip": 1})
9696
stats_b.thumbnail.update({"fetch": 3, "skip": 2})
97-
report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails"
97+
report_b = "Fetched: 5 images & 3 thumbnails\nSkipped: 1 images & 2 thumbnails"
9898
assert stats_b.get_report() == report_b

0 commit comments

Comments
 (0)