5
5
import argparse
6
6
from collections import Counter
7
7
from collections .abc import Iterable
8
+ import logging
8
9
import requests
9
10
from pathlib import Path
11
+ import signal
10
12
from time import sleep
11
13
from typing import Self
12
14
13
- import progressbar
15
+ from tqdm import tqdm
14
16
from django .core .management .base import BaseCommand , CommandError
15
17
from django .template .defaultfilters import pluralize
16
18
from corppa .utils .path_utils import encode_htid , get_vol_dir
17
19
18
20
from ppa .archive .models import DigitizedWork
19
21
from ppa .archive .templatetags .ppa_tags import page_image_url
20
22
23
+ logger = logging .getLogger (__name__ )
24
+
21
25
22
26
class DownloadStats :
23
- ACTION_TYPES = {"fetch" , "skip" }
27
+ # Supported actions
28
+ ACTION_TYPES = {"fetch" , "skip" , "error" }
29
+ # Associated strings used for reporting
30
+ ACTION_STRS = {
31
+ "fetch" : "Fetched" ,
32
+ "skip" : "Skipped" ,
33
+ "error" : "Missed" ,
34
+ }
35
+
24
36
def __init__ (self ):
25
37
# Stats for full size images
26
38
self .full = Counter ()
@@ -43,17 +55,25 @@ def log_download(self, image_type: str) -> None:
43
55
def log_skip(self, image_type: str) -> None:
    """
    Register a "skip" action for the given image type.

    :param image_type: image type, forwarded unchanged to ``_log_action``
    """
    action = "skip"
    self._log_action(image_type, action)
45
57
58
def log_error(self, image_type: str) -> None:
    """
    Register an "error" action for the given image type.

    :param image_type: image type, forwarded unchanged to ``_log_action``
    """
    action = "error"
    self._log_action(image_type, action)
60
+
46
61
def update (self , other : Self ) -> None :
47
62
self .full .update (other .full )
48
63
self .thumbnail .update (other .thumbnail )
49
64
50
65
def get_report(self) -> str:
    """
    Build a multi-line, human-readable summary of the recorded counts.

    One line is produced per action, in the order fetch, skip, error,
    using the labels in ``ACTION_STRS``. The error line is left out when
    neither counter holds any errors.

    :returns: the summary lines joined by newlines
    """
    report_lines = []
    for action in ("fetch", "skip", "error"):
        n_full = self.full[action]
        n_thumbnail = self.thumbnail[action]
        # Only report errors when an error is present
        if action == "error" and not (n_full or n_thumbnail):
            continue
        label = self.ACTION_STRS[action]
        report_lines.append(
            f"{label}: {n_full} images & {n_thumbnail} thumbnails"
        )
    return "\n".join(report_lines)
57
77
58
78
59
79
class Command (BaseCommand ):
@@ -63,9 +83,9 @@ class Command(BaseCommand):
63
83
Note: Excerpts cannot be specified individually, only by source (collectively)
64
84
"""
65
85
help = __doc__
66
- #: normal verbosity level
67
- v_normal = 1
68
- verbosity = v_normal
86
+
87
+ # Interrupt flag to exit gracefully (i.e. between volumes) when a signal is caught
88
+ interrupted = False
69
89
70
90
# Argument parsing
71
91
def add_arguments (self , parser ):
@@ -106,17 +126,45 @@ def add_arguments(self, parser):
106
126
help = "Display progress bars to track download progress" ,
107
127
default = True ,
108
128
)
109
-
129
+
130
def interrupt_handler(self, signum, frame):
    """
    Signal handler that requests a graceful stop on the first SIGINT.

    On SIGINT the default handler is put back in place, so a second
    SIGINT is no longer routed to this method, and the ``interrupted``
    flag is set and a notice is written to stdout. Any other signal
    number is ignored.

    :param signum: number of the signal that was received
    :param frame: current stack frame (unused)
    """
    if signum != signal.SIGINT:
        return

    # Restore default signal handler
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # Set interrupt flag
    self.interrupted = True
    notice = (
        "Command will exit once this volume's image download is "
        "complete.\nCtrl-C / Interrupt to quit immediately"
    )
    self.stdout.write(self.style.WARNING(notice))
147
+
110
148
def download_image(self, page_url: str, out_file: Path) -> bool:
    """
    Fetch a single image and save it to disk.

    Requests ``page_url`` and, when the response status is OK, writes the
    response body to ``out_file``. A request that fails at the network
    level (connection error, timeout, ...) is logged and treated as a
    failed download rather than aborting the command. The crawl delay
    (``self.crawl_delay``) is applied after every request attempt,
    whether or not it succeeded.

    :param page_url: URL of the image to request
    :param out_file: path the image bytes are written to on success
    :returns: True if the image was fetched and written, False otherwise
    """
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the command indefinitely
    request_timeout = 60

    success = False
    try:
        response = requests.get(page_url, timeout=request_timeout)
    except requests.RequestException as err:
        logger.warning("Request for %s failed: %s", page_url, err)
    else:
        # log response time
        logger.debug("Response time: %s", response.elapsed.total_seconds())
        # Headers go to the debug log rather than stdout so they do not
        # interleave with progress output
        logger.debug("Response headers: %s", response.headers)
        if response.status_code == requests.codes.ok:
            with out_file.open(mode="wb") as writer:
                writer.write(response.content)
            success = True
            # For checking throttling rates
            # TODO: Consider removing once crawl delays are determined
            choke_lines = ["x-choke info:"]
            for choke_sfx in ("allowed", "credit", "delta", "max", "rate"):
                header = f"x-choke-{choke_sfx}"
                if header in response.headers:
                    choke_lines.append(f"{header}: {response.headers[header]}")
            logger.debug("\n".join(choke_lines))
        elif response.status_code == 503:
            logger.warning(
                "Received 503 status code. Throttling may have occurred"
            )
    # Apply crawl delay after request
    sleep(self.crawl_delay)
    return success
@@ -132,44 +180,31 @@ def download_volume_images(self, vol_id:str, page_range: Iterable) -> DownloadSt
132
180
# Get filename-friendly version of htid
133
181
clean_htid = encode_htid (vol_id )
134
182
135
- # Setup volume-level progress bar
136
- volume_progress = None
137
- if self .show_progress :
138
- volume_progress = progressbar .ProgressBar (
139
- line_offset = 1 , redirect_stdout = True , max_value = len (page_range ), max_error = False
140
- )
141
- volume_progress .start ()
142
-
143
183
# Fetch images
144
184
stats = DownloadStats ()
145
185
for page_num in page_range :
146
186
image_name = f"{ clean_htid } .{ page_num :08d} .jpg"
147
187
148
- # Fetch thumbnail if file does not exist
149
- page_thumbnail = thumbnail_dir / image_name
150
- if not page_thumbnail .is_file ():
151
- thumbnail_url = page_image_url (vol_id , page_num , self .thumbnail_width )
152
- success = self .download_image (thumbnail_url , page_thumbnail )
153
- # TODO: Should we log something different if the download fails?
154
- stats .log_download ("thumbnail" )
155
- else :
156
- stats .log_skip ("thumbnail" )
157
-
158
- # Fetch "full" image if file does not exist
159
- page_image = vol_dir / image_name
160
- if not page_image .is_file ():
161
- image_url = page_image_url (vol_id , page_num , self .full_width )
162
- success = self .download_image (image_url , page_image )
163
- stats .log_download ("full" )
164
- else :
165
- stats .log_skip ("full" )
166
-
167
- # Update volume-specific progress bar
168
- if volume_progress :
169
- volume_progress .increment ()
170
- # Finish volume-specific progress bar
171
- if volume_progress :
172
- volume_progress .finish ()
188
+ for image_type in ["full" , "thumbnail" ]:
189
+ image_dir = vol_dir if image_type == "full" else thumbnail_dir
190
+ image = image_dir / image_name
191
+ image_width = getattr (self , f"{ image_type } _width" )
192
+
193
+ # Fetch image if it does not exist
194
+ if not image .is_file ():
195
+ image_url = page_image_url (vol_id , page_num , image_width )
196
+ success = self .download_image (image_url , image )
197
+ if success :
198
+ stats .log_download (image_type )
199
+ else :
200
+ stats .log_error (image_type )
201
+ logger .debug (f"Failed to download { image_type } image { image_name } " )
202
+ else :
203
+ stats .log_skip (image_type )
204
+
205
+ # Update progress bar
206
+ if self .show_progress :
207
+ self .progress_bar .update ()
173
208
return stats
174
209
175
210
@@ -178,7 +213,6 @@ def handle(self, *args, **kwargs):
178
213
self .crawl_delay = kwargs ["crawl_delay" ]
179
214
self .full_width = kwargs ["image_width" ]
180
215
self .thumbnail_width = kwargs ["thumbnail_width" ]
181
- self .verbosity = kwargs .get ("verbosity" , self .verbosity )
182
216
self .show_progress = kwargs ["progress" ]
183
217
184
218
# Validate input arguments
@@ -187,7 +221,7 @@ def handle(self, *args, **kwargs):
187
221
f"Output directory '{ self .output_dir } ' does not exist or is not a directory"
188
222
)
189
223
if self .thumbnail_width > 250 :
190
- raise CommandError (f "Thumbnail width cannot be more than 250 pixels" )
224
+ raise CommandError ("Thumbnail width cannot be more than 250 pixels" )
191
225
192
226
# use ids specified via command line when present
193
227
htids = kwargs .get ("htids" , [])
@@ -208,34 +242,44 @@ def handle(self, *args, **kwargs):
208
242
if not digworks .exists ():
209
243
self .stdout .write ("No records to download; stopping" )
210
244
return
245
+
246
+ # Bind handler for interrupt signal
247
+ signal .signal (signal .SIGINT , self .interrupt_handler )
211
248
249
+ n_vols = digworks .count ()
212
250
self .stdout .write (
213
- f"Downloading images for { digworks . count () } record{ pluralize (digworks )} "
251
+ f"Downloading images for { n_vols } record{ pluralize (digworks )} " ,
214
252
)
215
253
216
- # setup main progress bar
217
- overall_progress = None
254
+ # Initialize progress bar
218
255
if self .show_progress :
219
- overall_progress = progressbar .ProgressBar (
220
- line_offset = 0 , redirect_stdout = True , max_value = digworks .count (), max_error = False
221
- )
222
- overall_progress .start ()
223
-
256
+ self .progress_bar = tqdm ()
257
+
224
258
overall_stats = DownloadStats ()
225
- for digwork in digworks :
259
+ for i , digwork in enumerate ( digworks ) :
226
260
vol_id = digwork .source_id
227
261
# Determine page range
228
262
if digwork .item_type == DigitizedWork .FULL :
229
263
page_range = range (1 , digwork .page_count + 1 )
230
264
else :
231
265
page_range = digwork .page_span
232
-
266
+
267
+ # Update progress bar
268
+ if self .show_progress :
269
+ self .progress_bar .reset (total = len (page_range ))
270
+ self .progress_bar .set_description (
271
+ f"{ vol_id } ({ i + 1 } /{ n_vols } )"
272
+ )
273
+
233
274
vol_stats = self .download_volume_images (vol_id , page_range )
234
275
overall_stats .update (vol_stats )
235
276
# Update overall progress bar
236
- if overall_progress :
237
- overall_progress .increment ()
238
- if overall_progress :
239
- overall_progress .finish ()
240
- self .stdout .write ("\n \n " ) # To avoid overwriting progress bars
277
+ # Check if we need to exit early
278
+ if self .interrupted :
279
+ break
280
+ # Close progress bar
281
+ if self .show_progress :
282
+ self .progress_bar .close ()
283
+ if self .interrupted :
284
+ self .stdout .write (self .style .WARNING (f"Exited early with { i } volumes completed." ))
241
285
self .stdout .write (self .style .SUCCESS (overall_stats .get_report ()))
0 commit comments