Skip to content

Commit

Permalink
Refactor multithreaded video download logic for better error handling
Browse files: browse the repository at this point in the history
  • Loading branch information
dan-niles committed Sep 6, 2024
1 parent 962b6b3 commit 69cb71c
Showing 1 changed file with 36 additions and 25 deletions.
61 changes: 36 additions & 25 deletions scraper/src/youtube2zim/scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -196,6 +196,7 @@ def __init__(
if stats_filename:
self.stats_path = Path(stats_filename).expanduser()
self.stats_path.parent.mkdir(parents=True, exist_ok=True)
self.succeeded, self.failed = set(), set()

Check warning on line 199 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L199

Added line #L199 was not covered by tests

@property
def root_dir(self):
Expand Down Expand Up @@ -402,15 +403,16 @@ def run(self):
f" using cache: {self.s3_storage.url.netloc} "
f"with bucket: {self.s3_storage.bucket_name}"
)
succeeded, failed = self.download_video_files(
max_concurrency=self.max_concurrency
)
if failed:
logger.error(f"{len(failed)} video(s) failed to download: {failed}")
if len(failed) >= len(succeeded):
self.download_video_files(max_concurrency=self.max_concurrency)

Check warning on line 406 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L406

Added line #L406 was not covered by tests
if self.failed:
logger.error(

Check warning on line 408 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L408

Added line #L408 was not covered by tests
f"{len(self.failed)} video(s) failed to download: {self.failed}"
)
if len(self.failed) >= len(self.succeeded):
logger.critical("More than half of videos failed. exiting")
raise OSError("Too much videos failed to download")

succeeded = list(self.succeeded)

Check warning on line 415 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L415

Added line #L415 was not covered by tests
logger.info("retrieve channel-info for all videos (author details)")
get_videos_authors_info(succeeded)

Expand Down Expand Up @@ -639,22 +641,16 @@ def download_video_files(self, max_concurrency):

self.yt_downloader = YoutubeDownloader(concurrency)

Check warning on line 642 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L642

Added line #L642 was not covered by tests

succeeded = []
failed = []
for video_id in self.videos_ids:
run_pending()
if self.download_video(video_id, options) and self.download_thumbnail(
video_id, options
):
try:

Check warning on line 644 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L644

Added line #L644 was not covered by tests
for video_id in self.videos_ids:
self.download_video(video_id, options)
self.download_thumbnail(video_id, options)
self.download_subtitles(video_id, options)
succeeded.append(video_id)
else:
failed.append(video_id)
self.videos_processed += 1

self.yt_downloader.shutdown()

return succeeded, failed
except Exception as exc:
logger.error(f"Error while downloading videos: {exc}")
raise

Check warning on line 651 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L646-L651

Added lines #L646 - L651 were not covered by tests
finally:
self.yt_downloader.shutdown()

Check warning on line 653 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L653

Added line #L653 was not covered by tests

def download_from_cache(self, key, video_path, encoder_version):
"""whether it successfully downloaded from cache"""
Expand Down Expand Up @@ -692,6 +688,13 @@ def upload_to_cache(self, key, video_path, encoder_version):
logger.info(f"uploaded {video_path} to cache at {key}")
return True

def handle_download_status(self, video_id, status):

Check warning on line 691 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L691

Added line #L691 was not covered by tests
if status == "failed":
self.succeeded.discard(video_id)
self.failed.add(video_id)

Check warning on line 694 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L693-L694

Added lines #L693 - L694 were not covered by tests
elif status == "succeeded" and video_id not in self.failed:
self.succeeded.add(video_id)

Check warning on line 696 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L696

Added line #L696 was not covered by tests

def download_video(self, video_id, options):
"""download the video from cache/youtube and return True if successful"""

Expand All @@ -711,6 +714,7 @@ def download_video(self, video_id, options):
self.add_file_to_zim(
zim_path, video_path, callback=(delete_callback, video_path)
)
self.handle_download_status(video_id, "succeeded")

Check warning on line 717 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L717

Added line #L717 was not covered by tests
return True

# skip downloading the thumbnails
Expand All @@ -726,6 +730,7 @@ def download_video(self, video_id, options):

def on_complete(future):
try:
run_pending()
future.result()
post_process_video(

Check warning on line 735 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L731-L735

Added lines #L731 - L735 were not covered by tests
video_location,
Expand All @@ -744,15 +749,18 @@ def on_complete(future):
) as exc:
logger.error(f"Video file for {video_id} could not be downloaded")
logger.debug(exc)
return False
self.handle_download_status(video_id, "failed")

Check warning on line 752 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L750-L752

Added lines #L750 - L752 were not covered by tests
else: # upload to cache only if everything went well
if self.s3_storage:
logger.debug(f"Uploading video file for {video_id} to cache ...")
self.upload_to_cache(s3_key, video_path, preset.VERSION)
return True
self.handle_download_status(video_id, "succeeded")

Check warning on line 757 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L755-L757

Added lines #L755 - L757 were not covered by tests
finally:
self.videos_processed += 1

Check warning on line 759 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L759

Added line #L759 was not covered by tests

if isinstance(future, Future):
future.add_done_callback(on_complete)

Check warning on line 762 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L762

Added line #L762 was not covered by tests

return future

Check warning on line 764 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L764

Added line #L764 was not covered by tests

def download_thumbnail(self, video_id, options):
Expand All @@ -774,6 +782,7 @@ def download_thumbnail(self, video_id, options):
self.add_file_to_zim(
zim_path, thumbnail_path, callback=(delete_callback, thumbnail_path)
)
self.handle_download_status(video_id, "succeeded")

Check warning on line 785 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L785

Added line #L785 was not covered by tests
return True

# skip downloading the video
Expand Down Expand Up @@ -801,15 +810,16 @@ def on_complete(future):
) as exc:
logger.error(f"Thumbnail for {video_id} could not be downloaded")
logger.debug(exc)
return False
self.handle_download_status(video_id, "failed")

Check warning on line 813 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L811-L813

Added lines #L811 - L813 were not covered by tests
else: # upload to cache only if everything went well
if self.s3_storage:
logger.debug(f"Uploading thumbnail for {video_id} to cache ...")
self.upload_to_cache(s3_key, thumbnail_path, preset.VERSION)
return True
self.handle_download_status(video_id, "succeeded")

Check warning on line 818 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L816-L818

Added lines #L816 - L818 were not covered by tests

if isinstance(future, Future):
future.add_done_callback(on_complete)

Check warning on line 821 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L821

Added line #L821 was not covered by tests

return future

Check warning on line 823 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L823

Added line #L823 was not covered by tests

def fetch_video_subtitles_list(self, video_id: str) -> Subtitles:
Expand Down Expand Up @@ -881,6 +891,7 @@ def on_complete(future):

if isinstance(future, Future):
future.add_done_callback(on_complete)

Check warning on line 893 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L893

Added line #L893 was not covered by tests

return future

Check warning on line 895 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L895

Added line #L895 was not covered by tests

def download_authors_branding(self):
Expand Down

0 comments on commit 69cb71c

Please sign in to comment.