From 25eebff8fb052284cbc87ee284a9eb3e849bbbc9 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 21 Sep 2020 00:36:24 -0400 Subject: [PATCH] tumblr_backup: --continue and related new behavior Fixes #115 Included revisions: - Allow setting options in BACKUP_CHANGING_OPTIONS to match the backup -- redundant, but sometimes more convenient. --- tumblr_backup.py | 317 ++++++++++++++++++++++++++++++++++------------- util.py | 17 +++ wget.py | 21 +--- 3 files changed, 249 insertions(+), 106 deletions(-) diff --git a/tumblr_backup.py b/tumblr_backup.py index 531f9b1..23e238b 100755 --- a/tumblr_backup.py +++ b/tumblr_backup.py @@ -4,6 +4,7 @@ from __future__ import absolute_import, division, print_function, with_statement # standard Python library imports +import contextlib import errno import hashlib import imghdr @@ -19,14 +20,15 @@ import time from collections import defaultdict from datetime import datetime, timedelta +from tempfile import NamedTemporaryFile from glob import glob from os.path import join, split, splitext from posixpath import basename as urlbasename, join as urlpathjoin, splitext as urlsplitext from xml.sax.saxutils import escape from util import (AsyncCallable, ConnectionFile, LockedQueue, MultiCondition, PY3, disable_unraisable_hook, - is_dns_working, make_requests_session, no_internet, nullcontext, path_is_on_vfat, to_bytes, - to_unicode) + is_dns_working, make_requests_session, no_internet, nullcontext, opendir, path_is_on_vfat, to_bytes, + to_unicode, try_unlink) from wget import HTTPError, HTTP_RETRY, HTTP_TIMEOUT, WGError, WgetRetrieveWrapper, setup_wget, urlopen try: @@ -154,6 +156,11 @@ def test_jpg(h, f): FILE_ENCODING = 'utf-8' TIME_ENCODING = locale.getlocale(locale.LC_TIME)[1] or FILE_ENCODING +MUST_MATCH_OPTIONS = ('dirs', 'likes', 'blosxom', 'hostdirs', 'image_names') +BACKUP_CHANGING_OPTIONS = ( + 'save_images', 'save_video', 'save_video_tumblr', 'save_audio', 'save_notes', 'copy_notes', 'notes_limit', 'json', + 'count', 'skip', 'period', 'request', 'filter', 'no_reblog', 'exif', 'prev_archives') + main_thread_lock = threading.RLock() multicond = MultiCondition(main_thread_lock) disable_note_scraper = set() # type: Set[str] @@ -161,6 +168,15 @@ def test_jpg(h, f): prev_resps = None # type: Optional[Tuple[str, ...]] +def load_bs4(reason): + sys.modules['soupsieve'] = () # type: ignore[assignment] + try: + from bs4 import BeautifulSoup + except ImportError: + raise RuntimeError("Cannot {} without module 'bs4'".format(reason)) + return BeautifulSoup + + class Logger(object): def __init__(self): self.lock = threading.Lock() @@ -222,10 +238,32 @@ def open_file(open_fn, parts): return open_fn(path_to(*parts)) +@contextlib.contextmanager def open_text(*parts): - return open_file( - lambda f: io.open(f, 'w', encoding=FILE_ENCODING, errors='xmlcharrefreplace'), parts - ) + dest_path = open_file(lambda f: f, parts) + dest_dirname, dest_basename = split(dest_path) + + with NamedTemporaryFile('w', prefix='.{}.'.format(dest_basename), dir=dest_dirname, delete=False) as partf: + # Yield the file for writing + with io.open(partf.fileno(), 'w', encoding=FILE_ENCODING, errors='xmlcharrefreplace', closefd=False) as f: + yield f + + # NamedTemporaryFile is created 0600, set mode to the usual 0644 + os.fchmod(partf.fileno(), 0o644) + + # Flush buffers and sync the inode + partf.flush() + os.fsync(partf) # type: ignore + + pfname = partf.name + + # Move to final destination + if PY3: + os.replace(pfname, dest_path) + else: + if os.name == 'nt': + try_unlink(dest_path) # Avoid 
potential FileExistsError + os.rename(pfname, dest_path) def strftime(fmt, t=None): @@ -246,24 +284,25 @@ def get_api_url(account): ) -def set_period(): +def set_period(period): """Prepare the period start and end timestamps""" i = 0 - tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1] - if len(options.period) >= 6: + tm = [int(period[:4]), 1, 1, 0, 0, 0, 0, 0, -1] + if len(period) >= 6: i = 1 - tm[1] = int(options.period[4:6]) - if len(options.period) == 8: + tm[1] = int(period[4:6]) + if len(period) == 8: i = 2 - tm[2] = int(options.period[6:8]) + tm[2] = int(period[6:8]) def mktime(tml): tmt = tuple(tml) # type: Any return time.mktime(tmt) - options.p_start = int(mktime(tm)) + p_start = int(mktime(tm)) tm[i] += 1 - options.p_stop = int(mktime(tm)) + p_stop = int(mktime(tm)) + return p_start, p_stop class ApiParser(object): @@ -546,6 +585,18 @@ def dup(fd): return fd return True # Either we copied it or we didn't need to +def check_optional_modules(): + if options.exif: + if pyexiv2 is None: + raise RuntimeError("--exif: module 'pyexif2' is not installed") + if not hasattr(pyexiv2, 'ImageMetadata'): + raise RuntimeError("--exif: module 'pyexiv2' is missing features, perhaps you need 'py3exiv2'?") + if options.filter is not None and pyjq is None: + raise RuntimeError("--filter: module 'pyjq' is not installed") + if options.prev_archives and scandir is None: + raise RuntimeError("--prev-archives: Python is less than 3.5 and module 'scandir' is not installed") + + class Index(object): def __init__(self, blog, body_class='index'): self.blog = blog @@ -631,7 +682,8 @@ def next_month(inc): archive.append(self.blog.footer(base, pp, np)) - arch.write('\n'.join(archive)) + with arch as archf: + archf.write('\n'.join(archive)) assert first_file is not None return first_file @@ -739,7 +791,8 @@ def footer(base, previous_page, next_page): return f @staticmethod - def get_post_timestamps(posts): + def get_post_timestamps(posts, reason): + BeautifulSoup = load_bs4(reason) for post in posts: with io.open(post, encoding=FILE_ENCODING) as pf: soup = BeautifulSoup(pf, 'lxml') @@ -748,6 +801,90 @@ def get_post_timestamps(posts): # No datetime.fromisoformat or datetime.timestamp on Python 2 yield (datetime.strptime(postdate, '%Y-%m-%dT%H:%M:%SZ') - datetime(1970, 1, 1)) // timedelta(seconds=1) + @classmethod + def process_existing_backup(cls, account, prev_archive): + complete_backup = os.path.exists(path_to('.complete')) + if options.resume and complete_backup: + raise RuntimeError('{}: Cannot continue complete backup'.format(account)) + try: + with io.open(path_to('.first_run_options'), encoding=FILE_ENCODING) as f: + first_run_options = json.load(f) + except EnvironmentError as e: + if getattr(e, 'errno', None) != errno.ENOENT: + raise + first_run_options = None + + class Options(object): + def __init__(self, fro): self.fro = fro + def differs(self, opt): return opt not in self.fro or orig_options[opt] != self.fro[opt] + def first(self, opts): return {opt: self.fro.get(opt, '') for opt in opts} + @staticmethod + def this(opts): return {opt: orig_options[opt] for opt in opts} + + # These options must always match + if first_run_options is not None: + opts = Options(first_run_options) + mustmatchdiff = tuple(filter(opts.differs, MUST_MATCH_OPTIONS)) + if mustmatchdiff: + raise RuntimeError('{}: The script was given {} but the existing backup was made with {}'.format( + account, opts.this(mustmatchdiff), opts.first(mustmatchdiff))) + + backdiff = tuple(filter(opts.differs, BACKUP_CHANGING_OPTIONS)) + 
if options.resume: + backdiff_nondef = tuple(opt for opt in backdiff if orig_options[opt] != parser.get_default(opt)) + if backdiff_nondef: + raise RuntimeError('{}: The script was given {} but the existing backup was made with {}'.format( + account, opts.this(backdiff_nondef), opts.first(backdiff_nondef))) + elif complete_backup: + pass # Complete archives may be added to with different options + elif not backdiff: + raise RuntimeError('{}: Found incomplete archive, try --continue'.format(account)) + elif not options.ignore_resume: + raise RuntimeError('{}: Refusing to make a different backup (with {} instead of {}) over an incomplete ' + 'archive. Delete the old backup to start fresh, or skip this check with ' + '--continue=ignore.'.format(account, opts.this(backdiff), opts.first(backdiff))) + + if prev_archive is not None: + try: + with io.open(join(prev_archive, '.first_run_options'), encoding=FILE_ENCODING) as f: + pa_first_run_options = json.load(f) + except EnvironmentError as e: + if getattr(e, 'errno', None) != errno.ENOENT: + raise + pa_first_run_options = None + + # These options must always match + if pa_first_run_options is not None: + pa_opts = Options(pa_first_run_options) + mustmatchdiff = tuple(filter(pa_opts.differs, MUST_MATCH_OPTIONS)) + if mustmatchdiff: + raise RuntimeError('{}: The script was given {} but the previous archive was made with {}'.format( + account, pa_opts.this(mustmatchdiff), pa_opts.first(mustmatchdiff))) + + oldest_tstamp = None + if not complete_backup: + # Read every post to find the oldest timestamp we've saved. + filter_ = join('*', dir_index) if options.dirs else '*' + post_ext + post_glob = glob(path_to(post_dir, filter_)) + if options.resume and post_glob: + log('Found incomplete backup. Finding oldest post (may take a while)\n', account=True) + oldest_tstamp = min(cls.get_post_timestamps(post_glob, 'continue incomplete backup')) + + if first_run_options is not None and options.resume: + # Load saved options + for opt in BACKUP_CHANGING_OPTIONS: + setattr(options, opt, first_run_options[opt]) + else: + # Load original options + for opt in BACKUP_CHANGING_OPTIONS: + setattr(options, opt, orig_options[opt]) + if first_run_options is None and not (complete_backup or post_glob): + # Presumably this is the initial backup of this blog + with open_text('.first_run_options') as f: + f.write(to_unicode(json.dumps(orig_options))) + + return oldest_tstamp + def backup(self, account, prev_archive): """makes single files and an index for every post on a public Tumblr blog account""" @@ -775,6 +912,9 @@ def backup(self, account, prev_archive): self.post_count = 0 self.filter_skipped = 0 + oldest_tstamp = self.process_existing_backup(account, prev_archive) + check_optional_modules() + # get the highest post id already saved ident_max = None if options.incremental: @@ -784,10 +924,8 @@ def backup(self, account, prev_archive): pass # No posts to read elif options.likes: # Read every post to find the newest timestamp we've saved. 
-                if BeautifulSoup is None:
-                    raise RuntimeError("Incremental likes backup: module 'bs4' is not installed")
                 log('Finding newest liked post (may take a while)\n', account=True)
-                ident_max = max(self.get_post_timestamps(post_glob))
+                ident_max = max(self.get_post_timestamps(post_glob, 'backup likes incrementally'))
             else:
                 ident_max = max(long(splitext(split(f)[1])[0]) for f in post_glob)
             if ident_max is not None:
@@ -825,6 +963,9 @@ def backup(self, account, prev_archive):
         # use the meta information to create a HTML header
         TumblrPost.post_header = self.header(body_class='post')
 
+        jq_filter = None if options.filter is None else pyjq.compile(options.filter)  # pytype: disable=attribute-error
+        request_sets = None if options.request is None else {typ: set(tags) for typ, tags in options.request.items()}
+
         # start the thread pool
         backup_pool = ThreadPool()
 
@@ -842,16 +983,16 @@ def _backup(posts, post_respfiles):
                 if options.count and self.post_count >= options.count:
                     return False
                 if options.period:
-                    if post.date >= options.p_stop:
+                    if post.date >= options.period[1]:
                         raise RuntimeError('Found post with date ({}) older than before param ({})'.format(
-                            post.date, options.p_stop))
-                    if post.date < options.p_start:
+                            post.date, options.period[1]))
+                    if post.date < options.period[0]:
                         return False
-                if options.request:
-                    if post.typ not in options.request:
+                if request_sets:
+                    if post.typ not in request_sets:
                         continue
-                    tags = options.request[post.typ]
-                    if not (TAG_ANY in tags or tags & post.tags_lower):
+                    tags = request_sets[post.typ]
+                    if not (TAG_ANY in tags or tags & {t.lower() for t in post.tags}):
                         continue
                 if options.no_reblog:
                     if 'reblogged_from_name' in p or 'reblogged_root_name' in p:
@@ -863,7 +1004,7 @@ def _backup(posts, post_respfiles):
                     continue
                 if os.path.exists(open_file(lambda f: f, post.get_path())) and options.no_post_clobber:
                     continue  # Post exists and no-clobber enabled
-                if options.filter and not options.filter.first(p):
+                if jq_filter and not jq_filter.first(p):
                     self.filter_skipped += 1
                     continue
 
@@ -881,7 +1022,9 @@ def _backup(posts, post_respfiles):
 
         # Get the JSON entries from the API, which we can only do for MAX_POSTS posts at once.
         # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
i = options.skip - before = options.p_stop if options.period else None + before = options.period[1] if options.period else None + if before is not None and oldest_tstamp is not None: + before = min(before, oldest_tstamp) while True: # find the upper bound @@ -941,6 +1084,17 @@ def _backup(posts, post_respfiles): ix.build_index() ix.save_index() + if not os.path.exists(path_to('.complete')): + # Make .complete file + sf = opendir(save_folder, os.O_RDONLY) + try: + os.fdatasync(sf) + with io.open(open_file(lambda f: f, ('.complete',)), 'wb') as f: + os.fsync(f) # type: ignore + os.fdatasync(sf) + finally: + os.close(sf) + log.status(None) skipped_msg = (', {} did not match filter'.format(self.filter_skipped)) if self.filter_skipped else '' log( @@ -969,7 +1123,7 @@ def __init__(self, post, backup_account, respfile, prev_archive): self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z' self.tm = time.localtime(self.date) self.title = u'' - self.tags = post['tags'] + self.tags = post['tags'] # type: Text self.note_count = post.get('note_count') if self.note_count is None: self.note_count = post.get('notes', {}).get('count') @@ -979,9 +1133,6 @@ def __init__(self, post, backup_account, respfile, prev_archive): self.reblogged_root = post.get('reblogged_root_url') self.source_title = post.get('source_title', '') self.source_url = post.get('source_url', '') - self.tags_lower = None # type: Optional[Set[str]] - if options.request: - self.tags_lower = {t.lower() for t in self.tags} self.file_name = join(self.ident, dir_index) if options.dirs else self.ident + post_ext self.llink = self.ident if options.dirs else self.file_name self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir @@ -1132,6 +1283,11 @@ def get_youtube_url(self, youtube_url): } if options.cookiefile is not None: ydl_options['cookiefile'] = options.cookiefile + try: + import youtube_dl + from youtube_dl.utils import sanitize_filename + except ImportError: + raise RuntimeError("--save-video: module 'youtube_dl' is not installed") ydl = youtube_dl.YoutubeDL(ydl_options) ydl.add_default_info_extractors() try: @@ -1291,6 +1447,9 @@ def get_post(self): notes_html = u'' + if options.save_notes or options.copy_notes: + BeautifulSoup = load_bs4('save notes' if options.save_notes else 'copy notes') + if options.copy_notes: # Copy notes from prev_archive with io.open(join(self.prev_archive, post_dir, self.ident + post_ext)) as post_file: @@ -1300,6 +1459,7 @@ def get_post(self): notes_html = u''.join([n.prettify() for n in notes.find_all('li')]) if options.save_notes and self.backup_account not in disable_note_scraper and not notes_html.strip(): + import note_scraper # Scrape and save notes while True: ns_stdout_rd, ns_stdout_wr = multiprocessing.Pipe(duplex=False) @@ -1373,9 +1533,10 @@ def get_path(self): def save_post(self): """saves this post locally""" - with open_text(*self.get_path()) as f: + path = path_to(*self.get_path()) + with open_text(path) as f: f.write(self.get_post()) - os.utime(f.name, (self.date, self.date)) + os.utime(path, (self.date, self.date)) if options.json: with open_text(json_dir, self.ident + '.json') as f: f.write(self.get_json_content()) @@ -1548,10 +1709,6 @@ def handle_term_signal(signum, frame): import argparse class CSVCallback(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, set(values.split(','))) - - class CSVListCallback(argparse.Action): def __call__(self, parser, namespace, values, 
option_string=None):
             setattr(namespace, self.dest, list(values.split(',')))
 
@@ -1565,9 +1722,9 @@ def __call__(self, parser, namespace, values, option_string=None):
                     parser.error("{}: invalid post type '{}'".format(option_string, typ))
                 for typ in POST_TYPES if typ == TYPE_ANY else (typ,):
                     if parts:
-                        request[typ] = request.get(typ, set()).union(parts)
+                        request[typ] = request.get(typ, ()) + parts
                     else:
-                        request[typ] = {TAG_ANY}
+                        request[typ] = (TAG_ANY,)
             setattr(namespace, self.dest, request)
 
     class TagsCallback(RequestCallback):
@@ -1576,6 +1733,18 @@ def __call__(self, parser, namespace, values, option_string=None):
                 parser, namespace, TYPE_ANY + ':' + values.replace(',', ':'), option_string,
             )
 
+    class PeriodCallback(argparse.Action):
+        def __call__(self, parser, namespace, values, option_string=None):
+            try:
+                pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[values]
+            except KeyError:
+                period = values.replace('-', '')
+                if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', period):
+                    parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]")
+            else:
+                period = time.strftime(pformat)
+            setattr(namespace, self.dest, set_period(period))
+
     parser = argparse.ArgumentParser(usage='%(prog)s [options] blog-name ...',
                                      description='Makes a local backup of Tumblr blogs.')
     parser.add_argument('-O', '--outdir', help='set the output directory (default: blog-name)')
@@ -1603,7 +1772,8 @@ def __call__(self, parser, namespace, values, option_string=None):
                              ' (useful for cron jobs)')
     parser.add_argument('-n', '--count', type=int, help='save only COUNT posts')
     parser.add_argument('-s', '--skip', type=int, default=0, help='skip the first SKIP posts')
-    parser.add_argument('-p', '--period', help="limit the backup to PERIOD ('y', 'm', 'd' or YYYY[MM[DD]])")
+    parser.add_argument('-p', '--period', action=PeriodCallback,
+                        help="limit the backup to PERIOD ('y', 'm', 'd' or YYYY[MM[DD]])")
     parser.add_argument('-N', '--posts-per-page', type=int, default=50, metavar='COUNT',
                         help='set the number of posts per monthly page, 0 for unlimited')
     parser.add_argument('-Q', '--request', action=RequestCallback,
@@ -1620,11 +1790,11 @@ def __call__(self, parser, namespace, values, option_string=None):
     parser.add_argument('--no-reblog', action='store_true', help="don't save reblogged posts")
     parser.add_argument('-I', '--image-names', choices=('o', 'i', 'bi'), default='o', metavar='FMT',
                         help="image filename format ('o'=original, 'i'=<post id>, 'bi'=<blog name>_<post id>)")
-    parser.add_argument('-e', '--exif', action=CSVCallback, default=set(), metavar='KW',
+    parser.add_argument('-e', '--exif', action=CSVCallback, default=[], metavar='KW',
                         help='add EXIF keyword tags to each picture'
                              " (comma-separated values; '-' to remove all tags, '' to add no extra tags)")
     parser.add_argument('-S', '--no-ssl-verify', action='store_true', help='ignore SSL verification errors')
-    parser.add_argument('--prev-archives', action=CSVListCallback, default=[], metavar='DIRS',
+    parser.add_argument('--prev-archives', action=CSVCallback, default=[], metavar='DIRS',
                         help='comma-separated list of directories (one per blog) containing previous blog archives')
     parser.add_argument('-M', '--timestamping', action='store_true',
                         help="don't re-download files if the remote timestamp and size match the local file")
@@ -1638,27 +1808,21 @@ def __call__(self, parser, namespace, values, option_string=None):
     parser.add_argument('--user-agent', help='User agent string to use with HTTP requests')
     parser.add_argument('--no-post-clobber', action='store_true', help='Do not re-download existing posts')
parser.add_argument('--threads', type=int, default=20, help='number of threads to use for post retrieval') + parser.add_argument('--continue', action='store_true', dest='resume', help='Continue an incomplete first backup') + parser.add_argument('--continue=ignore', action='store_true', dest='ignore_resume', + help='Force backup over an incomplete archive with different options') parser.add_argument('blogs', nargs='*') options = parser.parse_args() + blogs = options.blogs or DEFAULT_BLOGS + del options.blogs + orig_options = vars(options).copy() + if not blogs: + parser.error('Missing blog-name') + if sum(1 for arg in ('resume', 'ignore_resume', 'incremental', 'auto') if getattr(options, arg)) > 1: + parser.error('Only one of --continue, --continue=ignore, --incremental, and --auto may be given') if options.auto is not None and options.auto != time.localtime().tm_hour: options.incremental = True - if options.period: - try: - pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period] - options.period = time.strftime(pformat) - except KeyError: - options.period = options.period.replace('-', '') - if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', options.period): - parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]") - set_period() - - wget_retrieve = WgetRetrieveWrapper(options, log) - setup_wget(not options.no_ssl_verify, options.user_agent) - - blogs = options.blogs or DEFAULT_BLOGS - if not blogs: - parser.error("Missing blog-name") if options.count is not None and options.count < 0: parser.error("--count: count must not be negative") if options.skip < 0: @@ -1669,44 +1833,16 @@ def __call__(self, parser, namespace, values, option_string=None): parser.error("-O can only be used for a single blog-name") if options.dirs and options.tag_index: parser.error("-D cannot be used with --tag-index") - if options.exif: - if pyexiv2 is None: - parser.error("--exif: module 'pyexif2' is not installed") - if not hasattr(pyexiv2, 'ImageMetadata'): - parser.error("--exif: module 'pyexiv2' is missing features, perhaps you need 'py3exiv2'?") - if options.save_video: - try: - import youtube_dl - from youtube_dl.utils import sanitize_filename - except ImportError: - parser.error("--save-video: module 'youtube_dl' is not installed") - if options.save_notes or options.copy_notes: - sys.modules['soupsieve'] = () # type: ignore[assignment] - try: - from bs4 import BeautifulSoup - except ImportError: - parser.error("--{}: module 'bs4' is not installed".format( - 'save-notes' if options.save_notes else 'copy-notes' - )) if options.cookiefile is not None and not os.access(options.cookiefile, os.R_OK): parser.error('--cookiefile: file cannot be read') - if options.save_notes: - import note_scraper - if options.copy_notes: - if not options.prev_archives: - parser.error('--copy-notes requires --prev-archives') + if options.copy_notes and not options.prev_archives: + parser.error('--copy-notes requires --prev-archives') if options.notes_limit is not None: if not options.save_notes: parser.error('--notes-limit requires --save-notes') if options.notes_limit < 1: parser.error('--notes-limit: Value must be at least 1') - if options.filter is not None: - if pyjq is None: - parser.error("--filter: module 'pyjq' is not installed") - options.filter = pyjq.compile(options.filter) if options.prev_archives: - if scandir is None: - parser.error("--prev-archives: Python is less than 3.5 and module 'scandir' is not installed") if len(options.prev_archives) != len(blogs): parser.error('--prev-archives: expected {} directories, 
got {}'.format( len(blogs), len(options.prev_archives), @@ -1720,6 +1856,11 @@ def __call__(self, parser, namespace, values, option_string=None): if options.threads < 1: parser.error('--threads: must use at least one thread') + check_optional_modules() + + wget_retrieve = WgetRetrieveWrapper(options, log) + setup_wget(not options.no_ssl_verify, options.user_agent) + ApiParser.setup() global backup_account diff --git a/util.py b/util.py index f5f5dc6..4ff58d1 100644 --- a/util.py +++ b/util.py @@ -4,6 +4,7 @@ import collections import contextlib +import errno import io import os import socket @@ -501,3 +502,19 @@ def quit(self): except queue.Full: pass self.thread.join() + + +def opendir(dir_, flags): + try: + flags |= os.O_DIRECTORY + except AttributeError: + dir_ += os.path.sep # Fallback, some systems don't support O_DIRECTORY + return os.open(dir_, flags) + + +def try_unlink(path): + try: + os.unlink(path) + except EnvironmentError as e: + if getattr(e, 'errno', None) != errno.ENOENT: + raise diff --git a/wget.py b/wget.py index 2292c19..4a9e3b5 100644 --- a/wget.py +++ b/wget.py @@ -13,7 +13,8 @@ from tempfile import NamedTemporaryFile from wsgiref.handlers import format_date_time -from util import PY3, URLLIB3_FROM_PIP, get_supported_encodings, is_dns_working, no_internet, setup_urllib3_ssl +from util import (PY3, URLLIB3_FROM_PIP, + get_supported_encodings, is_dns_working, no_internet, opendir, setup_urllib3_ssl, try_unlink) try: from urllib.parse import urlsplit, urljoin @@ -569,13 +570,7 @@ def _retrieve_loop(hstat, url, dest_file, adjust_basename, options, log): got_head = False # used for time-stamping dest_dirname, dest_basename = os.path.split(dest_file) - flags = os.O_RDONLY - try: - flags |= os.O_DIRECTORY - except AttributeError: - dest_dirname += os.path.sep # Fallback, some systems don't support O_DIRECTORY - - hstat.dest_dir = os.open(dest_dirname, flags) + hstat.dest_dir = opendir(dest_dirname, os.O_RDONLY) hstat.set_part_file_supplier(functools.partial( lambda pfx, dir_: NamedTemporaryFile('wb', prefix=pfx, dir=dir_, delete=False), '.{}.'.format(dest_basename), dest_dirname, @@ -728,19 +723,9 @@ def _retrieve_loop(hstat, url, dest_file, adjust_basename, options, log): os.replace(os.path.basename(pfname), new_dest_basename, src_dir_fd=hstat.dest_dir, dst_dir_fd=hstat.dest_dir) - # Sync the directory and return - os.fdatasync(hstat.dest_dir) return -def try_unlink(path): - try: - os.unlink(path) - except EnvironmentError as e: - if getattr(e, 'errno', None) != errno.ENOENT: - raise - - def setup_wget(ssl_verify, user_agent): if not ssl_verify: # Hide the InsecureRequestWarning from urllib3