diff --git a/tumblr_backup.py b/tumblr_backup.py
index 5da6bf6..be2ea5e 100755
--- a/tumblr_backup.py
+++ b/tumblr_backup.py
@@ -83,6 +83,14 @@
 except ImportError:
     pyjq = None
 
+try:
+    from os import scandir  # type: ignore[attr-defined]
+except ImportError:
+    try:
+        from scandir import scandir  # type: ignore[no-redef]
+    except ImportError:
+        scandir = None  # type: ignore[assignment,no-redef]
+
 # These builtins have new names in Python 3
 try:
     long, xrange  # type: ignore[has-type]
@@ -164,6 +172,7 @@ def tb_urlopen(url):
 
 disable_note_scraper = set()  # type: Set[str]
 disablens_lock = threading.Lock()
+old_json_dir_entries = None  # type: Optional[Tuple[str, ...]]
 
 
 class Logger(object):
@@ -273,7 +282,37 @@ def mktime(tml):
     options.p_stop = mktime(tm)
 
 
+def initial_apiparse(base, old_json_dir):
+    global old_json_dir_entries
+    if old_json_dir is not None:
+        old_json_dir_entries = tuple(e.path for e in sorted(
+            (e for e in scandir(old_json_dir) if (e.name.endswith('.json') and e.is_file())),
+            key=lambda e: e.name))
+    else:
+        old_json_dir_entries = None
+
+    return apiparse(base, 1)
+
+
 def apiparse(base, count, start=0):
+    # type: (...) -> Optional[JSONDict]
+    if old_json_dir_entries is not None:
+        # Reconstruct the API response
+        posts = []
+        posts_respfiles = old_json_dir_entries[start:]
+        for prf in posts_respfiles[:count]:
+            with io.open(prf, encoding=FILE_ENCODING) as f:
+                try:
+                    post = json.load(f)
+                except ValueError as e:
+                    f.seek(0)
+                    log('{}: {}\n{!r}\n'.format(e.__class__.__name__, e, f.read()))
+                    return None
+            posts.append(post)
+        return {'posts': posts,
+                'post_respfiles': posts_respfiles,
+                'blog': dict(posts[0]['blog'] if posts else {}, posts=len(old_json_dir_entries))}
+
     params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
     if start > 0:
         params['offset'] = start
@@ -575,7 +614,7 @@ def footer(base, previous_page, next_page, suffix):
         f += '\n'
         return f
 
-    def backup(self, account):
+    def backup(self, account, old_json_dir):
         """makes single files and an index for every post on a public Tumblr blog account"""
 
         base = get_api_url(account)
@@ -616,8 +655,7 @@ def backup(self, account):
         else:
             log.status('Getting basic information\r')
 
-        # start by calling the API with just a single post
-        resp = apiparse(base, 1)
+        resp = initial_apiparse(base, old_json_dir)
         if not resp:
             self.errors = True
             return
@@ -642,9 +680,11 @@ def backup(self, account):
         backup_pool = ThreadPool()
 
         # returns whether any posts from this batch were saved
-        def _backup(posts):
-            for p in sorted(posts, key=lambda x: x['id'], reverse=True):
-                post = post_class(p, account)
+        def _backup(posts, post_respfiles):
+            sorted_posts = sorted(zip(posts, post_respfiles),
+                                  key=lambda x: x[0]['id'], reverse=True)
+            for p, prf in sorted_posts:
+                post = post_class(p, account, prf)
                 if ident_max and long(post.ident) <= ident_max:
                     return False
                 if options.count and self.post_count >= options.count:
@@ -695,8 +735,12 @@ def _backup(posts):
                 continue
 
             posts = resp[posts_key]
+            post_respfiles = resp.get('post_respfiles')
+            if post_respfiles is None:
+                post_respfiles = [None for _ in posts]
+
             # `_backup(posts)` can be empty even when `posts` is not if we don't backup reblogged posts
-            if not posts or not _backup(posts):
+            if not posts or not _backup(posts, post_respfiles):
                 log.status('Backing up posts found empty set of posts, finishing\r')
                 break
 
@@ -732,12 +776,12 @@ def _backup(posts):
 class TumblrPost(object):
     post_header = ''  # set by TumblrBackup.backup()
 
-    def __init__(self, post, backup_account):
-        # type: (JSONDict, str) -> None
+    def __init__(self, post, backup_account, respfile):
+        # type: (JSONDict, str, Text) -> None
         self.content = ''
         self.post = post
         self.backup_account = backup_account
-        self.json_content = to_unicode(json.dumps(post, sort_keys=True, indent=4, separators=(',', ': ')))
+        self.respfile = respfile
        self.creator = post['blog_name']
         self.ident = str(post['id'])
         self.url = post['post_url']
@@ -880,7 +924,7 @@ def make_player(src_):
 
         else:
             log(u"Unknown post type '{}' in post #{}\n".format(self.typ, self.ident))
-            append(escape(self.json_content), u'<pre>%s</pre>')
+            append(escape(self.get_json_content()), u'<pre>%s</pre>')
 
         self.content = '\n'.join(content)
 
@@ -1144,7 +1188,13 @@ def save_post(self):
         os.utime(f.name, (self.date, self.date))
         if options.json:
             with open_text(json_dir, self.ident + '.json') as f:
-                f.write(self.json_content)
+                f.write(self.get_json_content())
+
+    def get_json_content(self):
+        if self.respfile is not None:
+            with io.open(self.respfile, encoding=FILE_ENCODING) as f:
+                return f.read()
+        return to_unicode(json.dumps(self.post, sort_keys=True, indent=4, separators=(',', ': ')))
 
     @staticmethod
     def _parse_url_match(match, transform=None):
@@ -1272,6 +1322,10 @@ class CSVCallback(argparse.Action):
         def __call__(self, parser, namespace, values, option_string=None):
             setattr(namespace, self.dest, set(values.split(',')))
 
+    class CSVListCallback(argparse.Action):
+        def __call__(self, parser, namespace, values, option_string=None):
+            setattr(namespace, self.dest, list(values.split(',')))
+
     class RequestCallback(argparse.Action):
         def __call__(self, parser, namespace, values, option_string=None):
             request = getattr(namespace, self.dest) or {}
@@ -1340,6 +1394,8 @@ def __call__(self, parser, namespace, values, option_string=None):
                         help='add EXIF keyword tags to each picture'
                              " (comma-separated values; '-' to remove all tags, '' to add no extra tags)")
     parser.add_argument('-S', '--no-ssl-verify', action='store_true', help='ignore SSL verification errors')
+    parser.add_argument('--json-dirs', action=CSVListCallback, default=[], metavar='DIRS',
+                        help='comma-separated list of directories containing API responses (one per blog)')
     parser.add_argument('blogs', nargs='*')
 
     options = parser.parse_args()
@@ -1394,13 +1450,21 @@ def __call__(self, parser, namespace, values, option_string=None):
         if pyjq is None:
             parser.error("--filter: module 'pyjq' is not installed")
         options.filter = pyjq.compile(options.filter)
+    if options.json_dirs:
+        if not scandir:
+            parser.error("--json-dirs: Python is less than 3.5 and module 'scandir' is not installed")
+        if len(options.json_dirs) != len(blogs):
+            parser.error('--json-dirs: expected {} directories, got {}'.format(len(blogs), len(options.json_dirs)))
+        for d in options.json_dirs:
+            if not os.access(d, os.R_OK | os.X_OK):
+                parser.error("--json-dirs: directory '{}' cannot be read".format(d))
 
     global backup_account
     tb = TumblrBackup()
     try:
-        for account in blogs:
+        for i, account in enumerate(blogs):
             log.backup_account = account
-            tb.backup(account)
+            tb.backup(account, options.json_dirs[i] if options.json_dirs else None)
     except KeyboardInterrupt:
         sys.exit(EXIT_INTERRUPT)
 
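
Usage sketch, assuming per-post responses were saved by a previous run with the existing --json flag into the default json/ subdirectory of the blog's save folder (the blog name and paths are illustrative, not from the patch):

    # first run: back up the blog, keeping one .json API response per post
    python tumblr_backup.py --json example.tumblr.com

    # later run: rebuild the backup from the saved responses; initial_apiparse()
    # enumerates the *.json files and apiparse() serves them back in place of
    # live API pages, so post data is read from disk instead of the API
    python tumblr_backup.py --json-dirs=example.tumblr.com/json example.tumblr.com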