From 252c9c3ce77e0997e6cf6be04cead45d849f46f4 Mon Sep 17 00:00:00 2001
From: K Sadov
Date: Sun, 17 Mar 2024 19:29:48 -0700
Subject: [PATCH] add wildcard search based on
 https://github.com/jsvine/waybackpack/pull/17

---
 waybackpack/asset.py | 10 +++++-----
 waybackpack/cli.py   |  2 +-
 waybackpack/pack.py  | 29 ++++++++++++++++-------------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/waybackpack/asset.py b/waybackpack/asset.py
index b014c29..95a798b 100644
--- a/waybackpack/asset.py
+++ b/waybackpack/asset.py
@@ -32,12 +32,12 @@
 
 
 class Asset(object):
-    def __init__(self, original_url, timestamp):
+    def __init__(self, snapshot):
         # Ensure timestamp is only numeric
-        if re.match(r"^[0-9]+\Z", timestamp) is None:
-            raise RuntimeError("invalid timestamp {!r}".format(timestamp))
-        self.timestamp = timestamp
-        self.original_url = original_url
+        self.timestamp = snapshot['timestamp']
+        self.original_url = snapshot['original']
+        if re.match(r"^[0-9]+\Z", self.timestamp) is None:
+            raise RuntimeError("invalid timestamp {!r}".format(self.timestamp))
 
     def get_archive_url(self, raw=False):
         flag = "id_" if raw else ""
diff --git a/waybackpack/cli.py b/waybackpack/cli.py
index 5405ede..ce41f87 100644
--- a/waybackpack/cli.py
+++ b/waybackpack/cli.py
@@ -173,7 +173,7 @@ def main():
 
     timestamps = [snap["timestamp"] for snap in snapshots]
 
-    pack = Pack(search_url, timestamps=timestamps, session=session)
+    pack = Pack(search_url, snapshots=snapshots)
 
     time.sleep(args.delay)
 
diff --git a/waybackpack/pack.py b/waybackpack/pack.py
index b7ca6a4..0d3841e 100644
--- a/waybackpack/pack.py
+++ b/waybackpack/pack.py
@@ -2,6 +2,7 @@
 import os
 import platform
 import time
+import urllib
 
 from .asset import Asset
 from .cdx import search
@@ -42,7 +43,7 @@ def replace_invalid_chars(path, fallback_char="_"):
 
 
 class Pack(object):
-    def __init__(self, url, timestamps=None, uniques_only=False, session=None):
+    def __init__(self, url, snapshots=None, uniques_only=False, session=None):
         self.url = url
 
         prefix = "http://" if urlparse(url).scheme == "" else ""
@@ -51,15 +52,12 @@ def __init__(self, url, timestamps=None, uniques_only=False, session=None):
 
         self.session = session or Session()
 
-        if timestamps is None:
-            self.timestamps = [
-                snap["timestamp"]
-                for snap in search(url, uniques_only=uniques_only, session=self.session)
-            ]
-        else:
-            self.timestamps = timestamps
-
-        self.assets = [Asset(self.url, ts) for ts in self.timestamps]
+        self.snapshots = snapshots or search(
+            url,
+            uniques_only=uniques_only,
+            session=self.session
+        )
+        self.assets = [Asset(snapshot) for snapshot in self.snapshots]
 
     def download_to(
         self,
@@ -83,9 +81,14 @@ def download_to(
                 logger.info("Sleeping {0} seconds".format(delay))
                 time.sleep(delay)
 
-            path_head, path_tail = os.path.split(self.parsed_url.path)
-            if path_tail == "":
-                path_tail = "index.html"
+            path = urllib.parse.urlparse(asset.original_url).path[1:]
+
+            if path:
+                path_head, path_tail = path.rsplit('/', 1)
+                if not path_tail:
+                    path_tail = 'index.html'
+            else:
+                path_head, path_tail = '', 'index.html'
 
             filedir = os.path.join(
                 directory,