Skip to content

Commit

Permalink
add wildcard search based on jsvine#17
Browse files: browse the repository at this point in the history
  • Loading branch information
ksadov committed Mar 18, 2024
1 parent ddfd93f commit 252c9c3
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 19 deletions.
10 changes: 5 additions & 5 deletions waybackpack/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@


class Asset(object):
def __init__(self, original_url, timestamp):
def __init__(self, snapshot):
# Ensure timestamp is only numeric
if re.match(r"^[0-9]+\Z", timestamp) is None:
raise RuntimeError("invalid timestamp {!r}".format(timestamp))
self.timestamp = timestamp
self.original_url = original_url
self.timestamp = snapshot['timestamp']
self.original_url = snapshot['original']
if re.match(r"^[0-9]+\Z", self.timestamp) is None:
raise RuntimeError("invalid timestamp {!r}".format(self.timestamp))

def get_archive_url(self, raw=False):
flag = "id_" if raw else ""
Expand Down
2 changes: 1 addition & 1 deletion waybackpack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def main():

timestamps = [snap["timestamp"] for snap in snapshots]

pack = Pack(search_url, timestamps=timestamps, session=session)
pack = Pack(search_url, snapshots=snapshots)

time.sleep(args.delay)

Expand Down
29 changes: 16 additions & 13 deletions waybackpack/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import platform
import time
import urllib

from .asset import Asset
from .cdx import search
Expand Down Expand Up @@ -42,7 +43,7 @@ def replace_invalid_chars(path, fallback_char="_"):


class Pack(object):
def __init__(self, url, timestamps=None, uniques_only=False, session=None):
def __init__(self, url, snapshots=None, uniques_only=False, session=None):

self.url = url
prefix = "http://" if urlparse(url).scheme == "" else ""
Expand All @@ -51,15 +52,12 @@ def __init__(self, url, timestamps=None, uniques_only=False, session=None):

self.session = session or Session()

if timestamps is None:
self.timestamps = [
snap["timestamp"]
for snap in search(url, uniques_only=uniques_only, session=self.session)
]
else:
self.timestamps = timestamps

self.assets = [Asset(self.url, ts) for ts in self.timestamps]
self.snapshots = snapshots or search(
url,
uniques_only=uniques_only,
session=self.session
)
self.assets = [Asset(snapshot) for snapshot in self.snapshots]

def download_to(
self,
Expand All @@ -83,9 +81,14 @@ def download_to(
logger.info("Sleeping {0} seconds".format(delay))
time.sleep(delay)

path_head, path_tail = os.path.split(self.parsed_url.path)
if path_tail == "":
path_tail = "index.html"
path = urllib.parse.urlparse(asset.original_url).path[1:]

if path:
path_head, path_tail = path.rsplit('/', 1)
if not path_tail:
path_tail = 'index.html'
else:
path_head, path_tail = '', 'index.html'

filedir = os.path.join(
directory,
Expand Down

0 comments on commit 252c9c3

Please sign in to comment.