diff --git a/tests/test-dol.py b/tests/test-dol.py index 494b0ea..5b4e331 100644 --- a/tests/test-dol.py +++ b/tests/test-dol.py @@ -7,8 +7,7 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" snapshots = waybackpack.search(url) - timestamps = [ snap["timestamp"] for snap in snapshots ] - first = waybackpack.Asset(url, timestamps[0]) + first = waybackpack.Asset(snapshots[0]) content = first.fetch() assert(b"Regulatory Information" in content) assert(len(content) > 0) diff --git a/tests/test-download.py b/tests/test-download.py index 3e4274c..c78e7d6 100644 --- a/tests/test-download.py +++ b/tests/test-download.py @@ -9,8 +9,7 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" snapshots = waybackpack.search(url, to_date=1996) - timestamps = [ snap["timestamp"] for snap in snapshots ] - pack = waybackpack.Pack(url, timestamps) + pack = waybackpack.Pack(url, snapshots=snapshots) dirpath = tempfile.mkdtemp() pack.download_to(dirpath) shutil.rmtree(dirpath) diff --git a/tests/test-redirect.py b/tests/test-redirect.py index 35d289e..1044d98 100644 --- a/tests/test-redirect.py +++ b/tests/test-redirect.py @@ -3,18 +3,18 @@ import waybackpack import sys, os -URL = "https://berniesanders.com/" -TIMESTAMP = "20160106120201" +SNAPSHOT = {'timestamp' : "20160106120201", + 'original' : "https://berniesanders.com/"} class Test(unittest.TestCase): def test_no_redirect(self): - asset = waybackpack.Asset(URL, TIMESTAMP) + asset = waybackpack.Asset(SNAPSHOT) content = asset.fetch() assert(b"Impatient" in content) def test_yes_redirect(self): session = waybackpack.Session(follow_redirects=True) - asset = waybackpack.Asset(URL, TIMESTAMP) + asset = waybackpack.Asset(SNAPSHOT) content = asset.fetch(session=session) assert(b"Impatient" not in content) assert(b"Nobody who works 40 hours" in content) diff --git a/waybackpack/asset.py b/waybackpack/asset.py index 2ec4bde..ad4ac9c 100644 --- a/waybackpack/asset.py +++ b/waybackpack/asset.py @@ -23,9 +23,9 @@ ] class Asset(object): - def __init__(self, original_url, timestamp): - self.timestamp = timestamp - self.original_url = original_url + def __init__(self, snapshot): + self.timestamp = snapshot['timestamp'] + self.original_url = snapshot['original'] def get_archive_url(self, raw=False): flag = "id_" if raw else "" diff --git a/waybackpack/cli.py b/waybackpack/cli.py index bbcc439..40098d7 100644 --- a/waybackpack/cli.py +++ b/waybackpack/cli.py @@ -73,11 +73,9 @@ def main(): collapse=args.collapse ) - timestamps = [ snap["timestamp"] for snap in snapshots ] - pack = Pack( args.url, - timestamps=timestamps, + snapshots=snapshots, session=session ) diff --git a/waybackpack/pack.py b/waybackpack/pack.py index 9cecd47..efddd42 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -3,6 +3,7 @@ from .asset import Asset from .cdx import search import hashlib +import urllib import sys, os import logging logger = logging.getLogger(__name__) @@ -15,7 +16,7 @@ class Pack(object): def __init__(self, url, - timestamps=None, + snapshots=None, uniques_only=False, session=None): @@ -26,21 +27,26 @@ def __init__(self, self.session = session or Session() - self.timestamps = timestamps or [ snap["timestamp"] for snap in search( + self.snapshots = snapshots or search( url, uniques_only=uniques_only, session=self.session - ) ] - self.assets = [ Asset(self.url, ts) for ts in self.timestamps ] + ) + self.assets = [ Asset(snapshot) for snapshot in self.snapshots ] def download_to(self, directory, raw=False, root=DEFAULT_ROOT): for asset in self.assets: - path_head, path_tail = os.path.split(self.parsed_url.path) - if path_tail == "": - path_tail = "index.html" + path = urllib.parse.urlparse(asset.original_url).path[1:] + + if path: + path_head, path_tail = path.rsplit('/', 1) + if not path_tail: + path_tail = 'index.html' + else: + path_head, path_tail = '', 'index.html' filedir = os.path.join( directory,