From c722442ca9ca709c195c5ae771cc23658d5c5d4a Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Fri, 20 May 2016 11:51:43 -0500 Subject: [PATCH 1/4] allow wildcard url patterns --- waybackpack/asset.py | 6 +++--- waybackpack/cdx.py | 1 + waybackpack/cli.py | 4 +--- waybackpack/pack.py | 20 ++++++++++---------- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/waybackpack/asset.py b/waybackpack/asset.py index 2ec4bde..ad4ac9c 100644 --- a/waybackpack/asset.py +++ b/waybackpack/asset.py @@ -23,9 +23,9 @@ ] class Asset(object): - def __init__(self, original_url, timestamp): - self.timestamp = timestamp - self.original_url = original_url + def __init__(self, snapshot): + self.timestamp = snapshot['timestamp'] + self.original_url = snapshot['original'] def get_archive_url(self, raw=False): flag = "id_" if raw else "" diff --git a/waybackpack/cdx.py b/waybackpack/cdx.py index d57086b..b4c0985 100644 --- a/waybackpack/cdx.py +++ b/waybackpack/cdx.py @@ -17,6 +17,7 @@ def search(url, "showDupeCount": "true", "output": "json", "collapse": collapse + }).json() if len(cdx) < 2: return [] fields = cdx[0] diff --git a/waybackpack/cli.py b/waybackpack/cli.py index bbcc439..40098d7 100644 --- a/waybackpack/cli.py +++ b/waybackpack/cli.py @@ -73,11 +73,9 @@ def main(): collapse=args.collapse ) - timestamps = [ snap["timestamp"] for snap in snapshots ] - pack = Pack( args.url, - timestamps=timestamps, + snapshots=snapshots, session=session ) diff --git a/waybackpack/pack.py b/waybackpack/pack.py index 9cecd47..f5b5a6e 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -3,6 +3,7 @@ from .asset import Asset from .cdx import search import hashlib +import urllib import sys, os import logging logger = logging.getLogger(__name__) @@ -15,7 +16,7 @@ class Pack(object): def __init__(self, url, - timestamps=None, + snapshots=None, uniques_only=False, session=None): @@ -26,30 +27,29 @@ def __init__(self, self.session = session or Session() - self.timestamps = 
timestamps or [ snap["timestamp"] for snap in search( + self.snapshots = snapshots or search( url, uniques_only=uniques_only, session=self.session - ) ] - self.assets = [ Asset(self.url, ts) for ts in self.timestamps ] + ) + self.assets = [ Asset(snapshot) for snapshot in self.snapshots ] def download_to(self, directory, raw=False, root=DEFAULT_ROOT): for asset in self.assets: - path_head, path_tail = os.path.split(self.parsed_url.path) - if path_tail == "": - path_tail = "index.html" + path = urllib.parse.urlparse(asset.original_url).path + _, path_head, path_tail = path.rsplit('/', 2) filedir = os.path.join( directory, - asset.timestamp, self.parsed_url.netloc, path_head ) - filepath = os.path.join(filedir, path_tail) + filepath = os.path.join(filedir, + ','.join((path_tail, asset.timestamp))) logger.info( "Fetching {0} @ {1}".format( @@ -62,7 +62,7 @@ def download_to(self, directory, raw=raw, root=root ) - + try: os.makedirs(filedir) except OSError: From ced617ad9f71c113c7ce117743ff1870720a177e Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Mon, 23 May 2016 10:45:39 -0500 Subject: [PATCH 2/4] remove whitespace differences --- waybackpack/cdx.py | 1 - waybackpack/pack.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/waybackpack/cdx.py b/waybackpack/cdx.py index b4c0985..d57086b 100644 --- a/waybackpack/cdx.py +++ b/waybackpack/cdx.py @@ -17,7 +17,6 @@ def search(url, "showDupeCount": "true", "output": "json", "collapse": collapse - }).json() if len(cdx) < 2: return [] fields = cdx[0] diff --git a/waybackpack/pack.py b/waybackpack/pack.py index f5b5a6e..5973679 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -62,7 +62,7 @@ def download_to(self, directory, raw=raw, root=root ) - + try: os.makedirs(filedir) except OSError: From 06b66862d0b33869e8fbe1a9fb04ffcdb554d0b9 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Sun, 12 Jun 2016 10:39:29 -0500 Subject: [PATCH 3/4] fix tests, restore directory structure of packing --- 
tests/test-dol.py | 3 +-- tests/test-download.py | 3 +-- tests/test-redirect.py | 8 ++++---- waybackpack/pack.py | 12 ++++++++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test-dol.py b/tests/test-dol.py index 494b0ea..5b4e331 100644 --- a/tests/test-dol.py +++ b/tests/test-dol.py @@ -7,8 +7,7 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" snapshots = waybackpack.search(url) - timestamps = [ snap["timestamp"] for snap in snapshots ] - first = waybackpack.Asset(url, timestamps[0]) + first = waybackpack.Asset(snapshots[0]) content = first.fetch() assert(b"Regulatory Information" in content) assert(len(content) > 0) diff --git a/tests/test-download.py b/tests/test-download.py index 3e4274c..c78e7d6 100644 --- a/tests/test-download.py +++ b/tests/test-download.py @@ -9,8 +9,7 @@ class Test(unittest.TestCase): def test_basic(self): url = "dol.gov" snapshots = waybackpack.search(url, to_date=1996) - timestamps = [ snap["timestamp"] for snap in snapshots ] - pack = waybackpack.Pack(url, timestamps) + pack = waybackpack.Pack(url, snapshots=snapshots) dirpath = tempfile.mkdtemp() pack.download_to(dirpath) shutil.rmtree(dirpath) diff --git a/tests/test-redirect.py b/tests/test-redirect.py index 35d289e..1044d98 100644 --- a/tests/test-redirect.py +++ b/tests/test-redirect.py @@ -3,18 +3,18 @@ import waybackpack import sys, os -URL = "https://berniesanders.com/" -TIMESTAMP = "20160106120201" +SNAPSHOT = {'timestamp' : "20160106120201", + 'original' : "https://berniesanders.com/"} class Test(unittest.TestCase): def test_no_redirect(self): - asset = waybackpack.Asset(URL, TIMESTAMP) + asset = waybackpack.Asset(SNAPSHOT) content = asset.fetch() assert(b"Impatient" in content) def test_yes_redirect(self): session = waybackpack.Session(follow_redirects=True) - asset = waybackpack.Asset(URL, TIMESTAMP) + asset = waybackpack.Asset(SNAPSHOT) content = asset.fetch(session=session) assert(b"Impatient" not in content) assert(b"Nobody 
who works 40 hours" in content) diff --git a/waybackpack/pack.py b/waybackpack/pack.py index 5973679..647bf18 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -39,17 +39,21 @@ def download_to(self, directory, root=DEFAULT_ROOT): for asset in self.assets: - path = urllib.parse.urlparse(asset.original_url).path - _, path_head, path_tail = path.rsplit('/', 2) + path = urllib.parse.urlparse(asset.original_url).path[1:] + + if path: + path_head, path_tail = path.rsplit('/', 1) + else: + path_head, path_tail = '', 'index.html' filedir = os.path.join( directory, + asset.timestamp, self.parsed_url.netloc, path_head ) - filepath = os.path.join(filedir, - ','.join((path_tail, asset.timestamp))) + filepath = os.path.join(filedir, path_tail) logger.info( "Fetching {0} @ {1}".format( From 85e65e77cae6bd260f95b3e535c4c905d29a78a1 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Sun, 12 Jun 2016 11:23:54 -0500 Subject: [PATCH 4/4] handle missing filename --- waybackpack/pack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/waybackpack/pack.py b/waybackpack/pack.py index 647bf18..efddd42 100644 --- a/waybackpack/pack.py +++ b/waybackpack/pack.py @@ -43,6 +43,8 @@ def download_to(self, directory, if path: path_head, path_tail = path.rsplit('/', 1) + if not path_tail: + path_tail = 'index.html' else: path_head, path_tail = '', 'index.html'