Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions waybackpack/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
]

class Asset(object):
def __init__(self, original_url, timestamp):
self.timestamp = timestamp
self.original_url = original_url
def __init__(self, snapshot):
self.timestamp = snapshot['timestamp']
self.original_url = snapshot['original']

def get_archive_url(self, raw=False):
flag = "id_" if raw else ""
Expand Down
4 changes: 1 addition & 3 deletions waybackpack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,9 @@ def main():
collapse=args.collapse
)

timestamps = [ snap["timestamp"] for snap in snapshots ]

pack = Pack(
args.url,
timestamps=timestamps,
snapshots=snapshots,
session=session
)

Expand Down
18 changes: 9 additions & 9 deletions waybackpack/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .asset import Asset
from .cdx import search
import hashlib
import urllib
import sys, os
import logging
logger = logging.getLogger(__name__)
Expand All @@ -15,7 +16,7 @@
class Pack(object):
def __init__(self,
url,
timestamps=None,
snapshots=None,
uniques_only=False,
session=None):

Expand All @@ -26,30 +27,29 @@ def __init__(self,

self.session = session or Session()

self.timestamps = timestamps or [ snap["timestamp"] for snap in search(
self.snapshots = snapshots or search(
url,
uniques_only=uniques_only,
session=self.session
) ]
self.assets = [ Asset(self.url, ts) for ts in self.timestamps ]
)
self.assets = [ Asset(snapshot) for snapshot in self.snapshots ]

def download_to(self, directory,
raw=False,
root=DEFAULT_ROOT):

for asset in self.assets:
path_head, path_tail = os.path.split(self.parsed_url.path)
if path_tail == "":
path_tail = "index.html"
path = urllib.parse.urlparse(asset.original_url).path
_, path_head, path_tail = path.rsplit('/', 2)

filedir = os.path.join(
directory,
asset.timestamp,
self.parsed_url.netloc,
path_head
)

filepath = os.path.join(filedir, path_tail)
filepath = os.path.join(filedir,
','.join((path_tail, asset.timestamp)))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of these changes to the path are because I found it more convenient, for my own purposes, to have different versions of the same resource name in the same directory. I can make a version that keeps your current, separate-directory behavior.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, interesting. I'd prefer to keep the current, separate-directory behavior. But I'm curious: What makes the other way more convenient for your purposes?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P.S. Thanks! This seems to be a really elegant solution.


logger.info(
"Fetching {0} @ {1}".format(
Expand Down