diff --git a/README.md b/README.md index 18f2c38..4ca9ccc 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,77 @@ python setup.py install ## Usage -(WIP) +### Credentials +* Pog does not manage cloud storage credentials -- it asks that you configure your environment with API keys before use. + * To validate s3 credentials: + * `awscli ls ` + * To validate b2 credentials: + * `b2 ls ` + +### Using a password or keyfiles +1. symmetric keyfile + * any file can be used as a keyfile. + * the contents of the keyfile will be hashed, and that hash will become the cryptographic key + * cryptographic randomness (ex: 1024 bytes from /dev/urandom) is recommended +2. asymmetric keyfiles + * the `pog-create-keypair` script will generate an "encrypt" and "decrypt" keypair. + * encrypt is used for creating archives + * decrypt is used for extracting them +3. Password entry + * if no keyfiles are specified, Pog supports password entry for creating or reading archives + +### Creating cloud archives and backups + +* Consider an S3 backup: + +``` +pog /home/user/my_file.txt --keyfile=/home/user/secret.keyfile --save-to=s3://my-bucket --store-absolute-paths +``` + +This does a few things: +1. `my_file.txt` is encrypted with `secret.keyfile`. If the file is sufficiently large, it is split into multiple pieces during encryption. +2. The encrypted contents ("blob") of `my_file.txt` are saved to the s3 bucket `my-bucket`, under the top-level `data/` subdirectory. +3. An encrypted "manifest" file is created, named according to the time the archive was created. This manifest file acts as an index from filenames (`/home/user/my_file.txt`) to one or more encrypted blobs. + a. The `--store-absolute-paths` flag tells the manifest to resolve ambiguous paths with the absolute path (`/home/user/my_file.txt`) instead of the relative path (`my_file.txt`). This can be useful to have when extracting archives or backups. +4. The manifest file is also saved to `my-bucket` in s3. 
+ +---- + +* Here is another example, with a series of directories: + +``` +pog /opt/games /opt/apps /opt/music --encryption-keyfile=secret.encrypt --save-to=s3://my-bucket,b2://my-b2-bucket +``` + +* This will recursively go through those 3 directories, gathering up all files and saving the encrypted blobs to both s3 and b2. + +The command line help (`pog -h`) shows other useful examples. + +### Creating local archives + +* It is also possible to use Pog to encrypt a single file. + +``` +pog /home/myfile.original > outputs.txt +``` + +* and to decrypt: + +``` +pog --decrypt $(cat outputs.txt) > myfile.copy +``` + +### Reading archives and backups + +For a given manifest file (`2020-01-23T12:34:56.012345.mfn`), we can download and extract the archive like so: + +``` +pog --decrypt s3://my-bucket/2020-01-23T12:34:56.012345.mfn --keyfile=/home/user/secret.keyfile +``` + +* The `--decrypt` flag should be specified for read+decrypt -- the default behavior is to write+encrypt. +* If a `--decryption-keyfile` is provided, `--decrypt` is assumed. +* If a local manifest file is provided, it is assumed that the data blobs are already downloaded into the working directory. ## Algorithm @@ -46,7 +116,7 @@ python setup.py install * this is what the `--keyfile` option does * `crypto_sealedbox` with an X25519 key pair * this is what `--decryption-keyfile` and `--encryption-keyfile` do - * an X25519 key pair can be generated with pog-create-keypair. + * an X25519 key pair can be generated with `pog-create-keypair`. * the file->blob relationship is stored in an encrypted manifest file (`.mfn`), which also stores file metadata -- e.g. last modified time. * the `.mfn` can be thought of as the dictionary for the archive. 
diff --git a/pog/cli.py b/pog/cli.py index 09a21ab..02092b3 100644 --- a/pog/cli.py +++ b/pog/cli.py @@ -10,6 +10,10 @@ def __init__(self, config=None, kwargs=None, pog_cmd=None): self.cmd = pog_cmd or ['python', '-u', '-m', 'pog.pog'] self.config = config or {} self.kwargs = kwargs or {} + self._abort = False + + def abort(self): + self._abort = True def set_keyfiles(self, *keyfiles): for k in ('keyfile', 'decryption-keyfile', 'encryption-keyfile'): @@ -32,6 +36,7 @@ def set_keyfiles(self, *keyfiles): return def run(self, *args, **kwargs): + self._abort = False restrict_config = kwargs.pop('restrict_config', ['encryption-keyfile']) full_args = list(self.cmd) + list(args) + self._flatten_config(restrict_config) kwargs = {**self.kwargs, **kwargs} @@ -47,6 +52,9 @@ def run(self, *args, **kwargs): if kwargs['stdout'] == PIPE: for line in proc.stdout: yield line.decode('utf-8').strip() + if self._abort: + proc.terminate() + break def run_command(self, *args, **kwargs): return list(self.run(*args, **kwargs)) diff --git a/pog/pog.py b/pog/pog.py index 00a9fdc..7d9bf6c 100644 --- a/pog/pog.py +++ b/pog/pog.py @@ -6,7 +6,7 @@ Usage: pog ... pog [--keyfile= | --encryption-keyfile=] [--save-to=] [--chunk-size=] - [--compresslevel=<1-22>] [--store-absolute-paths] ... + [--compresslevel=<1-22>] [--concurrency=<1-N>] [--store-absolute-paths] ... pog [--keyfile= | --decryption-keyfile=] [--decrypt | --dump-manifest] [--consume] ... pog [--keyfile= | --decryption-keyfile= | --encryption-keyfile=] [--dump-manifest-index] ... @@ -31,6 +31,7 @@ --version Show version. --chunk-size= When encrypting, split large files into size parts [default: 100MB]. --compresslevel=<1-22> Zstd compression level during encryption. [default: 3] + --concurrency=<1-N> How many threads to use for uploads. [default: 8] --consume Used with decrypt -- after decrypting a blob, delete it from disk to conserve space. --decrypt Decrypt instead. 
--decryption-keyfile= Use asymmetric decryption -- contains the (binary) private key. @@ -42,6 +43,8 @@ """ import sys from base64 import urlsafe_b64encode +from collections import ChainMap +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from getpass import getpass from hashlib import sha256 @@ -156,13 +159,14 @@ def get_secret(keyfile=None): class Encryptor(): - def __init__(self, secret, crypto_box=None, chunk_size=100000000, compresslevel=3, store_absolute_paths=False, - blob_store=None): + def __init__(self, secret, crypto_box=None, chunk_size=100000000, compresslevel=3, concurrency=8, + store_absolute_paths=False, blob_store=None): self.secret = sha256(secret).digest() self.index_box = nacl_SecretBox(secret) self.box = crypto_box or self.index_box self.chunk_size = chunk_size self.compresslevel = compresslevel + self.concurrency = concurrency self.store_absolute_paths = store_absolute_paths self.blob_store = blob_store or BlobStore() @@ -249,7 +253,7 @@ def save_manifest(self, mfn, filename=None): self.blob_store.save(filename, temp_path) return filename - def encrypt_single_file(self, filename): + def generate_encrypted_blobs(self, filename): cctx = zstd.ZstdCompressor(level=self.compresslevel) td = TemporaryDirectory(dir=_get_temp_dir()) with open(filename, 'rb') as f, cctx.stream_reader(f) as compressed_stream, td as tempdir: @@ -264,23 +268,34 @@ def encrypt_single_file(self, filename): self._write(f, data) yield temp_path - def encrypt(self, *inputs): - mfn = dict() - all_inputs = local_file_list(*inputs) - for count, filename in enumerate(all_inputs): - _print_progress(count+1, len(all_inputs)+1, filename) - outputs = [] - for temp_path in self.encrypt_single_file(filename): - blob_name = path.basename(temp_path) - self.blob_store.save_blob(blob_name, temp_path) - outputs.append(blob_name) - print(blob_name) - if outputs: - mfn[self.archived_filename(filename)] = { + def encrypt_and_store_file(self, args): + filename, 
current_count, total_count = args + _print_progress(current_count+1, total_count+1, filename) + outputs = [] + for temp_path in self.generate_encrypted_blobs(filename): + blob_name = path.basename(temp_path) + self.blob_store.save_blob(blob_name, temp_path) + outputs.append(blob_name) + print(blob_name) + return { + self.archived_filename(filename): + { 'blobs': outputs, 'atime': path.getatime(filename), 'mtime': path.getmtime(filename), } + } + + def encrypt(self, *inputs): + mfn = dict() + all_inputs = local_file_list(*inputs) + + exe = ThreadPoolExecutor(max_workers=self.concurrency) + args = [(filename, count, len(all_inputs)) for count, filename in enumerate(all_inputs)] + mfn = exe.map(self.encrypt_and_store_file, args) + mfn = dict(ChainMap(*mfn)) # smash the maps together + mfn = dict(sorted(mfn.items())) + mfn_filename = self.save_manifest(mfn) _print_progress(len(all_inputs)+1, len(all_inputs)+1, mfn_filename) @@ -380,9 +395,10 @@ def decrypt(self, *inputs): def main(): - args = docopt(__doc__, version='Pog 0.1.3') - chunk_size = parse_size(args.get('--chunk-size', '100MB')) - compresslevel = int(args.get('--compresslevel', '3')) + args = docopt(__doc__, version='Pog 0.1.4') + chunk_size = parse_size(args.get('--chunk-size')) + compresslevel = int(args.get('--compresslevel')) + concurrency = int(args.get('--concurrency')) store_absolute_paths = args.get('--store-absolute-paths') secret, crypto_box = get_asymmetric_encryption(args.get('--decryption-keyfile'), args.get('--encryption-keyfile')) @@ -406,7 +422,7 @@ def main(): d.decrypt(*args['']) else: bs = BlobStore(args.get('--save-to')) - en = Encryptor(secret, crypto_box, chunk_size, compresslevel, store_absolute_paths, bs) + en = Encryptor(secret, crypto_box, chunk_size, compresslevel, concurrency, store_absolute_paths, bs) en.encrypt(*args['']) diff --git a/setup.py b/setup.py index 15219ce..20780ca 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ name='pogcli', license='MIT', 
url='https://github.com/sz3/pog', - version='0.1.3', + version='0.1.4', entry_points={ 'console_scripts': [ diff --git a/tests/test_pog.py b/tests/test_pog.py index e7c48ef..2c23a49 100644 --- a/tests/test_pog.py +++ b/tests/test_pog.py @@ -15,6 +15,8 @@ 069:15:24 Anders (onboard): It looks like a big - looks like a big beach down there. ''' +CONCURRENCY_FLAG = '--concurrency=1' + def compute_checksum(filename): hash_md5 = hashlib.md5() @@ -51,7 +53,7 @@ class KeyfileTest(TestDirMixin, TestCase): def test_round_trip(self): # encrypt our sample files - enc = self.run_command(self.encryption_flag, self.tiny_sample, self.another_sample) + enc = self.run_command(self.encryption_flag, self.tiny_sample, self.another_sample, CONCURRENCY_FLAG) manifest_name = glob(path.join(self.working_dir.name, '*.mfn'))[0] # ordered lexicographically by filename @@ -167,7 +169,7 @@ def test_consistency_fs_input(self): def test_absolute_paths(self): # encrypt our sample files, saving their absolute paths in the manifest enc = self.run_command( - self.encryption_flag, self.tiny_sample, self.another_sample, '--store-absolute-paths' + self.encryption_flag, self.tiny_sample, self.another_sample, '--store-absolute-paths', CONCURRENCY_FLAG ) manifest_name = glob(path.join(self.working_dir.name, '*.mfn'))[0] @@ -213,7 +215,7 @@ def test_absolute_paths(self): def test_glob_input_directory(self): # encrypt our sample files, saving their absolute paths in the manifest enc = self.run_command( - self.encryption_flag, self.input_dir.name, '--store-absolute-paths' + self.encryption_flag, self.input_dir.name, '--store-absolute-paths', CONCURRENCY_FLAG ) manifest_name = glob(path.join(self.working_dir.name, '*.mfn'))[0] @@ -250,7 +252,7 @@ def test_manifest_index_ordering(self): We sort the blobs stored in the manifest index, to limit information about which blobs belong together. 
''' # encrypt our sample files - enc = self.run_command(self.encryption_flag, self.tiny_sample, self.another_sample) + enc = self.run_command(self.encryption_flag, self.tiny_sample, self.another_sample, CONCURRENCY_FLAG) manifest_name = glob(path.join(self.working_dir.name, '*.mfn'))[0] self.assertEqual(enc, [