Skip to content

Commit 4a35af1

Browse files
Merge pull request #24 from SenteraLLC/feature/mp-utils
Feature/mp utils
2 parents 9f40cff + 16105b5 commit 4a35af1

File tree

5 files changed

+65
-6
lines changed

5 files changed

+65
-6
lines changed

S3MP/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Semantic versioning for S3MP."""
22

3-
__version__ = "0.5.1"
3+
__version__ = "0.5.2"

S3MP/mirror_path.py

+59-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
"""S3 Mirror pathing management."""
22
from __future__ import annotations
3+
import concurrent.futures
34
from typing import Callable, Dict, List
45
from pathlib import Path
6+
from tqdm import tqdm
7+
import shutil
8+
import psutil
59
from S3MP.global_config import S3MPConfig
610
from S3MP.keys import KeySegment, get_matching_s3_keys
711
from S3MP.utils.local_file_utils import (
@@ -142,7 +146,7 @@ def get_sibling(self, sibling_name: str) -> MirrorPath:
142146
)
143147

144148
def get_child(self, child_name: str) -> MirrorPath:
    """Get the child of this path named ``child_name``."""
    # A KeySegment at relative depth 1 addresses the level directly
    # below this path.
    child_segment = KeySegment(1, child_name)
    return self.replace_key_segments_at_relative_depth([child_segment])
147151

148152
def get_children_on_s3(self) -> List[MirrorPath]:
@@ -208,11 +212,64 @@ def save_local(
208212
if upload:
209213
self.upload_from_mirror(overwrite)
210214

215+
def copy_to_mp_s3_only(self, dest_mp: MirrorPath):
    """Copy this object's S3 key to ``dest_mp``'s S3 key.

    Source and destination both live in ``S3MPConfig.default_bucket_key``;
    the local mirror is not touched.
    """
    client = S3MPConfig.s3_client
    bucket = S3MPConfig.default_bucket_key
    source = {"Bucket": bucket, "Key": self.s3_key}
    client.copy_object(CopySource=source, Bucket=bucket, Key=dest_mp.s3_key)
222+
223+
def copy_to_mp_mirror_only(self, dest_mp: MirrorPath):
    """Copy this file from the mirror to a destination on the mirror.

    Only local files are touched; nothing is uploaded to or downloaded
    from S3.

    Args:
        dest_mp: MirrorPath whose ``local_path`` receives the copy.

    Raises:
        FileNotFoundError: If this file does not exist in the local mirror.
    """
    # The destination folder may not have been mirrored locally yet;
    # shutil.copy does not create missing parent directories itself.
    Path(dest_mp.local_path).parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(self.local_path, dest_mp.local_path)
226+
227+
def copy_to_mp(self, dest_mp: MirrorPath, use_mirror_as_src: bool = False):
    """Copy this file to ``dest_mp`` both on S3 and in the local mirror.

    The S3 object is treated as the source of truth unless
    ``use_mirror_as_src`` is True, in which case the local mirror file is.
    Either way the second step overwrites whatever is at the destination.
    """
    if not use_mirror_as_src:
        # S3 is authoritative: copy S3 -> S3, then pull the result down
        # into the destination's mirror location.
        self.copy_to_mp_s3_only(dest_mp)
        dest_mp.download_to_mirror(overwrite=True)
        return

    # Mirror is authoritative: copy mirror -> mirror, then push the
    # result up to the destination's S3 key.
    self.copy_to_mp_mirror_only(dest_mp)
    dest_mp.upload_from_mirror(overwrite=True)
243+
244+
211245
def get_matching_s3_mirror_paths(
    segments: List[KeySegment]
):
    """Return a MirrorPath for every S3 key matching ``segments``."""
    matching_keys = get_matching_s3_keys(segments)
    return list(map(MirrorPath.from_s3_key, matching_keys))
253+
254+
255+
def multithread_download_mps_to_mirror(
    mps: list[MirrorPath], overwrite: bool = False
):
    """Download a list of MirrorPaths to the local mirror in parallel.

    Despite the name, the downloads run in a pool of worker *processes*
    (one per physical core), with a progress bar that advances as each
    download finishes.

    Args:
        mps: MirrorPaths to download.
        overwrite: Forwarded to each ``download_to_mirror`` call.

    Raises:
        Exception: The first exception (in ``mps`` order) raised by any
            download, re-raised after every download has finished and the
            pool has been shut down.
    """
    # cpu_count(logical=False) can return None when the physical core count
    # is undetermined; ProcessPoolExecutor then falls back to its default.
    n_procs = psutil.cpu_count(logical=False)

    # Context managers guarantee the pool is shut down and the progress bar
    # is closed even if something goes wrong while waiting.
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=n_procs
    ) as proc_executor:
        with tqdm(total=len(mps), desc="Downloading to mirror") as pbar:
            all_proc_futures = [
                proc_executor.submit(mp.download_to_mirror, overwrite=overwrite)
                for mp in mps
            ]

            # Increment pbar as processes finish
            for _ in concurrent.futures.as_completed(all_proc_futures):
                pbar.update(n=1)

    # All futures are done at this point, so exception() returns immediately.
    for pf in all_proc_futures:
        exc = pf.exception()
        if exc is not None:
            raise exc

S3MP/utils/s3_utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def key_is_file_on_s3(
8484
bucket = bucket or S3MPConfig.bucket
8585
client = client or S3MPConfig.s3_client
8686
if not key_exists_on_s3(key, bucket, client):
87-
raise ValueError("Key does not exist on S3")
87+
raise ValueError(f"Key {key} does not exist on S3")
8888
res = s3_list_single_key(key, bucket, client)
8989
# Handle case of trailing slash, but still verify
9090
if (
@@ -105,7 +105,7 @@ def key_size_on_s3(
105105
bucket = bucket or S3MPConfig.bucket
106106
client = client or S3MPConfig.s3_client
107107
if not key_exists_on_s3(key, bucket, client):
108-
raise ValueError("Key does not exist on S3")
108+
raise ValueError(f"Key {key} does not exist on S3")
109109
res = s3_list_single_key(key, bucket, client)
110110
return res["Contents"]["Size"] if "Contents" in res else 0
111111

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies:
1010
- future
1111
- mypy-boto3-s3
1212
- pip
13+
- psutil
1314
- pyparsing
1415
- pyproj
1516
- python

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "S3MP"
3-
version = "0.5.1"
3+
version = "0.5.2"
44
description = ""
55
authors = [
66
{name = "Joshua Dean", email = "[email protected]"}
@@ -13,6 +13,7 @@ dependencies = [
1313
"future",
1414
"mypy-boto3-s3",
1515
"pip",
16+
"psutil",
1617
"pyparsing",
1718
"pyproj",
1819
"setuptools",

0 commit comments

Comments
 (0)