Skip to content

Commit 0ebdf89

Browse files
authored
DLHub Removal + making https download more robust (#423)
* Removing DLHub and fixing https download bug
* Add https streaming download with monitor + flake8 fixes
* make it easier to test locally with confidential client credentials
* Minimizing requirements
* updating GitHub Actions to latest versions (there were breaking issues with caching)
* Pinning pydantic less than 2
1 parent 5456145 commit 0ebdf89

File tree

7 files changed: +54 -98 lines changed

.github/workflows/tests.yml

+2 -2
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ jobs:
2020
CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
2121
name: build
2222
steps:
23-
- uses: actions/checkout@v2
23+
- uses: actions/checkout@v4
2424
- name: Set up Python ${{ matrix.python-version }}
25-
uses: actions/setup-python@v2
25+
uses: actions/setup-python@v5
2626
with:
2727
python-version: ${{ matrix.python-version }}
2828
cache : 'pip'

foundry/foundry.py

+4 -67
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import pandas as pd
77
from pydantic import ValidationError
8-
from typing import Any, Dict, List
8+
from typing import Any, Dict, List, AnyStr
99
import logging
1010
import warnings
1111
import os
@@ -14,7 +14,6 @@
1414

1515
from mdf_connect_client import MDFConnectClient
1616
from mdf_forge import Forge
17-
from dlhub_sdk import DLHubClient
1817
from globus_sdk import AuthClient
1918

2019
from .auth import PubAuths
@@ -37,15 +36,14 @@ class Foundry(FoundryBase):
3736
"""Foundry Client Base Class
3837
3938
Foundry object used for all interactions with Foundry datasets and models. Interfaces with MDF Connect Client,
40-
Globus Compute, Globus Auth, Globus Transfer, Globus Search, DLHub, and relevant Globus Endpoints
39+
Globus Compute, Globus Auth, Globus Transfer, Globus Search, and relevant Globus Endpoints
4140
"""
4241

43-
dlhub_client: Any
4442
forge_client: Any
4543
connect_client: Any
4644
transfer_client: Any
4745
auth_client: Any
48-
index = ""
46+
index: AnyStr = ""
4947
auths: Any
5048

5149
def __init__(
@@ -84,9 +82,7 @@ def __init__(
8482
"search",
8583
"petrel",
8684
"transfer",
87-
"dlhub",
8885
"openid",
89-
"https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all", # funcx
9086
"https://auth.globus.org/scopes/f10a69a9-338c-4e5b-baa1-0dc92359ab47/https", # Eagle HTTPS
9187
"https://auth.globus.org/scopes/82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/https", # NCSA HTTPS
9288
"https://auth.globus.org/scopes/d31d4f5d-be37-4adc-a761-2f716b7af105/action_all", # Globus Search Lambda
@@ -115,7 +111,7 @@ def __init__(
115111
search_client=self.auths["search"],
116112
transfer_client=self.auths["transfer"],
117113
data_mdf_authorizer=self.auths["data_mdf"],
118-
petrel_authorizer=self.auths["petrel"],
114+
petrel_authorizer=self.auths["petrel"]
119115
)
120116

121117
self.transfer_client = self.auths['transfer']
@@ -132,19 +128,6 @@ def __init__(
132128
authorizer=self.auths["mdf_connect"], test=test
133129
)
134130

135-
self.dlhub_client = DLHubClient(
136-
dlh_authorizer=self.auths["dlhub"],
137-
search_authorizer=self.auths["search_authorizer"],
138-
fx_authorizer=self.auths[
139-
"https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all"
140-
],
141-
openid_authorizer=self.auths['openid'],
142-
sl_authorizer=self.auths[
143-
"https://auth.globus.org/scopes/d31d4f5d-be37-4adc-a761-2f716b7af105/action_all"
144-
],
145-
force_login=False,
146-
)
147-
148131
def load(self, name, download=True, globus=False, verbose=False, metadata=None, authorizers=None, **kwargs):
149132
"""Load the metadata for a Foundry dataset into the client
150133
@@ -250,22 +233,6 @@ def list(self):
250233
"""
251234
return self.search()
252235

253-
def run(self, name, inputs, funcx_endpoint=None, **kwargs):
254-
"""Run a model on inputted data
255-
256-
Args:
257-
name (str): DLHub model name
258-
inputs: Data to send to DLHub as inputs (should be JSON serializable, example types include dict, list,
259-
np.ndarray, etc)
260-
funcx_endpoint (str) (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
261-
262-
Returns:
263-
Results after invocation via the DLHub service
264-
"""
265-
if funcx_endpoint is not None:
266-
self.dlhub_client.fx_endpoint = funcx_endpoint
267-
return self.dlhub_client.run(name, inputs=inputs, **kwargs)
268-
269236
def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
270237
"""Load in the data associated with the prescribed dataset
271238
@@ -469,36 +436,6 @@ def publish_dataset(
469436
res = None
470437
return res
471438

472-
def publish_model(self, title, creators, short_name, servable_type, serv_options, affiliations=None, paper_doi=None):
473-
"""Simplified publishing method for servables
474-
475-
Args:
476-
title (string): title for the servable
477-
creators (string | list): either the creator's name (FamilyName, GivenName) or a list of the creators' names
478-
short_name (string): shorthand name for the servable
479-
servable_type (string): the type of the servable, must be a member of ("static_method",
480-
"class_method",
481-
"keras",
482-
"pytorch",
483-
"tensorflow",
484-
"sklearn")
485-
serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can
486-
be found at https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html
487-
under the appropriate ``create_model`` signature. use the argument names as keys and their values as
488-
the values.
489-
affiliations (list): list of affiliations for each author
490-
paper_doi (str): DOI of a paper that describes the servable
491-
492-
Returns:
493-
(string): task id of this submission, can be used to check for success
494-
495-
Raises:
496-
ValueError: If the given servable_type is not in the list of acceptable types
497-
Exception: If the serv_options are incomplete or the request to publish results in an error
498-
"""
499-
return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations,
500-
paper_doi)
501-
502439
def check_status(self, source_id, short=False, raw=False):
503440
"""Check the status of your submission.
504441

foundry/https_download.py

+35 -9
Original file line number | Diff line number | Diff line change
@@ -53,29 +53,55 @@ def _get_files(tc, ep, queue, max_depth):
5353

5454

5555
# TODO (wardlt): Avoid passing dictionaries, as documenting their content is tedious
56-
def download_file(item, https_config):
56+
def download_file(item, https_config, base_directory="data/", timeout=1800):
5757
"""Download a file to disk
5858
5959
Args:
6060
item: Dictionary defining the path to the file
6161
https_config: Configuration defining the URL of the server and the name of the dataset
6262
"""
63-
url = f"{https_config['base_url']}{item['path']}{item['name']}"
63+
base_url = https_config.get('base_url', '').rstrip('/')
64+
path = item.get('path', '').strip('/')
6465

65-
# build destination path for data file
66-
destination = os.path.join("data/", https_config['source_id'], item['name'])
66+
# Extracting the name and subdirectory from the item
67+
name = item.get('name', '')
68+
subdirectory = name.split('/')[0] if '/' in name else ''
69+
70+
# Avoid duplication of subdirectory in path
71+
if subdirectory and path.endswith(subdirectory):
72+
full_path = f"{path}/{name.split('/', 1)[-1]}".strip('/')
73+
else:
74+
full_path = '/'.join([path, name]).strip('/')
75+
76+
url = f"{base_url}/{full_path}"
6777

78+
# build destination path for data file
79+
destination = os.path.join(base_directory, https_config['source_id'], item['name'])
6880
parent_path = os.path.split(destination)[0]
6981

7082
# if parent directories don't exist, create them
7183
if not os.path.exists(parent_path):
7284
os.makedirs(parent_path)
7385

74-
response = requests.get(url)
75-
76-
# write file to local destination
77-
with open(destination, "wb") as f:
78-
f.write(response.content)
86+
try:
87+
with requests.get(url, stream=True, timeout=timeout) as response:
88+
response.raise_for_status()
89+
90+
downloaded_size = 0
91+
print(f"\rStarting Download of: {url}")
92+
93+
with open(destination, "wb") as f:
94+
for chunk in response.iter_content(chunk_size=8192):
95+
if chunk:
96+
f.write(chunk)
97+
downloaded_size += len(chunk)
98+
# Calculate and print the download progress
99+
print(f"\rDownloading... {downloaded_size/(1<<20):,.2f} MB", end="")
100+
return destination
101+
except requests.exceptions.RequestException as e:
102+
print(f"Error downloading file: {e}")
103+
except IOError as e:
104+
print(f"Error writing file to disk: {e}")
79105

80106
# TODO (wardlt): Should we just return the key?
81107
return {destination + " status": True}

foundry/models.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ class FoundryConfig(BaseModel):
144144
metadata_file: Optional[str] = "foundry_metadata.json"
145145
destination_endpoint: Optional[str] = None
146146
local: Optional[bool] = False
147-
local_cache_dir = "./data"
147+
local_cache_dir: Optional[str] = "./data"
148148
metadata_key: Optional[str] = "foundry"
149149
organization: Optional[str] = "foundry"
150150

requirements.txt

+1 -4
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,10 @@
11
globus-sdk>=3,<4
2-
dlhub_sdk>=2.0.3
32
requests>=2.18.4
4-
tqdm>=4.19.4
53
six>=1.11.0
64
h5py>=2.10.0
75
numpy>=1.15.4
86
pandas>=0.23.4
9-
scikit-learn>=1.0
10-
pydantic>=1.6.1
7+
pydantic<2.0.0
118
mdf_forge>=0.8.0
129
mdf-connect-client>=0.4.0
1310
json2table>=1.1.5

setup.py

+1 -2
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,9 @@
1616
install_requires=[
1717
"mdf_forge>=0.8.0",
1818
"globus-sdk>=3,<4",
19-
"dlhub_sdk>=1.0.0",
2019
"numpy>=1.15.4",
2120
"pandas>=0.23.4",
22-
"pydantic>=1.4",
21+
"pydantic<2.0.0",
2322
"mdf_connect_client>=0.4.0",
2423
"h5py>=2.10.0",
2524
"json2table"

tests/test_foundry.py

+10 -13
Original file line number | Diff line number | Diff line change
@@ -14,30 +14,29 @@
1414
from foundry import Foundry
1515
from foundry.auth import PubAuths
1616
from foundry.https_upload import upload_to_endpoint
17-
from dlhub_sdk import DLHubClient
1817
from globus_sdk import AuthClient
1918
from mdf_connect_client import MDFConnectClient
2019

21-
2220
client_id = os.getenv("CLIENT_ID")
2321
client_secret = os.getenv("CLIENT_SECRET")
24-
is_gha = os.getenv("GITHUB_ACTIONS")
22+
confidential_login = (os.getenv("GITHUB_ACTIONS") or (client_id and client_secret))
2523

2624
services = [
2725
"data_mdf",
2826
"mdf_connect",
2927
"search",
3028
"petrel",
3129
"transfer",
32-
"dlhub",
3330
"openid",
3431
"https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all", # funcx
3532
"https://auth.globus.org/scopes/f10a69a9-338c-4e5b-baa1-0dc92359ab47/https", # Eagle HTTPS
3633
"https://auth.globus.org/scopes/82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/https", # NCSA HTTPS
3734
"https://auth.globus.org/scopes/d31d4f5d-be37-4adc-a761-2f716b7af105/action_all", # Globus Search Lambda
3835
]
3936

40-
if is_gha:
37+
if confidential_login:
38+
# Use confidential login if the tests are being run on GitHub Actions or
39+
# if a client ID and secret are provided
4140
auths = mdf_toolbox.confidential_login(client_id=client_id,
4241
client_secret=client_secret,
4342
services=services, make_clients=True)
@@ -46,6 +45,7 @@
4645
client_secret=client_secret,
4746
services=["search"], make_clients=False)
4847
else:
48+
# Otherwise try to allow the user to login directly
4949
auths = mdf_toolbox.login(services=services, make_clients=True)
5050
search_auth = mdf_toolbox.login(services=["search"], make_clients=False)
5151

@@ -211,16 +211,13 @@ def test_foundry_init():
211211
assert isinstance(f.forge_client, Forge)
212212
assert isinstance(f.connect_client, MDFConnectClient)
213213

214-
if not is_gha:
215-
assert isinstance(f.dlhub_client, DLHubClient)
214+
if not confidential_login:
216215

217216
f2 = Foundry(authorizers=auths, no_browser=False, no_local_server=True)
218-
assert isinstance(f2.dlhub_client, DLHubClient)
219217
assert isinstance(f2.forge_client, Forge)
220218
assert isinstance(f2.connect_client, MDFConnectClient)
221219

222220
f3 = Foundry(authorizers=auths, no_browser=True, no_local_server=False)
223-
assert isinstance(f3.dlhub_client, DLHubClient)
224221
assert isinstance(f3.forge_client, Forge)
225222
assert isinstance(f3.connect_client, MDFConnectClient)
226223

@@ -330,7 +327,7 @@ def test_dataframe_load_doi():
330327
_delete_test_data(f)
331328

332329

333-
@pytest.mark.skipif(bool(is_gha), reason="Test does not succeed on GHA - no Globus endpoint")
330+
@pytest.mark.skipif(bool(confidential_login), reason="Test does not succeed on GHA - no Globus endpoint")
334331
def test_download_globus():
335332
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)
336333
_delete_test_data(f)
@@ -340,7 +337,7 @@ def test_download_globus():
340337
_delete_test_data(f)
341338

342339

343-
@pytest.mark.skipif(bool(is_gha), reason="Test does not succeed on GHA - no Globus endpoint")
340+
@pytest.mark.skipif(bool(confidential_login), reason="Test does not succeed on GHA - no Globus endpoint")
344341
def test_globus_dataframe_load():
345342
f = Foundry(authorizers=auths, no_browser=True, no_local_server=True)
346343

@@ -358,7 +355,7 @@ def test_globus_dataframe_load():
358355
_delete_test_data(f)
359356

360357

361-
@pytest.mark.skipif(bool(is_gha), reason="Not run as part of GHA CI")
358+
@pytest.mark.skipif(bool(confidential_login), reason="Not run as part of GHA CI")
362359
def test_publish_with_https():
363360
"""System test: Assess the end-to-end publication of a dataset via HTTPS
364361
"""
@@ -456,7 +453,7 @@ def test_ACL_creation_and_deletion():
456453
pass
457454

458455

459-
@pytest.mark.skipif(bool(is_gha), reason="Not run as part of GHA CI")
456+
@pytest.mark.skipif(bool(confidential_login), reason="Not run as part of GHA CI")
460457
def test_publish_with_globus():
461458
# TODO: automate dealing with curation and cleaning after tests
462459

0 commit comments

Comments
 (0)