Commit 66aeea0

Revert "Merge branch 'schema-updates' into master"
This reverts commit 265ee7b, reversing changes made to 81ed4fc. Should keep just master/main's content.
1 parent 265ee7b
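
Reverting a merge commit requires telling Git which parent's side of history to keep, so a revert like this one would typically have been produced with something along the lines of:

    git revert -m 1 265ee7b

Here -m 1 keeps the first-parent (master) side, which matches the stated intent of keeping only master/main's content.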

2 files changed: +50 −113 lines

Diff for: foundry/foundry.py

+38-78
@@ -101,10 +101,7 @@ def load(self, name, version="1.1", provider="MDF", download=True, globus=True,
         Args:
             name (str): Name of the foundry dataset
             download (bool): If True, download the data associated with the package (default is True)
-            globus (bool): If True, download using Globus, otherwise https
-            verbose (bool): If True print additional debug information
-            metadata (dict): **For debug purposes.** A search result analog to prepopulate metadata.
-
+
         Keyword Args:
             interval (int): How often to poll Globus to check if transfers are complete
 
@@ -123,6 +120,7 @@ def load(self, name, version="1.1", provider="MDF", download=True, globus=True,
                 .match_field("mdf.organizations", "foundry")
                 .search()
             )
+
         # Handle MDF source_ids
         else:
             print("Loading by source_id")
@@ -152,6 +150,7 @@ def load(self, name, version="1.1", provider="MDF", download=True, globus=True,
             self.download(
                 interval=kwargs.get("interval", 10), globus=globus, verbose=verbose
             )
+
         return self
 
     def list(self):
@@ -252,22 +251,44 @@ def load_data(self, source_id=None, globus=True):
 
         Args:
             inputs (list): List of strings for input columns
-            targets (list): List of strings for output columns
+            outputs (list): List of strings for output columns
 
         Returns
-        -------s
+        -------
         (tuple): Tuple of X, y values
         """
-        data = {}
-
-        # Handle splits if they exist. Return as a labeled dictionary of tuples
-        if self.dataset.splits:
-            for split in self.dataset.splits:
-                data[split.label] = self._load_data(file=split.path,
-                                                    source_id=source_id, globus=globus)
-            return data
+
+        if source_id:
+            path = os.path.join(self.config.local_cache_dir, source_id)
+            print("Here")
         else:
-            return {"data": self._load_data(source_id=source_id, globus=globus)}
+            path = os.path.join(self.config.local_cache_dir, self.mdf["source_id"])
+        # Handle Foundry-defined types.
+        if self.dataset.type.value == "tabular":
+            # If the file is not local, fetch the contents with Globus
+            # Check if the contents are local
+            # TODO: Add hashes and versioning to metadata and checking to the file
+            try:
+                self.dataset.dataframe = pd.read_json(
+                    os.path.join(path, self.config.dataframe_file)
+                )
+            except:
+                # Try to read individual lines instead
+                self.dataset.dataframe = pd.read_json(
+                    os.path.join(path, self.config.dataframe_file), lines=True
+                )
+
+            return (
+                self.dataset.dataframe[self.dataset.inputs],
+                self.dataset.dataframe[self.dataset.outputs],
+            )
+        elif self.dataset.type.value == "hdf5":
+            f = h5py.File(os.path.join(path, self.config.data_file), "r")
+            inputs = [f[i[0:]] for i in self.dataset.inputs]
+            outputs = [f[i[0:]] for i in self.dataset.outputs]
+            return (inputs, outputs)
+        else:
+            raise NotImplementedError
 
     def describe(self):
         print("DC:{}".format(self.dc))
@@ -607,15 +628,14 @@ def download(self, globus=True, verbose=False, **kwargs):
         num_cores = multiprocessing.cpu_count()
 
         def download_file(file):
-            requests.packages.urllib3.disable_warnings(
-                InsecureRequestWarning)
+            requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
             url = "https://data.materialsdatafacility.org" + file["path"]
             destination = (
                 "data/"
                 + source_id
                 + "/"
-                + file["path"][file["path"].rindex("/") + 1:]
+                + file["path"][file["path"].rindex("/") + 1 :]
             )
             response = requests.get(url, verify=False)
 
@@ -633,7 +653,6 @@ def download_file(file):
 
         return self
 
-
     def build(self, spec, globus=False, interval=3, file=False):
         """Build a Foundry Data Package
         Args:
@@ -670,62 +689,3 @@ def start_download(ds, interval=interval, globus=False):
             )
 
         return self
-
-    def get_keys(self, type, as_object=False):
-        """Get keys for a Foundry dataset
-
-        Arguments:
-            type (str): The type of key to be returned e.g., "input", "target"
-            as_object (bool): When ``False``, will return a list of keys in as strings
-                When ``True``, will return the full key objects
-                **Default:** ``False``
-        Returns: (list) String representations of keys or if ``as_object``
-            is False otherwise returns the full key objects.
-
-        """
-        if as_object:
-            return [key for key in self.dataset.keys if key.type == type]
-        else:
-            return [key.key for key in self.dataset.keys if key.type == type]
-
-    def _load_data(self, file=None, source_id=None, globus=True):
-
-        # Build the path to access the cached data
-        if source_id:
-            path = os.path.join(self.config.local_cache_dir, source_id)
-        else:
-            path = os.path.join(self.config.local_cache_dir,
-                                self.mdf["source_id"])
-
-        # Handle Foundry-defined types.
-        if self.dataset.type.value == "tabular":
-            # Determine which file to load, defaults to config.dataframe_file
-            if not file:
-                file = self.config.dataframe_file
-
-            # If the file is not local, fetch the contents with Globus
-            # Check if the contents are local
-            # TODO: Add hashes and versioning to metadata and checking to the file
-            try:
-                self.dataset.dataframe = pd.read_json(
-                    os.path.join(path, file)
-                )
-            except:
-                # Try to read individual lines instead
-                self.dataset.dataframe = pd.read_json(
-                    os.path.join(path, file), lines=True
-                )
-
-            return (
-                self.dataset.dataframe[self.get_keys("input")],
-                self.dataset.dataframe[self.get_keys("target")],
-            )
-        elif self.dataset.type.value == "hdf5":
-            if not file:
-                file = self.config.data_file
-            f = h5py.File(os.path.join(path, file), "r")
-            inputs = [f[i[0:]] for i in self.get_keys("input")]
-            targets = [f[i[0:]] for i in self.get_keys("target")]
-            return (inputs, targets)
-        else:
-            raise NotImplementedError
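
Taken together, this revert makes load_data read inputs and outputs straight from the dataset metadata lists instead of routing through _load_data, get_keys, and split handling. A minimal usage sketch of the reverted code path, assuming the package is importable as foundry and that Foundry() can be constructed with defaults; the dataset name is illustrative:

    from foundry import Foundry

    # Load dataset metadata by name; load() is shown above to return self
    f = Foundry().load("example_dataset", download=True, globus=False)

    # For a tabular dataset, load_data() returns (inputs, outputs) as two
    # DataFrame column selections, per the reverted implementation above
    X, y = f.load_data()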

Diff for: foundry/models.py

+12-35
@@ -1,5 +1,5 @@
 from typing import List, Dict, Optional, Any
-from pydantic import BaseModel, AnyHttpUrl
+from pydantic import BaseModel
 from enum import Enum
 import pandas as pd
 
@@ -13,7 +13,7 @@
 # description: str = ""
 
 
-# Classes for Foundry Data Package Specification
+### Classes for Foundry Data Package Specification
 class FoundrySpecificationDataset(BaseModel):
     """Pydantic base class for datasets within the Foundry data package specification"""
 
@@ -34,8 +34,7 @@ class FoundrySpecification(BaseModel):
     dependencies: List[FoundrySpecificationDataset]
 
     def add_dependency(self, name, version, provider="MDF"):
-        ds = FoundrySpecificationDataset(
-            name=name, provider=provider, version=version)
+        ds = FoundrySpecificationDataset(name=name, provider=provider, version=version)
         self.dependencies.append(ds)
 
     def remove_duplicate_dependencies(self):
@@ -49,7 +48,8 @@ def clear_dependencies(self):
         self.dependencies = []
 
 
-# END Classes for Foundry Data Package Specification
+### END Classes for Foundry Data Package Specification
+
 
 class FoundryDatasetType(Enum):
     """Foundry Dataset Types
@@ -62,45 +62,22 @@ class FoundryDatasetType(Enum):
     other = "other"
 
 
-class FoundryKey(BaseModel):
-    key: str = ""
-    type: str = ""
-    units: Optional[str] = ""
-    description: Optional[str] = ""
-    labels: Optional[List[str]] = []
-
-
-class FoundrySplit(BaseModel):
-    type: str = ""
-    path: Optional[str] = ""
-    label: Optional[str] = ""
-
-
-class FoundryLink(BaseModel):
-    link: Optional[AnyHttpUrl]
-    doi: Optional[str]
-
-
-class FoundryLinks(BaseModel):
-    papers: List[FoundryLink]
-    code: List[AnyHttpUrl]
-    homepage: List[AnyHttpUrl]
-    models: List[AnyHttpUrl]
-
-
 class FoundryDataset(BaseModel):
     """Foundry Dataset
     Schema for Foundry Datasets. This includes specifications of inputs, outputs, type, version, and more
     """
 
-    keys: List[FoundryKey] = None
-    splits: Optional[List[FoundrySplit]] = None
+    inputs: List = []
+    outputs: List = []
+    input_descriptions: Optional[List] = []
+    output_descriptions: Optional[List] = []
     type: FoundryDatasetType = None
+    # hash: Optional[str] = []
     version: Optional[str] = ""
     short_name: Optional[str] = ""
+    # references: Optional[List[str]] = []
     dataframe: Optional[Any] = None
-    links: Optional[FoundryLinks]
-    citations: Optional[List[str]] = []
+    # sources: Optional[List[AnyUrl]] = []
 
     class Config:
         arbitrary_types_allowed = True
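
After the revert, FoundryDataset describes columns with plain inputs/outputs lists rather than FoundryKey, FoundrySplit, and FoundryLinks objects. A minimal sketch of the reverted schema in use; the field values are illustrative, and tabular is assumed to be a FoundryDatasetType member based on the type.value == "tabular" check in load_data:

    from foundry.models import FoundryDataset, FoundryDatasetType

    ds = FoundryDataset(
        inputs=["composition"],           # illustrative column names
        outputs=["band_gap"],
        type=FoundryDatasetType.tabular,  # assumed member; "other" appears above
        version="1.0",
        short_name="example_bandgap",
    )
    print(ds.inputs, ds.outputs, ds.type.value)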
