From de2f7e03a6c1b067d0f6884dfe270d5144af84fd Mon Sep 17 00:00:00 2001 From: Aadit Ambadkar <58674441+Aadit-Ambadkar@users.noreply.github.com> Date: Tue, 16 Aug 2022 07:33:20 -0700 Subject: [PATCH] To tf dataset (#201) with rebase * Add Custom Dataset and Implement * Clean up Branch * Clean up Branch * Resolve Some of Logan's Changes * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Reflect Logan's Requests * Fix Import Issues * Simplify Imports * Fix Imports * Apply Logan's Changes * Comments * Refactor Common Logic Into New Function * Add Documentation * Add Documentation * Add Custom Dataset and Implement * Clean up Branch * Replace keep_hdf5 with as_hdf5 * Resolve Some of Logan's Changes * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Resolve Testing Issues? * Reflect Logan's Requests * Fix Import Issues * Simplify Imports * Fix Imports * Apply Logan's Changes * Comments * Refactor Common Logic Into New Function * Add Documentation * Add Documentation * fix reference to _get_inputs_to_targets(); also, whitespace * remove unused * import * fix test_foundry.py to have the proper tests from the dev branch * remove outdated test_to_pytorch() test * fix passing of self for _get_inputs_targets() Co-authored-by: Aristana Scourtas --- .gitignore | 1 + foundry/foundry.py | 67 +++++++++++++------ foundry/loaders/__init__.py | 0 .../tf_wrapper.py} | 9 ++- foundry/loaders/torch_wrapper.py | 28 ++++++++ requirements.txt | 1 + tests/test_foundry.py | 64 ++++++++++++------ 7 files changed, 124 insertions(+), 46 deletions(-) create mode 100644 foundry/loaders/__init__.py rename foundry/{external_data_architectures.py => loaders/tf_wrapper.py} (79%) create mode 100644 foundry/loaders/torch_wrapper.py diff --git a/.gitignore b/.gitignore index 774559d4..d51927fd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.DS_STORE *.pyc *.idea +*/foundry_ml.egg-info/* \ No newline at end of file diff --git a/foundry/foundry.py b/foundry/foundry.py index 8f87d6f9..dc778663 100644 --- a/foundry/foundry.py +++ b/foundry/foundry.py @@ -21,9 +21,6 @@ FoundrySpecification, FoundryDataset ) -from foundry.external_data_architectures import ( - FoundryDataset_Torch -) import logging import warnings @@ -737,7 +734,6 @@ def get_keys(self, type=None, as_object=False): key_list = key_list + k return key_list - def _load_data(self, file=None, source_id=None, globus=True, as_hdf5=False): # Build the path to access the cached data if source_id: @@ -752,7 +748,6 @@ def _load_data(self, file=None, source_id=None, globus=True, as_hdf5=False): if not file: file = self.config.dataframe_file - # Check to make sure the path can be created try: path_to_file = os.path.join(path, file) @@ -817,22 +812,16 @@ def _load_data(self, file=None, source_id=None, globus=True, as_hdf5=False): else: raise NotImplementedError - - def toTorch(self, raw=None, split=None): - """Convert Foundry Dataset to a PyTorch Dataset + def _get_inputs_targets(self, split: str = None): + """Get Inputs and Outputs from a Foundry Dataset Arguments: - raw (dict): The output of running ``f.load_data(as_hdf5=False)`` - Recommended that this is left as ``None`` + split (string): Split to get inputs and outputs from. **Default:** ``None`` - split (string): Split to create PyTorch Dataset on. 
- **Default:** ``None`` - - Returns: (FoundryDataset_Torch) PyTorch Dataset of all the data from the specified split - + + Returns: (Tuple) Tuple of the inputs and outputs """ - if not raw: - raw = self.load_data(as_hdf5=False) + raw = self.load_data(as_hdf5=False) if not split: split = self.dataset.splits[0].type @@ -841,16 +830,23 @@ def toTorch(self, raw=None, split=None): inputs = [] targets = [] for key in self.dataset.keys: + # raw[split][key.type][key.key[0]] gets the data values for the given key. + # + # For example, if the key was coordinates and had type target, then + # raw[split][key.type][key.key[0]] would return all the coordinates for each item + # and raw[split][key.type][key.key[0]].keys() are the indexes of the item. if len(raw[split][key.type][key.key[0]].keys()) != self.dataset.n_items: continue + # Get a numpy array of all the values for each item for that key val = np.array([raw[split][key.type][key.key[0]][k] for k in raw[split][key.type][key.key[0]].keys()]) if key.type == 'input': inputs.append(val) else: targets.append(val) + + return (inputs, targets) - return FoundryDataset_Torch(inputs, targets) elif self.dataset.data_type.value == "tabular": inputs = [] targets = [] @@ -859,11 +855,42 @@ def toTorch(self, raw=None, split=None): df = raw[split][index] for key in df.keys(): arr.append(df[key].values) - - return FoundryDataset_Torch(inputs, targets) + + return (inputs, targets) + else: raise NotImplementedError + def to_torch(self, split: str = None): + """Convert Foundry Dataset to a PyTorch Dataset + + Arguments: + split (string): Split to create PyTorch Dataset on. + **Default:** ``None`` + + Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split + + """ + from foundry.loaders.torch_wrapper import TorchDataset + + inputs, targets = self._get_inputs_targets(split) + return TorchDataset(inputs, targets) + + def to_tensorflow(self, split: str = None): + """Convert Foundry Dataset to a Tensorflow Sequence + + Arguments: + split (string): Split to create Tensorflow Sequence on. 
+ **Default:** ``None`` + + Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split + + """ + from foundry.loaders.tf_wrapper import TensorflowSequence + + inputs, targets = self._get_inputs_targets(split) + return TensorflowSequence(inputs, targets) + def is_pandas_pytable(group): if 'axis0' in group.keys() and 'axis1' in group.keys(): diff --git a/foundry/loaders/__init__.py b/foundry/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/foundry/external_data_architectures.py b/foundry/loaders/tf_wrapper.py similarity index 79% rename from foundry/external_data_architectures.py rename to foundry/loaders/tf_wrapper.py index 2adc5349..ac4e92f9 100644 --- a/foundry/external_data_architectures.py +++ b/foundry/loaders/tf_wrapper.py @@ -1,9 +1,8 @@ import numpy as np -import torch -from torch.utils.data import Dataset +from tensorflow.keras.utils import Sequence -class FoundryDataset_Torch(Dataset): - """Foundry Dataset Converted to Pytorch Format""" +class TensorflowSequence(Sequence): + """Foundry Dataset Converted to Tensorflow Format""" def __init__(self, inputs, targets): self.inputs=inputs @@ -24,4 +23,4 @@ def __getitem__(self, idx): item["target"] = np.array(item["target"]) return item - + \ No newline at end of file diff --git a/foundry/loaders/torch_wrapper.py b/foundry/loaders/torch_wrapper.py new file mode 100644 index 00000000..217022dc --- /dev/null +++ b/foundry/loaders/torch_wrapper.py @@ -0,0 +1,28 @@ +import numpy as np +from torch.utils.data import Dataset + +class TorchDataset(Dataset): + """Foundry Dataset Converted to Pytorch Format""" + + def __init__(self, inputs, targets): + self.inputs=inputs + self.targets=targets + + def __len__(self): + return len(self.inputs[0]) + + def __getitem__(self, idx): + item = {"input": [], "target": []} + + # adds the correct item at index idx from each input from self.inputs to the item dictionary + for input in self.inputs: + item["input"].append(np.array(input[idx])) + item["input"] = np.array(item["input"]) + + # adds the correct item at index idx from each target from self.targets to the item dictionary + for target in self.targets: + item["target"].append(np.array(target[idx])) + item["target"] = np.array(item["target"]) + + return item + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 99fc6358..84e6d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ mdf-connect-client>=0.4.0 json2table>=1.1.5 joblib>=1.1.0 torch>=1.8.0 +tensorflow>=2 \ No newline at end of file diff --git a/tests/test_foundry.py b/tests/test_foundry.py index f8cb2900..bfb73ba9 100644 --- a/tests/test_foundry.py +++ b/tests/test_foundry.py @@ -1,4 +1,6 @@ import os, shutil +import re +import types import pytest from datetime import datetime import mdf_toolbox @@ -44,6 +46,7 @@ test_dataset = "foundry_experimental_band_gaps_v1.1" expected_title = "Graph Network Based Deep Learning of Band Gaps - Experimental Band Gaps" + # Kept the Old metadata format in case we ever want to refer back old_test_metadata = { "inputs": ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"], @@ -58,7 +61,7 @@ "package_type": "tabular" } -test_metadata = { +pub_test_metadata = { "keys":[ { "key": ["sepal length (cm)"], @@ -116,11 +119,11 @@ 'n_items': 1000 } -# Globus endpoint for '_iris_dev' -test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F" +# 
Globus endpoint for '_iris_dev' for test publication +pub_test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F" -#Quick function to delete any downloaded test data +# Quick function to delete any downloaded test data def _delete_test_data(foundry_obj): path = os.path.join(foundry_obj.config.local_cache_dir, test_dataset) if os.path.isdir(path): @@ -184,19 +187,6 @@ def test_dataframe_load(): _delete_test_data(f) -def test_to_pytorch(): - f = Foundry(authorizers=auths, no_browser=True, no_local_server=True) - _delete_test_data(f) - - f = f.load(test_dataset, download=True, globus=False, authorizers=auths) - raw = f.load_data() - ds = f.toTorch(raw=raw, split='train') - - assert raw['train'][0].iloc[0][0] == ds[0]['input'][0] - assert len(raw['train'][0]) == len(ds) - _delete_test_data(f) - - @pytest.mark.skipif(bool(is_gha), reason="Test does not succeed online") # PLEASE CONFIRM THIS BEHAVIOR IS INTENDED def test_download_globus(): f = Foundry(authorizers=auths, no_browser=True, no_local_server=True) @@ -234,7 +224,7 @@ def test_publish(): short_name = "example_AS_iris_test_{:.0f}".format(timestamp) authors = ["A Scourtas"] - res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name) + res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name) # publish with short name assert res['success'] @@ -247,19 +237,51 @@ def test_publish(): # assert res['source_id'] == "_test_scourtas_example_iris_publish_{:.0f}_v1.1".format(timestamp) # check that pushing same dataset without update flag fails - res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name) + res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name) assert not res['success'] # check that using update flag allows us to update dataset - res = f.publish(test_metadata, test_data_source, title, authors, short_name=short_name, update=True) + res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=short_name, update=True) assert res['success'] # check that using update flag for new dataset fails new_short_name = short_name + "_update" - res = f.publish(test_metadata, test_data_source, title, authors, short_name=new_short_name, update=True) + res = f.publish(pub_test_metadata, pub_test_data_source, title, authors, short_name=new_short_name, update=True) assert not res['success'] def test_check_status(): # TODO: the 'active messages' in MDF CC's check_status() don't appear to do anything? need to determine how to test pass + + +def test_to_pytorch(): + f = Foundry(authorizers=auths, no_browser=True, no_local_server=True) + + _delete_test_data(f) + + f = f.load(test_dataset, download=True, globus=False, authorizers=auths) + raw = f.load_data() + + ds = f.to_torch(split='train') + + assert raw['train'][0].iloc[0][0] == ds[0]['input'][0] + assert len(raw['train'][0]) == len(ds) + + _delete_test_data(f) + + +def test_to_tensorflow(): + f = Foundry(authorizers=auths, no_browser=True, no_local_server=True) + + _delete_test_data(f) + + f = f.load(test_dataset, download=True, globus=False, authorizers=auths) + raw = f.load_data() + + ds = f.to_tensorflow(split='train') + + assert raw['train'][0].iloc[0][0] == ds[0]['input'][0] + assert len(raw['train'][0]) == len(ds) + + _delete_test_data(f)
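
A minimal usage sketch of the new loaders, modeled on test_to_pytorch() and test_to_tensorflow() above. The dataset name is the test dataset from tests/test_foundry.py; constructing Foundry without explicit authorizers is an assumption here (the tests build `auths` and pass `authorizers=auths` explicitly).

    from foundry import Foundry

    # The tests pass authorizers=auths, no_browser=True, no_local_server=True;
    # a bare Foundry() is assumed to fall back to the interactive Globus login.
    f = Foundry()
    f = f.load("foundry_experimental_band_gaps_v1.1", download=True, globus=False)

    # PyTorch view: a torch.utils.data.Dataset whose items are
    # {"input": np.ndarray, "target": np.ndarray}
    torch_ds = f.to_torch(split="train")
    print(len(torch_ds), torch_ds[0]["input"].shape)

    # TensorFlow view: a tensorflow.keras.utils.Sequence with the same item layout
    tf_seq = f.to_tensorflow(split="train")
    print(len(tf_seq), tf_seq[0]["input"].shape)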
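
Continuing the sketch above, the TorchDataset can be batched with a standard DataLoader; this is an assumption not exercised by the tests, and it relies on torch's default collate stacking each key of the item dicts, which only works if every item yields same-shaped numeric arrays for "input" and "target".

    from torch.utils.data import DataLoader

    loader = DataLoader(torch_ds, batch_size=32, shuffle=True)
    for batch in loader:
        # Default collation turns each dict key into a batched torch.Tensor.
        print(batch["input"].shape, batch["target"].shape)
        break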