File renamed without changes.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
@@ -3,7 +3,9 @@ name: Run I-GUIDE Builds and Config Saturations
on:
workflow_dispatch:
push:
branches: ['*']
branches-ignore:
- productionalization
- develop

env:
DOMAIN: iguide.cuahsi.io
96 changes: 0 additions & 96 deletions .github/workflows/deploy-dev.yaml

This file was deleted.

35 changes: 21 additions & 14 deletions .github/workflows/deploy.yaml
@@ -4,28 +4,30 @@ on:
workflow_dispatch:
push:
branches:
- 'develop'
- 'productionalization'

env:
DOMAIN: iguide.cuahsi.io
TAG: latest
DEPLOY_TO_PRODUCTION: ${{ github.ref == 'refs/heads/productionalization' && true || false }}
DOMAIN: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev.cuahsi.io' || 'iguide.cuahsi.io' }}
IP: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }}
KUBE_CLUSTER_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }}
TAG: ${{ env.DEPLOY_TO_PRODUCTION == false && github.sha || 'latest' }}
TESTING: false
IP: iguide
OIDC_ISSUER: https://orcid.org
DATABASE_NAME: iguide_beta
# why are we using iguide_beta for production deployment? Should it be iguide_demo?
DATABASE_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide_dev' || 'iguide_beta' }}
DB_PROTOCOL: mongodb+srv
HYDROSHARE_META_READ_URL: https://www.hydroshare.org/hsapi2/resource/%s/json/
HYDROSHARE_FILE_READ_URL: https://www.hydroshare.org/hsapi/resource/%s/files/
VITE_APP_NAME: I-GUIDE
VITE_APP_URL: https://iguide.cuahsi.io
VITE_APP_API_URL: https://iguide.cuahsi.io/api
VITE_APP_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io' || 'https://iguide.cuahsi.io' }}
VITE_APP_API_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io/api' || 'https://iguide.cuahsi.io/api' }}
VITE_APP_LOGIN_URL: https://orcid.org/oauth/authorize
VITE_APP_GOOGLE_MAPS_API_KEY: ""
VITE_APP_SUPPORT_EMAIL: [email protected]
VITE_APP_CLIENT_ID: APP-4ZA8C8BYAH3QHNE9
SEARCH_RELEVANCE_SCORE_THRESHOLD: 1.4


jobs:
deploy:
runs-on: ubuntu-latest
@@ -43,9 +45,9 @@ jobs:

- name: Compile the root env file
env:
DB_HOST: ${{ secrets.DB_HOST }}
DB_USERNAME: ${{ secrets.DB_USERNAME }}
DB_PASSWORD: ${{ secrets.DB_PASSWORD }}
DB_HOST: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_HOST_BETA || secrets.DB_HOST }}
DB_USERNAME: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_USERNAME_BETA || secrets.DB_USERNAME }}
DB_PASSWORD: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_PASSWORD_BETA || secrets.DB_PASSWORD }}
run: |
variables=("OIDC_ISSUER" "DB_USERNAME" "DB_PASSWORD" "DB_HOST" "DATABASE_NAME" "DB_PROTOCOL" "TESTING" "VITE_APP_LOGIN_URL" "HYDROSHARE_META_READ_URL" "HYDROSHARE_FILE_READ_URL" "SEARCH_RELEVANCE_SCORE_THRESHOLD")

@@ -58,6 +60,9 @@ jobs:
done

- name: Compile the frontend env file
env:
VITE_APP_GOOGLE_MAPS_API_KEY: ${{env.DEPLOY_TO_PRODUCTION == false && secrets.VITE_APP_GOOGLE_MAPS_API_KEY || ''}}

run: |
variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY")

@@ -86,11 +91,13 @@ jobs:
USE_GKE_GCLOUD_AUTH_PLUGIN: True
GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }}
run: |
gcloud container clusters get-credentials iguide --region us-central1
find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {}
gcloud container clusters get-credentials $KUBE_CLUSTER_NAME --region us-central1
find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {}
find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$TAG/g" {}
find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {}
find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {}
if [[ "${{ env.DEPLOY_TO_PRODUCTION }}" == true ]]; then
find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {}
fi
kubectl apply -f kubernetes/
# Refresh pods
kubectl delete pods --all
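
Note on the env-file steps above: the "Compile the root env file" and "Compile the frontend env file" steps iterate over a list of variable names and write each one as a NAME=value line into a dotenv file for the containers. A minimal Python sketch of that idea, assuming illustrative variable names and an illustrative output path (neither is taken from the workflow):

import os

def compile_env_file(variable_names, output_path):
    """Write NAME=value lines for the selected environment variables."""
    # Hypothetical helper; mirrors the bash loop in the workflow steps above.
    with open(output_path, "w") as env_file:
        for name in variable_names:
            value = os.environ.get(name, "")
            env_file.write(f"{name}={value}\n")

# Example usage (illustrative variable names only):
# compile_env_file(["OIDC_ISSUER", "DATABASE_NAME", "DB_PROTOCOL"], ".env")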
2 changes: 1 addition & 1 deletion Makefile
@@ -33,4 +33,4 @@ test:

.PHONY: pre-post
pre-post:
docker-compose run catalog-trigger python /app/triggers/management/change_streams_pre_and_post.py
docker-compose run catalog-trigger python /app/api/models/management/change_streams_pre_and_post.py
2 changes: 1 addition & 1 deletion api/adapters/hydroshare.py
@@ -8,7 +8,7 @@
from api.exceptions import RepositoryException
from api.models import schema
from api.models.catalog import DatasetMetadataDOC
from api.models.user import Submission, SubmissionType
from api.models.user import Submission


class Creator(BaseModel):
31 changes: 24 additions & 7 deletions api/adapters/s3.py
@@ -1,12 +1,16 @@
import boto3
import json
from botocore.client import Config
from http import HTTPStatus

import boto3
from botocore import UNSIGNED
from botocore.client import Config
from botocore.exceptions import ClientError as S3ClientError

from api.adapters.base import AbstractRepositoryMetadataAdapter, AbstractRepositoryRequestHandler
from api.adapters.utils import RepositoryType, register_adapter
from api.exceptions import RepositoryException
from api.models.catalog import DatasetMetadataDOC
from api.models.user import Submission, SubmissionType
from api.models.user import Submission


class _S3RequestHandler(AbstractRepositoryRequestHandler):
@@ -16,12 +20,25 @@ def get_metadata(self, record_id: str):
file_key = record_id.split("+")[2]

s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url)

response = s3.get_object(Bucket=bucket_name, Key=file_key)
json_content = response['Body'].read().decode('utf-8')
try:
response = s3.get_object(Bucket=bucket_name, Key=file_key)
except S3ClientError as ex:
if ex.response["Error"]["Code"] == "NoSuchKey":
raise RepositoryException(
detail=f"Specified metadata file was not found in S3: {bucket_name}/{file_key}",
status_code=HTTPStatus.NOT_FOUND
)
else:
err_msg = f"Error accessing S3 file({bucket_name}/{file_key}): {str(ex)}"
raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST)

json_content = response['Body'].read().decode('utf-8')
# Parse the JSON content
data = json.loads(json_content)
try:
data = json.loads(json_content)
except json.JSONDecodeError as ex:
err_msg = f"Invalid JSON content in S3 file ({file_key}). Error: {str(ex)}"
raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST)

return data

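
For reference, a self-contained sketch of the fetch-and-parse pattern the updated adapter uses: an unsigned boto3 client, a 404-style path for missing keys, and explicit handling of malformed JSON. Bucket, key, and endpoint are placeholders, and plain built-in exceptions stand in for RepositoryException:

import json

import boto3
from botocore import UNSIGNED
from botocore.client import Config
from botocore.exceptions import ClientError

def fetch_json_from_s3(endpoint_url: str, bucket: str, key: str) -> dict:
    # Anonymous (unsigned) client, as in the adapter above.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url)
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "NoSuchKey":
            raise FileNotFoundError(f"Metadata file not found in S3: {bucket}/{key}") from ex
        raise RuntimeError(f"Error accessing S3 file ({bucket}/{key}): {ex}") from ex
    body = response["Body"].read().decode("utf-8")
    try:
        return json.loads(body)
    except json.JSONDecodeError as ex:
        raise ValueError(f"Invalid JSON content in S3 file ({key}): {ex}") from ex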
5 changes: 5 additions & 0 deletions api/models/user.py
@@ -30,6 +30,11 @@ def identifier(self):
identifier = f"{endpoint_url}/{self.bucket}/{self.path}"
return identifier

@property
def fetch_identifier(self):
# This is the identifier that is used to fetch the file from S3
return f"{self.endpoint_url}+{self.bucket}+{self.path}"


class Submission(Document):
title: str = None
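
The model now exposes two identifier forms: the "/"-joined identifier stored with the submission and a "+"-joined fetch_identifier that the S3 adapter later splits back apart. A hedged sketch of the round trip, with an abbreviated stand-in for the real S3Path model (the trailing-slash handling in identifier is an assumption, not shown in the diff):

from pydantic import BaseModel

class S3PathSketch(BaseModel):
    endpoint_url: str
    bucket: str
    path: str

    @property
    def identifier(self) -> str:
        # Human-readable identifier stored with the submission (slash handling assumed).
        return f"{self.endpoint_url.rstrip('/')}/{self.bucket}/{self.path}"

    @property
    def fetch_identifier(self) -> str:
        # "+"-delimited form the S3 adapter splits to locate the object.
        return f"{self.endpoint_url}+{self.bucket}+{self.path}"

# Round trip used in _save_to_db: fetch_identifier -> S3Path -> identifier
s3_path = S3PathSketch(endpoint_url="https://example-minio.local/", bucket="demo-bucket", path="data/metadata.json")
endpoint, bucket, key = s3_path.fetch_identifier.split("+")
assert S3PathSketch(endpoint_url=endpoint, bucket=bucket, path=key).identifier == s3_path.identifier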
26 changes: 19 additions & 7 deletions api/routes/catalog.py
@@ -144,14 +144,18 @@ async def refresh_dataset_from_hydroshare(identifier: str, user: Annotated[User,


@router.put("/repository/s3", response_model=DatasetMetadataDOC)
async def register_s3_dataset(request_model: S3Path, user: Annotated[User, Depends(get_current_user)]):
async def register_s3_dataset(s3_path: S3Path, user: Annotated[User, Depends(get_current_user)]):
"""User provides the path to the S3 object. The metadata is fetched from the s3 object and saved to the catalog."""
path = request_model.path
bucket = request_model.bucket
endpoint_url = request_model.endpoint_url
identifier = f"{endpoint_url}+{bucket}+{path}"

identifier = s3_path.identifier
submission: Submission = user.submission_by_repository(repo_type=RepositoryType.S3, identifier=identifier)
dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=identifier, user=user,
if submission is not None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="This S3 dataset has already been submitted by this user",
)
fetch_identifier = s3_path.fetch_identifier
dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=fetch_identifier, user=user,
submission=submission)
return dataset

@@ -171,7 +175,7 @@ async def create_dataset_s3(
if submission is not None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Dataset metadata record was not found",
detail="This S3 dataset has already been submitted by this user",
)
await document.insert()
submission = document.as_submission()
@@ -221,11 +225,17 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: Us
adapter = get_adapter_by_type(repository_type=repository_type)
# fetch metadata from repository as catalog dataset
repo_dataset: DatasetMetadataDOC = await _get_repo_meta_as_catalog_record(adapter=adapter, identifier=identifier)
s3_path = None
if repository_type == RepositoryType.S3:
s3_endpoint_url, bucket, path = identifier.split("+")
s3_path = S3Path(endpoint_url=s3_endpoint_url, bucket=bucket, path=path)
identifier = s3_path.identifier
if submission is None:
# new registration
await repo_dataset.insert()
submission = repo_dataset.as_submission()
submission = adapter.update_submission(submission=submission, repo_record_id=identifier)
submission.s3_path = s3_path
user.submissions.append(submission)
await user.save(link_rule=WriteRules.WRITE)
dataset = repo_dataset
@@ -239,12 +249,14 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: Us
updated_submission = adapter.update_submission(submission=updated_submission, repo_record_id=identifier)
updated_submission.id = submission.id
updated_submission.submitted = submission.submitted
updated_submission.s3_path = s3_path
await updated_submission.replace()
dataset = updated_dataset
submission = updated_submission

dataset = inject_repository_identifier(submission, dataset)
dataset = inject_submission_type(submission, dataset)
dataset = inject_submission_s3_path(submission, dataset)
return dataset


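
The registration routes above now reject duplicate submissions up front with a 400 instead of passing an existing submission through to _save_to_db. A minimal FastAPI-style sketch of that guard, using an in-memory set as a hypothetical stand-in for the real per-user submission lookup and persistence:

from fastapi import FastAPI, HTTPException, status
from pydantic import BaseModel

app = FastAPI()

class S3PathIn(BaseModel):
    endpoint_url: str
    bucket: str
    path: str

# Hypothetical in-memory record of identifiers already submitted; the real route
# consults user.submission_by_repository() instead.
_already_submitted: set[str] = set()

@app.put("/repository/s3")
async def register_s3_dataset(s3_path: S3PathIn):
    identifier = f"{s3_path.endpoint_url.rstrip('/')}/{s3_path.bucket}/{s3_path.path}"
    if identifier in _already_submitted:
        # Mirrors the 400 returned when the same user re-submits the same object.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This S3 dataset has already been submitted by this user",
        )
    _already_submitted.add(identifier)
    # The real route fetches the metadata via the S3 adapter and persists it in _save_to_db.
    return {"registered": identifier}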
25 changes: 25 additions & 0 deletions tests/test_dataset_routes.py
@@ -402,6 +402,31 @@ async def test_get_datasets_exclude_none(client_test, dataset_data):
assert "measurementTechnique" not in a_property


@pytest.mark.asyncio
async def test_register_minio_s3_dataset(client_test):
"""Testing registering metadata for a generic dataset stored on minIO s3"""

# set the path to the generic metadata file on minIO s3
s3_path = {
"path": "data/.hs/dataset_metadata.json",
"bucket": "catalog-api-test",
"endpoint_url": "https://api.minio.cuahsi.io/",
}

dataset_response = await client_test.put(
"api/catalog/repository/s3", json=s3_path
)
assert dataset_response.status_code == 200
ds_metadata = dataset_response.json()
expected_repository_identifier = f"{s3_path['endpoint_url']}{s3_path['bucket']}/{s3_path['path']}"
assert ds_metadata["repository_identifier"] == expected_repository_identifier

# retrieve the record from the db
record_id = ds_metadata.get('_id')
response = await client_test.get(f"api/catalog/dataset/{record_id}")
assert response.status_code == 200


@pytest.mark.parametrize("multiple", [True, False])
@pytest.mark.asyncio
async def test_get_submissions_1(client_test, dataset_data, multiple):
4 changes: 2 additions & 2 deletions triggers/scheduler.py
@@ -71,8 +71,8 @@ async def do_daily():
else:
# couldn't retrieve matching repository record
await db["discovery"].delete_one({"_id": submission.identifier})
except:
logger.exception(f"Failed to collect submission {submission.url}")
except Exception as exp:
logger.exception(f"Failed to collect submission {submission.url}, Error: {str(exp)}")


def main():
4 changes: 2 additions & 2 deletions triggers/update_catalog.py
@@ -23,8 +23,8 @@ async def _main():
while True:
try:
await watch_catalog(db)
except:
logger.exception("Submission Watch Task failed, restarting the task")
except Exception as exp:
logger.exception(f"Submission Watch Task failed. Error:{str(exp)}, restarting the task")
finally:
db.close()

4 changes: 2 additions & 2 deletions triggers/update_typeahead.py
@@ -19,8 +19,8 @@ async def _main():
while True:
try:
await watch_discovery(db)
except:
logger.exception("Discovery Watch Task failed, restarting the task")
except Exception as exp:
logger.exception(f"Discovery Watch Task failed. Error:{str(exp)}, restarting the task")
finally:
db.close()

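
All three trigger loops above replace a bare except with catching Exception and including the error text in the log message. A minimal sketch of that restart-loop shape, assuming placeholder names for the watch coroutine and database factory (the backoff sleep is an assumption, not in the diff):

import asyncio
import logging

logger = logging.getLogger(__name__)

async def run_watch_task(watch, get_db):
    # Restart the watch task whenever it fails, logging the reason each time.
    while True:
        db = get_db()
        try:
            await watch(db)
        except Exception as exp:
            # logger.exception records the traceback; the message adds the error text.
            logger.exception(f"Watch task failed. Error: {str(exp)}, restarting the task")
        finally:
            db.close()
        await asyncio.sleep(1)  # small backoff before restarting (assumption)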