diff --git a/.github/workflows/dependabot.yml b/.github/dependabot.yml
similarity index 100%
rename from .github/workflows/dependabot.yml
rename to .github/dependabot.yml
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index a78d077..250fbab 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -3,7 +3,9 @@ name: Run I-GUIDE Builds and Config Saturations
 on:
   workflow_dispatch:
   push:
-    branches: ['*']
+    branches-ignore:
+      - productionalization
+      - develop
 
 env:
   DOMAIN: iguide.cuahsi.io
diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml
deleted file mode 100644
index c764159..0000000
--- a/.github/workflows/deploy-dev.yaml
+++ /dev/null
@@ -1,96 +0,0 @@
-name: Deploy I-GUIDE to GKE Autopilot (Beta)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'develop'
-
-env:
-  DOMAIN: iguide-dev.cuahsi.io
-  IP: iguide-dev
-  TESTING: false
-  OIDC_ISSUER: https://orcid.org
-  DATABASE_NAME: iguide_dev
-  DB_PROTOCOL: mongodb+srv
-  HYDROSHARE_META_READ_URL: https://www.hydroshare.org/hsapi2/resource/%s/json/
-  HYDROSHARE_FILE_READ_URL: https://www.hydroshare.org/hsapi/resource/%s/files/
-  VITE_APP_NAME: I-GUIDE
-  VITE_APP_URL: https://iguide-dev.cuahsi.io
-  VITE_APP_API_URL: https://iguide-dev.cuahsi.io/api
-  VITE_APP_LOGIN_URL: https://orcid.org/oauth/authorize
-  VITE_APP_SUPPORT_EMAIL: help@example.com
-  VITE_APP_CLIENT_ID: APP-4ZA8C8BYAH3QHNE9
-  SEARCH_RELEVANCE_SCORE_THRESHOLD: 1.4
-
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    steps:
-      - name: code checkout
-        uses: actions/checkout@v2
-
-      - name: Install the gcloud cli
-        uses: google-github-actions/setup-gcloud@v0
-        with:
-          project_id: ${{ secrets.GOOGLE_PROJECT }}
-          service_account_key: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
-          install_components: 'gke-gcloud-auth-plugin'
-          export_default_credentials: true
-
-      - name: Compile the root env file
-        env:
-          DB_HOST: ${{ secrets.DB_HOST_BETA }}
-          DB_USERNAME: ${{ secrets.DB_USERNAME_BETA }}
-          DB_PASSWORD: ${{ secrets.DB_PASSWORD_BETA }}
-        run: |
-          variables=("OIDC_ISSUER" "DB_USERNAME" "DB_PASSWORD" "DB_HOST" "DATABASE_NAME" "DB_PROTOCOL" "TESTING" "VITE_APP_LOGIN_URL" "HYDROSHARE_META_READ_URL" "HYDROSHARE_FILE_READ_URL" "SEARCH_RELEVANCE_SCORE_THRESHOLD")
-
-          # Empty the .env file
-          > .env
-
-          # Loop through the variables and add them to the .env file
-          for var in "${variables[@]}"; do
-            echo "$var=${!var}" >> .env
-          done
-
-      - name: Compile the frontend env file
-        env:
-          VITE_APP_GOOGLE_MAPS_API_KEY: ${{ secrets.VITE_APP_GOOGLE_MAPS_API_KEY }}
-
-        run: |
-          variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY")
-
-          # Empty the .env file
-          > frontend/.env
-
-          # Loop through the variables and add them to the .env file
-          for var in "${variables[@]}"; do
-            echo "$var=${!var}" >> frontend/.env
-          done
-
-      - name: Build and push docker images
-        env:
-          GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }}
-        run: |
-          gcloud auth configure-docker us-central1-docker.pkg.dev
-          docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/api:$GITHUB_SHA -f docker/api/Dockerfile .
-          docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/api:$GITHUB_SHA
-          docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/frontend:$GITHUB_SHA -f docker/frontend/Dockerfile .
-          docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/frontend:$GITHUB_SHA
-          docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/trigger:$GITHUB_SHA -f docker/triggers/Dockerfile .
-          docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/trigger:$GITHUB_SHA
-
-      - name: Deploy to GKE
-        env:
-          USE_GKE_GCLOUD_AUTH_PLUGIN: True
-          GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }}
-        run: |
-          gcloud container clusters get-credentials iguide-dev --region us-central1
-          find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {}
-          find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$GITHUB_SHA/g" {}
-          find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {}
-          kubectl apply -f kubernetes/
-          # Refresh pods
-          kubectl delete pods --all
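Note on the "Compile the * env file" steps (present in both the deleted workflow above and the surviving deploy.yaml below): the bash loop relies on indirect expansion, `${!var}`, to look up each variable's value by name. For readers unfamiliar with that idiom, here is a rough Python equivalent of what the root-env step produces; the variable list and file path come from the workflow, while the function name is purely illustrative:

```python
import os

# Variable names copied from the "Compile the root env file" step.
ROOT_ENV_VARS = [
    "OIDC_ISSUER", "DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DATABASE_NAME",
    "DB_PROTOCOL", "TESTING", "VITE_APP_LOGIN_URL", "HYDROSHARE_META_READ_URL",
    "HYDROSHARE_FILE_READ_URL", "SEARCH_RELEVANCE_SCORE_THRESHOLD",
]

def compile_env_file(names: list[str], path: str = ".env") -> None:
    """Truncate `path` (the bash `> .env`), then append NAME=VALUE lines,
    resolving each value from the process environment as `${!var}` does."""
    with open(path, "w") as env_file:
        for name in names:
            env_file.write(f"{name}={os.environ.get(name, '')}\n")

compile_env_file(ROOT_ENV_VARS)
```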
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 063bf22..4b08833 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -4,28 +4,30 @@ on:
   workflow_dispatch:
   push:
     branches:
+      - 'develop'
       - 'productionalization'
 
 env:
-  DOMAIN: iguide.cuahsi.io
-  TAG: latest
+  DEPLOY_TO_PRODUCTION: ${{ github.ref == 'refs/heads/productionalization' && true || false }}
+  DOMAIN: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev.cuahsi.io' || 'iguide.cuahsi.io' }}
+  IP: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }}
+  KUBE_CLUSTER_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }}
+  TAG: ${{ env.DEPLOY_TO_PRODUCTION == false && github.sha || 'latest' }}
   TESTING: false
-  IP: iguide
   OIDC_ISSUER: https://orcid.org
-  DATABASE_NAME: iguide_beta
+  # why are we using iguide_beta for production deployment? Should it be iguide_demo?
+  DATABASE_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide_dev' || 'iguide_beta' }}
   DB_PROTOCOL: mongodb+srv
   HYDROSHARE_META_READ_URL: https://www.hydroshare.org/hsapi2/resource/%s/json/
   HYDROSHARE_FILE_READ_URL: https://www.hydroshare.org/hsapi/resource/%s/files/
   VITE_APP_NAME: I-GUIDE
-  VITE_APP_URL: https://iguide.cuahsi.io
-  VITE_APP_API_URL: https://iguide.cuahsi.io/api
+  VITE_APP_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io' || 'https://iguide.cuahsi.io' }}
+  VITE_APP_API_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io/api' || 'https://iguide.cuahsi.io/api' }}
   VITE_APP_LOGIN_URL: https://orcid.org/oauth/authorize
-  VITE_APP_GOOGLE_MAPS_API_KEY: ""
   VITE_APP_SUPPORT_EMAIL: help@example.com
   VITE_APP_CLIENT_ID: APP-4ZA8C8BYAH3QHNE9
   SEARCH_RELEVANCE_SCORE_THRESHOLD: 1.4
 
-
 jobs:
   deploy:
     runs-on: ubuntu-latest
@@ -43,9 +45,9 @@ jobs:
 
       - name: Compile the root env file
         env:
-          DB_HOST: ${{ secrets.DB_HOST }}
-          DB_USERNAME: ${{ secrets.DB_USERNAME }}
-          DB_PASSWORD: ${{ secrets.DB_PASSWORD }}
+          DB_HOST: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_HOST_BETA || secrets.DB_HOST }}
+          DB_USERNAME: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_USERNAME_BETA || secrets.DB_USERNAME }}
+          DB_PASSWORD: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_PASSWORD_BETA || secrets.DB_PASSWORD }}
         run: |
           variables=("OIDC_ISSUER" "DB_USERNAME" "DB_PASSWORD" "DB_HOST" "DATABASE_NAME" "DB_PROTOCOL" "TESTING" "VITE_APP_LOGIN_URL" "HYDROSHARE_META_READ_URL" "HYDROSHARE_FILE_READ_URL" "SEARCH_RELEVANCE_SCORE_THRESHOLD")
@@ -58,6 +60,9 @@ jobs:
           done
 
       - name: Compile the frontend env file
+        env:
+          VITE_APP_GOOGLE_MAPS_API_KEY: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.VITE_APP_GOOGLE_MAPS_API_KEY || '' }}
+
         run: |
           variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY")
@@ -86,11 +91,13 @@ jobs:
           USE_GKE_GCLOUD_AUTH_PLUGIN: True
           GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }}
         run: |
-          gcloud container clusters get-credentials iguide --region us-central1
-          find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {}
+          gcloud container clusters get-credentials $KUBE_CLUSTER_NAME --region us-central1
+          find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {}
           find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$TAG/g" {}
           find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {}
-          find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {}
+          if [[ "${{ env.DEPLOY_TO_PRODUCTION }}" == true ]]; then
+            find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {}
+          fi
           kubectl apply -f kubernetes/
           # Refresh pods
          kubectl delete pods --all
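One caveat worth flagging on the consolidated deploy.yaml: as far as I can tell from the GitHub Actions contexts documentation, the `env` context is not available when defining workflow-level `env` values, so the `${{ env.DEPLOY_TO_PRODUCTION == ... }}` expressions for `DOMAIN`, `IP`, `KUBE_CLUSTER_NAME`, `TAG`, and `DATABASE_NAME` may evaluate against an empty value; inlining the `github.ref` comparison (as `DEPLOY_TO_PRODUCTION` itself does) would sidestep that. The step-level uses of `env.DEPLOY_TO_PRODUCTION` are fine. For reference, a minimal Python sketch of the branch-to-environment mapping these expressions are meant to encode (all values taken from the diff; the function name is illustrative):

```python
def deployment_config(github_ref: str) -> dict:
    """Map the pushed branch to the deployment settings used in deploy.yaml."""
    production = github_ref == "refs/heads/productionalization"
    host = "iguide.cuahsi.io" if production else "iguide-dev.cuahsi.io"
    return {
        "DOMAIN": host,
        "IP": "iguide" if production else "iguide-dev",
        "KUBE_CLUSTER_NAME": "iguide" if production else "iguide-dev",
        "TAG": "latest" if production else "<commit sha>",  # github.sha on dev
        "DATABASE_NAME": "iguide_beta" if production else "iguide_dev",
        "VITE_APP_URL": f"https://{host}",
        "VITE_APP_API_URL": f"https://{host}/api",
    }

assert deployment_config("refs/heads/develop")["DOMAIN"] == "iguide-dev.cuahsi.io"
```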
variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY") @@ -86,11 +91,13 @@ jobs: USE_GKE_GCLOUD_AUTH_PLUGIN: True GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }} run: | - gcloud container clusters get-credentials iguide --region us-central1 - find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {} + gcloud container clusters get-credentials $KUBE_CLUSTER_NAME --region us-central1 + find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {} find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$TAG/g" {} find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {} - find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {} + if [[ "${{ env.DEPLOY_TO_PRODUCTION }}" == true ]]; then + find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {} + fi kubectl apply -f kubernetes/ # Refresh pods kubectl delete pods --all diff --git a/Makefile b/Makefile index f13b2c7..e3c130d 100644 --- a/Makefile +++ b/Makefile @@ -33,4 +33,4 @@ test: .PHONY: pre-post pre-post: - docker-compose run catalog-trigger python /app/triggers/management/change_streams_pre_and_post.py + docker-compose run catalog-trigger python /app/api/models/management/change_streams_pre_and_post.py diff --git a/api/adapters/hydroshare.py b/api/adapters/hydroshare.py index 67a0a6d..ecf51d1 100644 --- a/api/adapters/hydroshare.py +++ b/api/adapters/hydroshare.py @@ -8,7 +8,7 @@ from api.exceptions import RepositoryException from api.models import schema from api.models.catalog import DatasetMetadataDOC -from api.models.user import Submission, SubmissionType +from api.models.user import Submission class Creator(BaseModel): diff --git a/api/adapters/s3.py b/api/adapters/s3.py index 5620904..4d4dc0e 100644 --- a/api/adapters/s3.py +++ b/api/adapters/s3.py @@ -1,12 +1,16 @@ -import boto3 import json -from botocore.client import Config +from http import HTTPStatus + +import boto3 from botocore import UNSIGNED +from botocore.client import Config +from botocore.exceptions import ClientError as S3ClientError from api.adapters.base import AbstractRepositoryMetadataAdapter, AbstractRepositoryRequestHandler from api.adapters.utils import RepositoryType, register_adapter +from api.exceptions import RepositoryException from api.models.catalog import DatasetMetadataDOC -from api.models.user import Submission, SubmissionType +from api.models.user import Submission class _S3RequestHandler(AbstractRepositoryRequestHandler): @@ -16,12 +20,25 @@ def get_metadata(self, record_id: str): file_key = record_id.split("+")[2] s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url) - - response = s3.get_object(Bucket=bucket_name, Key=file_key) - json_content = response['Body'].read().decode('utf-8') + try: + response = s3.get_object(Bucket=bucket_name, Key=file_key) + except S3ClientError as ex: + if ex.response["Error"]["Code"] == "NoSuchKey": + raise RepositoryException( + detail=f"Specified metadata file was not found in S3: {bucket_name}/{file_key}", + status_code=HTTPStatus.NOT_FOUND + ) + else: + err_msg = f"Error accessing S3 file({bucket_name}/{file_key}): {str(ex)}" + raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST) + json_content = response['Body'].read().decode('utf-8') # Parse the JSON content - data = json.loads(json_content) + try: + data = json.loads(json_content) + except json.JSONDecodeError as 
diff --git a/api/models/user.py b/api/models/user.py
index 94b3f8b..566dae9 100644
--- a/api/models/user.py
+++ b/api/models/user.py
@@ -30,6 +30,11 @@ def identifier(self):
         identifier = f"{endpoint_url}/{self.bucket}/{self.path}"
         return identifier
 
+    @property
+    def fetch_identifier(self):
+        # This is the identifier that is used to fetch the file from S3
+        return f"{self.endpoint_url}+{self.bucket}+{self.path}"
+
 
 class Submission(Document):
     title: str = None
diff --git a/api/routes/catalog.py b/api/routes/catalog.py
index 43b73b0..7ba1b5c 100644
--- a/api/routes/catalog.py
+++ b/api/routes/catalog.py
@@ -144,14 +144,18 @@ async def refresh_dataset_from_hydroshare(identifier: str, user: Annotated[User, Depends(get_current_user)]):
 
 @router.put("/repository/s3", response_model=DatasetMetadataDOC)
-async def register_s3_dataset(request_model: S3Path, user: Annotated[User, Depends(get_current_user)]):
+async def register_s3_dataset(s3_path: S3Path, user: Annotated[User, Depends(get_current_user)]):
     """User provides the path to the S3 object.
     The metadata is fetched from the s3 object and saved to the catalog."""
-    path = request_model.path
-    bucket = request_model.bucket
-    endpoint_url = request_model.endpoint_url
-    identifier = f"{endpoint_url}+{bucket}+{path}"
+
+    identifier = s3_path.identifier
     submission: Submission = user.submission_by_repository(repo_type=RepositoryType.S3, identifier=identifier)
-    dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=identifier, user=user,
+    if submission is not None:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="This S3 dataset has already been submitted by this user",
+        )
+    fetch_identifier = s3_path.fetch_identifier
+    dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=fetch_identifier, user=user,
                                 submission=submission)
     return dataset
 
@@ -171,7 +175,7 @@ async def create_dataset_s3(
     if submission is not None:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail="Dataset metadata record was not found",
+            detail="This S3 dataset has already been submitted by this user",
         )
     await document.insert()
     submission = document.as_submission()
@@ -221,11 +225,17 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: User,
     adapter = get_adapter_by_type(repository_type=repository_type)
     # fetch metadata from repository as catalog dataset
     repo_dataset: DatasetMetadataDOC = await _get_repo_meta_as_catalog_record(adapter=adapter, identifier=identifier)
+    s3_path = None
+    if repository_type == RepositoryType.S3:
+        s3_endpoint_url, bucket, path = identifier.split("+")
+        s3_path = S3Path(endpoint_url=s3_endpoint_url, bucket=bucket, path=path)
+        identifier = s3_path.identifier
     if submission is None:
         # new registration
         await repo_dataset.insert()
         submission = repo_dataset.as_submission()
         submission = adapter.update_submission(submission=submission, repo_record_id=identifier)
+        submission.s3_path = s3_path
         user.submissions.append(submission)
         await user.save(link_rule=WriteRules.WRITE)
         dataset = repo_dataset
@@ -239,12 +249,14 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: User,
         updated_submission = adapter.update_submission(submission=updated_submission, repo_record_id=identifier)
         updated_submission.id = submission.id
         updated_submission.submitted = submission.submitted
+        updated_submission.s3_path = s3_path
         await updated_submission.replace()
         dataset = updated_dataset
         submission = updated_submission
 
     dataset = inject_repository_identifier(submission, dataset)
     dataset = inject_submission_type(submission, dataset)
+    dataset = inject_submission_s3_path(submission, dataset)
     return dataset
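The S3 registration flow now juggles two identifier forms from `S3Path`: the URL-style `identifier` (stored on the submission and used for duplicate detection) and the "+"-delimited `fetch_identifier` (handed to the S3 adapter). A sketch of the round trip `_save_to_db` performs, using a reduced stand-in for the real model; the `rstrip("/")` normalization is an assumption inferred from the expected `repository_identifier` in the new test:

```python
from dataclasses import dataclass

@dataclass
class S3PathSketch:
    """Stand-in for api.models.user.S3Path, reduced to the two identifier forms."""
    endpoint_url: str
    bucket: str
    path: str

    @property
    def identifier(self) -> str:
        # URL form: stored on the submission, surfaced as repository_identifier
        return f"{self.endpoint_url.rstrip('/')}/{self.bucket}/{self.path}"

    @property
    def fetch_identifier(self) -> str:
        # "+"-delimited form: passed to the S3 adapter to retrieve the file
        return f"{self.endpoint_url}+{self.bucket}+{self.path}"

# _save_to_db receives the fetch form and rebuilds the S3Path from it:
endpoint_url, bucket, path = (
    "https://api.minio.cuahsi.io/+catalog-api-test+data/.hs/dataset_metadata.json"
).split("+")
s3_path = S3PathSketch(endpoint_url=endpoint_url, bucket=bucket, path=path)
assert s3_path.identifier.endswith("/catalog-api-test/data/.hs/dataset_metadata.json")
```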
diff --git a/tests/test_dataset_routes.py b/tests/test_dataset_routes.py
index ef29b89..ad282d6 100644
--- a/tests/test_dataset_routes.py
+++ b/tests/test_dataset_routes.py
@@ -402,6 +402,31 @@ async def test_get_datasets_exclude_none(client_test, dataset_data):
         assert "measurementTechnique" not in a_property
 
 
+@pytest.mark.asyncio
+async def test_register_minio_s3_dataset(client_test):
+    """Testing registering metadata for a generic dataset stored on minIO s3"""
+
+    # set the path to the generic metadata file on minIO s3
+    s3_path = {
+        "path": "data/.hs/dataset_metadata.json",
+        "bucket": "catalog-api-test",
+        "endpoint_url": "https://api.minio.cuahsi.io/",
+    }
+
+    dataset_response = await client_test.put(
+        "api/catalog/repository/s3", json=s3_path
+    )
+    assert dataset_response.status_code == 200
+    ds_metadata = dataset_response.json()
+    expected_repository_identifier = f"{s3_path['endpoint_url']}{s3_path['bucket']}/{s3_path['path']}"
+    assert ds_metadata["repository_identifier"] == expected_repository_identifier
+
+    # retrieve the record from the db
+    record_id = ds_metadata.get('_id')
+    response = await client_test.get(f"api/catalog/dataset/{record_id}")
+    assert response.status_code == 200
+
+
 @pytest.mark.parametrize("multiple", [True, False])
 @pytest.mark.asyncio
 async def test_get_submissions_1(client_test, dataset_data, multiple):
diff --git a/triggers/scheduler.py b/triggers/scheduler.py
index 876da7e..4fe14df 100644
--- a/triggers/scheduler.py
+++ b/triggers/scheduler.py
@@ -71,8 +71,8 @@ async def do_daily():
             else:
                 # couldn't retrieve matching repository record
                 await db["discovery"].delete_one({"_id": submission.identifier})
-        except:
-            logger.exception(f"Failed to collect submission {submission.url}")
+        except Exception as exp:
+            logger.exception(f"Failed to collect submission {submission.url}, Error: {str(exp)}")
 
 
 def main():
diff --git a/triggers/update_catalog.py b/triggers/update_catalog.py
index 135f837..4162efb 100644
--- a/triggers/update_catalog.py
+++ b/triggers/update_catalog.py
@@ -23,8 +23,8 @@ async def _main():
     while True:
         try:
             await watch_catalog(db)
-        except:
-            logger.exception("Submission Watch Task failed, restarting the task")
+        except Exception as exp:
+            logger.exception(f"Submission Watch Task failed. Error: {str(exp)}, restarting the task")
         finally:
             db.close()
diff --git a/triggers/update_typeahead.py b/triggers/update_typeahead.py
index 8f227c4..b11097c 100644
--- a/triggers/update_typeahead.py
+++ b/triggers/update_typeahead.py
@@ -19,8 +19,8 @@ async def _main():
     while True:
         try:
             await watch_discovery(db)
-        except:
-            logger.exception("Discovery Watch Task failed, restarting the task")
+        except Exception as exp:
+            logger.exception(f"Discovery Watch Task failed. Error: {str(exp)}, restarting the task")
         finally:
             db.close()
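A closing note on the trigger changes: replacing bare `except:` with `except Exception` is more than style. A bare `except` also catches `BaseException` subclasses such as `KeyboardInterrupt` and `asyncio.CancelledError` (the latter no longer inherits from `Exception` since Python 3.8), so the old watch loops could swallow task cancellation. A minimal, self-contained illustration of why the narrowed clause lets cancellation propagate (the `watcher` function is a hypothetical stand-in for `watch_catalog`/`watch_discovery`):

```python
import asyncio

async def watcher():
    while True:
        try:
            await asyncio.sleep(3600)  # stand-in for the watch call
        except Exception:
            continue  # transient failure: restart the watch
        # CancelledError is not an Exception, so task.cancel() still works

async def main():
    task = asyncio.create_task(watcher())
    await asyncio.sleep(0.1)
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        print("watcher cancelled cleanly")

asyncio.run(main())
```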