From 1ded45d5703149d7ef05b797d593d09623f5573e Mon Sep 17 00:00:00 2001 From: Binita Date: Wed, 26 Jun 2024 02:23:23 -0500 Subject: [PATCH 01/71] Modified code to see use_constraints errors. --- pyQuARC/code/custom_checker.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyQuARC/code/custom_checker.py b/pyQuARC/code/custom_checker.py index f38cedda..ee79e480 100644 --- a/pyQuARC/code/custom_checker.py +++ b/pyQuARC/code/custom_checker.py @@ -45,7 +45,11 @@ def _get_path_value_recursively( or isinstance(root_content, int) or isinstance(root_content, float) ): - container.append(root_content) + # if there is at least one element in new_path, the value can not be found + if new_path: + container.append(None) + else: + container.append(root_content) return elif isinstance(root_content, list): if not new_path: From 7aff3709e06976f9a5aa192f97798d23d57030a6 Mon Sep 17 00:00:00 2001 From: Binita Date: Wed, 2 Oct 2024 11:25:11 -0500 Subject: [PATCH 02/71] add Z at the end of datetime formats --- pyQuARC/code/constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py index e77c74fc..03f1c25d 100644 --- a/pyQuARC/code/constants.py +++ b/pyQuARC/code/constants.py @@ -79,10 +79,10 @@ CMR_URL = "https://cmr.earthdata.nasa.gov" DATE_FORMATS = [ - "%Y-%m-%dT%H:%M:%S.%f", # Year to microsecond - "%Y-%m-%dT%H:%M:%S", # Year to second - "%Y-%m-%dT%H:%M", # Year to minute - "%Y-%m-%dT%H", # Year to hour + "%Y-%m-%dT%H:%M:%S.%fZ", # Year to microsecond + "%Y-%m-%dT%H:%M:%SZ", # Year to second + "%Y-%m-%dT%H:%MZ", # Year to minute + "%Y-%m-%dT%HZ", # Year to hour "%Y-%m-%d", # Year to day "%Y-%m", # Year to month "%Y", # Year From b75ae2f3b3563878382e202b36c5ae6668434b99 Mon Sep 17 00:00:00 2001 From: Binita Date: Mon, 4 Nov 2024 15:32:27 -0600 Subject: [PATCH 03/71] added new check for opendap for umm-g and echo-g --- pyQuARC/code/custom_validator.py | 50 
+++++++++++++++++++++++++++++ pyQuARC/schemas/check_messages.json | 8 +++++ pyQuARC/schemas/checks.json | 5 +++ pyQuARC/schemas/rule_mapping.json | 28 ++++++++++++++++ 4 files changed, 91 insertions(+) diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py index bf3620d1..1973ca75 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -277,3 +277,53 @@ def count_check(count, values, key): items = [items] num_items = len(items) return {"valid": int(count) == num_items, "value": (count, num_items)} + + @staticmethod + def opendap_link_check(related_urls, key, extra=None): + """ + Checks if the related_urls contains an OPeNDAP link with the type "OPENDAP DATA" or URL containing "opendap". + + Args: + related_urls (list): The related_urls field of the object, expected to be a list of URL objects. + key (dict): A dictionary with "type" and "url_keyword" keys for the checks. + extra (optional): An additional argument to match the expected function call signature. This argument is ignored. + + Returns: + dict: A validation result indicating whether a valid OPeNDAP link is present and the link itself if found. 
+ """ + + # If related_urls is None or not provided, initialize it as an empty list + if not related_urls: + related_urls = [] + + # If related_urls is not a list, assume it's a single URL string and wrap it in a list of one dictionary + elif isinstance(related_urls, str): + related_urls = [{"URL": related_urls, "Type": key.get("type", "OPENDAP DATA")}] + + # Default return object if no valid OPeNDAP link is found + return_obj = { + "valid": False, + "value": "None" + } + + # Extract type and keyword from key for clearer conditions + type_to_check = key.get("type", "OPENDAP DATA").upper() + url_keyword = key.get("url_keyword", "opendap").lower() + + # Process each URL object in the list + for url_obj in related_urls: + # Ensure that url_obj is a dictionary before accessing its fields + if not isinstance(url_obj, dict): + continue + + # Check for "opendap" in the URL + url_value = url_obj.get("URL", "").lower() + type_field = url_obj.get("Type", "").upper() + + # Check if the URL contains "opendap" or if the Type matches "OPENDAP DATA" + if url_keyword in url_value or type_field == type_to_check: + return_obj["valid"] = True + return_obj["value"] = url_obj.get("URL", "None") + break + + return return_obj diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..18bde23f 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -1070,5 +1070,13 @@ "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" }, "remediation": "Recommend providing the horizontal pixel resolution, if applicable. If provided, this information will be indexed in the EDSC 'Horizontal Data Resolution' search facet which allows users to search by spatial resolution." + }, + "opendap_link_check": { + "failure": "No OPeNDAP URL is provided in the granule fields. 
An OPeNDAP link is recommended for data access.", + "help": { + "message": "OPeNDAP links allow for direct data access through the OPeNDAP protocol.", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" + }, + "remediation": "Recommend providing an OPeNDAP link in the granule's Online Resources or Related URLs fields for enhanced data accessibility." } } \ No newline at end of file diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..ef303aa6 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -298,5 +298,10 @@ "data_type": "custom", "check_function": "count_check", "available": true + }, + "opendap_link_check": { + "data_type": "custom", + "check_function": "opendap_link_check", + "available": true } } diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..b6aab68e 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3745,6 +3745,34 @@ "severity": "error", "check_id": "string_compare" }, + "opendap_link_check": { + "rule_name": "OPeNDAP Link Presence Check", + "fields_to_apply": { + "echo-g": [ + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedURLs/URL" + ] + } + ] + }, + "data": [ + { + "type": "OPENDAP DATA", + "url_keyword": "opendap" + } + ], + "relation": "contains", + "check_id": "opendap_link_check", + "severity": "warning" + }, "location_keyword_presence_check": { "rule_name": "Location Keyword Presence Check", "fields_to_apply": { From 50e8082ec5c3f692ebc14ccb3e6dcf4d1670972d Mon Sep 17 00:00:00 2001 From: Binita Date: Tue, 5 Nov 2024 15:15:53 -0600 Subject: [PATCH 04/71] changes on rule_mapping for opendap --- pyQuARC/code/custom_validator.py | 17 +++++++++-------- pyQuARC/schemas/rule_mapping.json | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py
index 1973ca75..6becf37d 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -281,7 +281,7 @@ def count_check(count, values, key): @staticmethod def opendap_link_check(related_urls, key, extra=None): """ - Checks if the related_urls contains an OPeNDAP link with the type "OPENDAP DATA" or URL containing "opendap". + Checks if the related_urls contains an OPeNDAP link by looking for "opendap" in the URL or matching Type/Subtype fields. Args: related_urls (list): The related_urls field of the object, expected to be a list of URL objects. @@ -296,9 +296,9 @@ def opendap_link_check(related_urls, key, extra=None): if not related_urls: related_urls = [] - # If related_urls is not a list, assume it's a single URL string and wrap it in a list of one dictionary + # If related_urls is a string, wrap it in a list as a single URL dictionary without setting Type elif isinstance(related_urls, str): - related_urls = [{"URL": related_urls, "Type": key.get("type", "OPENDAP DATA")}] + related_urls = [{"URL": related_urls}] # Default return object if no valid OPeNDAP link is found return_obj = { @@ -306,9 +306,9 @@ def opendap_link_check(related_urls, key, extra=None): "value": "None" } - # Extract type and keyword from key for clearer conditions - type_to_check = key.get("type", "OPENDAP DATA").upper() + # Extract URL keyword and type to check from key url_keyword = key.get("url_keyword", "opendap").lower() + type_to_check = key.get("type", "OPENDAP DATA").upper() # Process each URL object in the list for url_obj in related_urls: @@ -316,12 +316,13 @@ def opendap_link_check(related_urls, key, extra=None): if not isinstance(url_obj, dict): continue - # Check for "opendap" in the URL + # Retrieve URL, Type, and Subtype fields from each URL object url_value = url_obj.get("URL", "").lower() type_field = url_obj.get("Type", "").upper() + subtype_field = url_obj.get("Subtype", "").upper() - # Check if the URL contains "opendap" or if the Type matches 
"OPENDAP DATA" - if url_keyword in url_value or type_field == type_to_check: + # Check if any of the conditions is met: URL contains "opendap", Type is "OPENDAP DATA", or Subtype contains "OPENDAP DATA" + if url_keyword in url_value or type_to_check == type_field or type_to_check in subtype_field: return_obj["valid"] = True return_obj["value"] = url_obj.get("URL", "None") break diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index b6aab68e..054df11e 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3751,14 +3751,14 @@ "echo-g": [ { "fields": [ - "Granule/OnlineResources/OnlineResource/URL" + "Granule/OnlineResources/OnlineResource" ] } ], "umm-g": [ { "fields": [ - "RelatedURLs/URL" + "RelatedUrls" ] } ] From 693965c023c0739b53cd6ae630b3228bcae7cfb6 Mon Sep 17 00:00:00 2001 From: Binita Date: Wed, 6 Nov 2024 09:38:58 -0600 Subject: [PATCH 05/71] make sure it works on dict or orderdict --- pyQuARC/code/custom_validator.py | 40 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py index 6becf37d..ab789d3f 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -2,6 +2,7 @@ from .string_validator import StringValidator from .utils import cmr_request, if_arg, set_cmr_prms +from collections.abc import Mapping class CustomValidator(BaseValidator): @@ -281,10 +282,13 @@ def count_check(count, values, key): @staticmethod def opendap_link_check(related_urls, key, extra=None): """ - Checks if the related_urls contains an OPeNDAP link by looking for "opendap" in the URL or matching Type/Subtype fields. + Checks if the related_urls contains an OPeNDAP link by looking for "opendap" in the URL + or matching Type/Subtype fields. This function works with both OrderedDict and regular dict, + as well as a list of dictionaries. 
Args: - related_urls (list): The related_urls field of the object, expected to be a list of URL objects. + related_urls (list or Mapping): The related_urls field of the object, expected to be a list of URL objects + or a single OrderedDict. key (dict): A dictionary with "type" and "url_keyword" keys for the checks. extra (optional): An additional argument to match the expected function call signature. This argument is ignored. @@ -296,15 +300,12 @@ def opendap_link_check(related_urls, key, extra=None): if not related_urls: related_urls = [] - # If related_urls is a string, wrap it in a list as a single URL dictionary without setting Type - elif isinstance(related_urls, str): - related_urls = [{"URL": related_urls}] + # If related_urls is a single Mapping (like OrderedDict), wrap it in a list + elif isinstance(related_urls, Mapping): + related_urls = [related_urls] # Default return object if no valid OPeNDAP link is found - return_obj = { - "valid": False, - "value": "None" - } + return_obj = {"valid": False, "value": "None"} # Extract URL keyword and type to check from key url_keyword = key.get("url_keyword", "opendap").lower() @@ -312,19 +313,28 @@ def opendap_link_check(related_urls, key, extra=None): # Process each URL object in the list for url_obj in related_urls: - # Ensure that url_obj is a dictionary before accessing its fields - if not isinstance(url_obj, dict): + # Ensure that url_obj is a dictionary-like object before processing + if not isinstance(url_obj, Mapping): continue - # Retrieve URL, Type, and Subtype fields from each URL object + # Retrieve the URL field url_value = url_obj.get("URL", "").lower() + + # Check if the URL contains "opendap" + if "opendap" in url_value: + return_obj["valid"] = True + return_obj["value"] = url_value + break + + # Retrieve and normalize Type and Subtype fields type_field = url_obj.get("Type", "").upper() subtype_field = url_obj.get("Subtype", "").upper() - # Check if any of the conditions is met: URL contains 
"opendap", Type is "OPENDAP DATA", or Subtype contains "OPENDAP DATA" - if url_keyword in url_value or type_to_check == type_field or type_to_check in subtype_field: + # Check if the Type or Subtype contains "OPENDAP DATA" + if type_to_check in type_field or type_to_check in subtype_field: return_obj["valid"] = True - return_obj["value"] = url_obj.get("URL", "None") + return_obj["value"] = url_value if url_value else "None" break return return_obj + From 4cd239e51ab86ee1ca4e52f0f9eadfd1fda34802 Mon Sep 17 00:00:00 2001 From: Binita Date: Thu, 5 Dec 2024 21:13:58 -0600 Subject: [PATCH 06/71] added cmr validation check --- pyQuARC/code/constants.py | 6 ++++++ pyQuARC/main.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py index e77c74fc..dd220e56 100644 --- a/pyQuARC/code/constants.py +++ b/pyQuARC/code/constants.py @@ -87,3 +87,9 @@ "%Y-%m", # Year to month "%Y", # Year ] + +MAPPING_CMR = { + "umm-c": "vnd.nasa.cmr.umm+json", + "echo-c": "echo10+xml", + "dif10": "dif10+xml" +} diff --git a/pyQuARC/main.py b/pyQuARC/main.py index c0890e31..42d6a792 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -8,7 +8,7 @@ if __name__ == "__main__": from code.checker import Checker - from code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS + from code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS, MAPPING_CMR from code.downloader import Downloader from code.utils import get_cmr_url, is_valid_cmr_url from code.utils import get_headers @@ -134,6 +134,38 @@ def _cmr_query(self): return concept_ids + def _validate_with_cmr(self, metadata_content): + """ + Validates metadata using the CMR API. + + Args: + metadata_content (str): The metadata content to validate. + + Returns: + dict: Results of the CMR API validation. 
+ """ + provider_id = self.concept_ids[0].split("-")[1] + cmr_url = f"{self.cmr_host}/ingest/providers/{provider_id}/validate/collection/" + headers = { + "Content-Type": f"application/{MAPPING_CMR[self.metadata_format]}", + "Accept": "application/json", + } + response = requests.post(cmr_url, data=metadata_content, headers=headers) + return response + + def add_cmr_response(self, response): + # as it returns status code 200 with a list of any warnings on successful validation + # refer here for details: https://github.com/NASA-IMPACT/pyQuARC/issues/269 + if response.status_code == 200: + cmr_warnings = response.json()['warnings'] + self.errors[-1]["cmr_warnings"] = cmr_warnings + + # in the issue, 400 status code was mentioned but i think 422 refers to invalid data + # refer here for details: https://github.com/NASA-IMPACT/pyQuARC/issues/269 + if response.status_code == 422: + cmr_errors = response.json()['errors'] + self.errors[-1]["cmr_errors"] = cmr_errors + def validate(self): """ Validates the metadata contents of all the `concept_ids` and returns the errors @@ -163,6 +195,7 @@ def validate(self): ) continue content = content.encode() + response = self._validate_with_cmr(content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -171,11 +204,12 @@ def validate(self): "pyquarc_errors": pyquarc_errors, } ) + self.add_cmr_response(response) elif self.file_path: with open(os.path.abspath(self.file_path), "r") as myfile: content = myfile.read().encode() - + response = self._validate_with_cmr(content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -184,6 +218,7 @@ def validate(self): "pyquarc_errors": pyquarc_errors, } ) + self.add_cmr_response(response) return self.errors @staticmethod From 68193520585fbf59c8a18cd3b2596b0a4e4c2197 Mon Sep 17 00:00:00 2001 From: Emma Koontz Date: Tue, 10 Dec 2024 11:24:28 -0600 Subject: [PATCH 07/71] new_contributing_file --- contributing.md | 2 ++ 1 file changed, 2 
insertions(+) create mode 100644 contributing.md diff --git a/contributing.md b/contributing.md new file mode 100644 index 00000000..8ba1f31c --- /dev/null +++ b/contributing.md @@ -0,0 +1,2 @@ +# Contributing + From 27515f7d9baf4812a2fc2cf5109591c521bff8b4 Mon Sep 17 00:00:00 2001 From: em-koontz Date: Wed, 19 Mar 2025 11:33:42 -0500 Subject: [PATCH 08/71] Update contributing.md --- contributing.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/contributing.md b/contributing.md index 8ba1f31c..f3806127 100644 --- a/contributing.md +++ b/contributing.md @@ -1,2 +1,54 @@ -# Contributing +# Contributing File + +# Welcome to pyQuARC! +This page is meant to help you learn how you can contribute to pyQuARC Content! We are passionate about NASA's Open Science initiative and we want your help. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself! +## Getting Started +#### What is pyQuARC? +PyQuARC is a tool that can helpp streamline the process of metadata assessment by automating it as much as possible. It completes basic validation checks, flags areas that need contextual metadata, and confirms information associated with both the collection level metadata and granuale level metadata are consistent. Read more about pyQuARC in the project's [**Read Me File**](https://github.com/NASA-IMPACT/pyQuARC?tab=readme-ov-file#pyquarc). + +## How you can contributute to pyQuARC: +We will start realativly simple, and progress to the more difficult ways to contribute. + +1. **Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. + * To start you will need to proceed to the **Issues** tab within the [pyQuARC Github Page](https://github.com/NASA-IMPACT/pyQuARC/issues). 
+ * From here, look for the green button on the right side of the page labeled **New issue**. + * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. + * The page you are directed to will provide a prompt to add a title and explains how to fill in the bug you are wanting to report. + * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. + * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers and will help us to make pyQuARC more user friendly! + * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. + * Thank you for submitting a bug report to our team! We know that your report will help to better pyQuARC for everyone. + +2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. + * Suggesting a new feature is very similar to reporting a bug. You will start at the **Issues** tab within the [pyQuARC Github Page](https://github.com/NASA-IMPACT/pyQuARC/issues). + * Select the green **New Issue** button found on the top right side of the page. + * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. + * The page you are directed to will provide a prompt to add a title and explains how to make a new suggestion. + * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. 
+ * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers and will help us to make pyQuARC more user friendly! + * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. + * Thank you for suggesting a new feature or enhancment to our team! Your suggestiong will help us to better pyQuARC for everyone. + +3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks, or play around and see how things work. + * To edit the code you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibilty settings with the original repository, but allows you to create your own edits freely without changing the original repository. Read more about Forks [Here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) + * To create your own fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. + * On the top right of the page select the **Fork** tab. + * Under the "Owner" dropdowm menu select yourself as the owner of the new forked repository. + * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. + * You can set an optional description in the 'Description' field below. + * Make sure the checkbox next to 'Copy the master branch only' is checked. + * Click **Create fork** when you are finished to create your fork! + * After completing the steps above, you should be in a new page titled the same as your new fork. You have successfully created a fork of pyQuARC! You shpuld be able to see all the pyQuARC files and the Read Me. 
+ * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. + * Under the **HTTPS** tab, copy the link to the repository. + * Open a python terminal in your preferred coding location. + * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. + * Type '__git clone__' and then paste the URL you copied a few steps above. + * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer and make changes as you wish. + +## Thank you for your interest in pyQuARC! +We appriciate your interest in our metadata review project! In the spirit of Open Science, everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestiong, and new ideas! + + + From b112896d7838af947c1db6e8e782695f6d159fbe Mon Sep 17 00:00:00 2001 From: em-koontz Date: Wed, 9 Apr 2025 09:31:37 -0500 Subject: [PATCH 09/71] Update contributing.md --- contributing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contributing.md b/contributing.md index f3806127..0a8d5f52 100644 --- a/contributing.md +++ b/contributing.md @@ -47,7 +47,7 @@ We will start realativly simple, and progress to the more difficult ways to cont * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer and make changes as you wish. ## Thank you for your interest in pyQuARC! -We appriciate your interest in our metadata review project! In the spirit of Open Science, everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestiong, and new ideas! +We appriciate your interest in our metadata review project! In the spirit of Open Science, everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestions, and new ideas! 
From 14108beede11b2467f19ca98c68add5e9bf75142 Mon Sep 17 00:00:00 2001 From: Sheyenne Kirkland Date: Tue, 22 Apr 2025 14:50:33 -0500 Subject: [PATCH 10/71] content updates --- contributing.md | 63 ++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/contributing.md b/contributing.md index 0a8d5f52..71f52ea3 100644 --- a/contributing.md +++ b/contributing.md @@ -1,53 +1,58 @@ # Contributing File # Welcome to pyQuARC! -This page is meant to help you learn how you can contribute to pyQuARC Content! We are passionate about NASA's Open Science initiative and we want your help. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself! -## Getting Started -#### What is pyQuARC? -PyQuARC is a tool that can helpp streamline the process of metadata assessment by automating it as much as possible. It completes basic validation checks, flags areas that need contextual metadata, and confirms information associated with both the collection level metadata and granuale level metadata are consistent. Read more about pyQuARC in the project's [**Read Me File**](https://github.com/NASA-IMPACT/pyQuARC?tab=readme-ov-file#pyquarc). +This page is meant to help you learn how you can contribute to pyQuARC! We are passionate about NASA's Open Science initiative and are open to a variety of contributions. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself. ## How you can contributute to pyQuARC: -We will start realativly simple, and progress to the more difficult ways to contribute. 1. **Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. 
- * To start you will need to proceed to the **Issues** tab within the [pyQuARC Github Page](https://github.com/NASA-IMPACT/pyQuARC/issues). + * To start you will need to proceed to the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. * From here, look for the green button on the right side of the page labeled **New issue**. * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. * The page you are directed to will provide a prompt to add a title and explains how to fill in the bug you are wanting to report. * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. - * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers and will help us to make pyQuARC more user friendly! + * Beneath the description box, select "Issue Type" and "Bug". + * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers, and pyQuARC developers will automatically be assigned to the Issue and notified. * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. - * Thank you for submitting a bug report to our team! We know that your report will help to better pyQuARC for everyone. 2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. - * Suggesting a new feature is very similar to reporting a bug. 
You will start at the **Issues** tab within the [pyQuARC Github Page](https://github.com/NASA-IMPACT/pyQuARC/issues). + * Suggesting a new feature is very similar to reporting a bug. You will start at the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. * Select the green **New Issue** button found on the top right side of the page. * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. * The page you are directed to will provide a prompt to add a title and explains how to make a new suggestion. * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. - * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers and will help us to make pyQuARC more user friendly! + * Beneath the description box, select "Issue Type" and "Feature". + * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers. * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. - * Thank you for suggesting a new feature or enhancment to our team! Your suggestiong will help us to better pyQuARC for everyone. - -3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks, or play around and see how things work. - * To edit the code you will need to first create your own 'fork' of the repository. 
A fork is a new repository that shares code and visibilty settings with the original repository, but allows you to create your own edits freely without changing the original repository. Read more about Forks [Here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) - * To create your own fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. - * On the top right of the page select the **Fork** tab. - * Under the "Owner" dropdowm menu select yourself as the owner of the new forked repository. - * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. - * You can set an optional description in the 'Description' field below. - * Make sure the checkbox next to 'Copy the master branch only' is checked. - * Click **Create fork** when you are finished to create your fork! - * After completing the steps above, you should be in a new page titled the same as your new fork. You have successfully created a fork of pyQuARC! You shpuld be able to see all the pyQuARC files and the Read Me. - * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. - * Under the **HTTPS** tab, copy the link to the repository. - * Open a python terminal in your preferred coding location. - * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. - * Type '__git clone__' and then paste the URL you copied a few steps above. - * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer and make changes as you wish. + +3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks or new features. 
+ * Fork the respository + * To edit the code you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibilty settings with the original repository, and allows you to create your own edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) + * To create your own fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. + * On the top right of the page select the **Fork** tab. + * Under the "Owner" dropdown menu select yourself as the owner of the new forked repository. + * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. + * You can set an optional description in the 'Description' field below. + * Make sure the checkbox next to 'Copy the master branch only' is selected. + * Click **Create fork** when you are finished to create your fork! + * After completing the steps above, you should be in a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. You have successfully created a fork of pyQuARC! + * Clone your fork locally + * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. + * Under the **HTTPS** tab, copy the link to the repository. + * Open a python terminal in your preferred coding location. + * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. + * Type '__git clone__' and then paste the URL you copied a few steps above. + * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer. + * Create a new branch and make your desired changes. + * Create a PR + * Once your changes are made, push your commits. 
+ * You can then open a Pull Request (PR) on the [**Pull requests** tab](https://github.com/NASA-IMPACT/pyQuARC/pulls) within the pyQuARC Github page. + * Set the base respository to "NASA-IMPACT/pyQuARC" and the base to "dev". + * Fill out a title and description, then submit! + * Feedback may be provided on your PR. Once it is approved, a pyQuARC team member will merge your changes. ## Thank you for your interest in pyQuARC! -We appriciate your interest in our metadata review project! In the spirit of Open Science, everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestions, and new ideas! +We appreciate your interest in pyQuARC! Everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestions, and new ideas! From 3fd903bea3c71429757beaf13a325e8777e062d4 Mon Sep 17 00:00:00 2001 From: Sheyenne Kirkland Date: Tue, 22 Apr 2025 14:55:09 -0500 Subject: [PATCH 11/71] minor update --- contributing.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contributing.md b/contributing.md index 71f52ea3..01f3664f 100644 --- a/contributing.md +++ b/contributing.md @@ -27,7 +27,7 @@ This page is meant to help you learn how you can contribute to pyQuARC! We are p 3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks or new features. * Fork the respository - * To edit the code you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibilty settings with the original repository, and allows you to create your own edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) + * To edit the code you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibilty settings with the original repository, and allows you to create your own edits. 
Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). * To create your own fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. * On the top right of the page select the **Fork** tab. * Under the "Owner" dropdown menu select yourself as the owner of the new forked repository. @@ -39,7 +39,7 @@ This page is meant to help you learn how you can contribute to pyQuARC! We are p * Clone your fork locally * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. * Under the **HTTPS** tab, copy the link to the repository. - * Open a python terminal in your preferred coding location. + * Open a Python terminal in your preferred coding location. * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. * Type '__git clone__' and then paste the URL you copied a few steps above. * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer. 
From f6c181fb03cd553019be50a5cfd9c62f3e0caf60 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Tue, 29 Apr 2025 13:03:55 -0500 Subject: [PATCH 12/71] Add support for granule, only run cmr validation for concept ids, show validation errors in terminal --- pyQuARC/code/constants.py | 13 ++++--- pyQuARC/code/utils.py | 8 +++++ pyQuARC/main.py | 75 ++++++++++++++++++++++++++------------- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py index dd220e56..0a3cb0d5 100644 --- a/pyQuARC/code/constants.py +++ b/pyQuARC/code/constants.py @@ -73,7 +73,8 @@ GCMD_BASIC_URL = "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/" GCMD_LINKS = { - keyword: f"{GCMD_BASIC_URL}{keyword}?format=csv" for keyword in GCMD_KEYWORDS + keyword: f"{GCMD_BASIC_URL}{keyword}?format=csv" + for keyword in GCMD_KEYWORDS } CMR_URL = "https://cmr.earthdata.nasa.gov" @@ -88,8 +89,10 @@ "%Y", # Year ] -MAPPING_CMR = { - "umm-c": "vnd.nasa.cmr.umm+json", - "echo-c": "echo10+xml", - "dif10": "dif10+xml" +CONTENT_TYPE_MAP = { + UMM_C: "vnd.nasa.cmr.umm+json", + UMM_G: "vnd.nasa.cmr.umm+json", + ECHO10_C: "echo10+xml", + ECHO10_G: "echo10+xml", + DIF: "dif10+xml" } diff --git a/pyQuARC/code/utils.py b/pyQuARC/code/utils.py index 1fe82270..bf6d9f41 100644 --- a/pyQuARC/code/utils.py +++ b/pyQuARC/code/utils.py @@ -82,3 +82,11 @@ def get_date_time(dt_str): except ValueError: continue return None + + +def get_concept_type(concept_id): + """ + Extract the concept type from a given concept ID. + This is useful for determining the type of concept (e.g., 'collection', 'granule') from its ID. 
+ """ + return concept_id.startswith("C") and "collection" or "granule" diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 42d6a792..5fadd980 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -8,16 +8,21 @@ if __name__ == "__main__": from code.checker import Checker - from code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS, MAPPING_CMR + from code.constants import ( + COLOR, + ECHO10_C, + SUPPORTED_FORMATS, + CONTENT_TYPE_MAP, + ) from code.downloader import Downloader from code.utils import get_cmr_url, is_valid_cmr_url - from code.utils import get_headers + from code.utils import get_concept_type, get_headers else: from .code.checker import Checker from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS from .code.downloader import Downloader from .code.utils import get_cmr_url, is_valid_cmr_url - from .code.utils import get_headers + from .code.utils import get_concept_type, get_headers ABS_PATH = os.path.abspath(os.path.dirname(__file__)) END = COLOR["reset"] @@ -134,7 +139,7 @@ def _cmr_query(self): return concept_ids - def _validate_with_cmr(self, metadata_content): + def _validate_with_cmr(self, concept_id, metadata_content): """ Validates metadata using the CMR API. @@ -144,28 +149,22 @@ def _validate_with_cmr(self, metadata_content): Returns: dict: Results of the CMR API validation. 
""" - provider_id = self.concept_ids[0].split("-")[1] - cmr_url = f"{self.cmr_host}/ingest/providers/{provider_id}/validate/collection/" + provider_id = concept_id.split("-")[1] + # native-id is only available in umm-json (sometimes not even) format and it seems like validation works without the actual native-id value, so just leaving in the url + cmr_url = ( + f"{self.cmr_host}/ingest/providers/{provider_id}/validate/" + f"{get_concept_type(concept_id)}/" + ) headers = { - "Content-Type": f"application/{MAPPING_CMR[self.metadata_format]}", + "Content-Type": ( + f"application/{CONTENT_TYPE_MAP[self.metadata_format]}" + ), "Accept": "application/json", + "Cmr-Validate-Keywords": "true", } response = requests.post(cmr_url, data=metadata_content, headers=headers) return response - def add_cmr_response(self, response): - # as it returns status code 200 with a list of any warnings on successful validation - # refer here for details: https://github.com/NASA-IMPACT/pyQuARC/issues/269 - if response.status_code == 200: - cmr_warnings = response.json()['warnings'] - self.errors[-1]["cmr_warnings"] = cmr_warnings - - # in the issue, 400 status code was mentioned but i think 422 refers to invalid data - # refer here for details: https://github.com/NASA-IMPACT/pyQuARC/issues/269 - if response.status_code == 422: - cmr_errors = response.json()['errors'] - self.errors[-1]["cmr_errors"] = cmr_errors - def validate(self): """ Validates the metadata contents of all the `concept_ids` and returns the errors @@ -195,21 +194,24 @@ def validate(self): ) continue content = content.encode() - response = self._validate_with_cmr(content) + cmr_response = self._validate_with_cmr(concept_id, content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { "concept_id": concept_id, "errors": validation_errors, + "cmr_validation": { + "errors": cmr_response.json().get("errors", []), + # TODO: show warnings + "warnings": cmr_response.json().get("warnings", []) + }, 
"pyquarc_errors": pyquarc_errors, } ) - self.add_cmr_response(response) elif self.file_path: with open(os.path.abspath(self.file_path), "r") as myfile: content = myfile.read().encode() - response = self._validate_with_cmr(content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -218,7 +220,6 @@ def validate(self): "pyquarc_errors": pyquarc_errors, } ) - self.add_cmr_response(response) return self.errors @staticmethod @@ -234,6 +235,24 @@ def _error_message(messages): result_string += f"\t\t{colored_message}{END}\n" return result_string + @staticmethod + def _format_cmr_error(cmr_validation): + if not cmr_validation.get("errors"): + return None + error_msg_dict = {} + error_msg = "" + if errors := cmr_validation.get("errors"): + for error in errors: + print(error) + if error["path"][0] not in error_msg_dict: + error_msg_dict[error["path"][0]] = [] + error_msg_dict[error["path"][0]].append(error['errors']) + for path, errors in error_msg_dict.items(): + error_msg += f"\n\t>> {path}: {END}\n" + for error in errors: + error_msg += f"\t\t{COLOR['error']}Error:{END} {str(error)}\n" + return error_msg + def display_results(self): result_string = """ ******************************** @@ -268,6 +287,14 @@ def display_results(self): for error in pyquarc_errors: error_prompt += f"\t\t ERROR: {error['message']}. 
Details: {error['details']} \n" + if cmr_validation := error.get("cmr_validation"): + cmr_error_msg = self._format_cmr_error(cmr_validation) + if cmr_error_msg: + error_prompt += ( + f"\n\t {COLOR['title']}{COLOR['bright']} CMR VALIDATION ERRORS: {END}\n" + ) + error_prompt += cmr_error_msg + result_string += error_prompt print(result_string) From 0e7732bb142555d18e6de1867eb90f73d0cff549 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Tue, 29 Apr 2025 13:32:40 -0500 Subject: [PATCH 13/71] Fix display of non-path errors from cmr, fix pyquarc error format --- pyQuARC/main.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 5fadd980..6995b50c 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -243,10 +243,12 @@ def _format_cmr_error(cmr_validation): error_msg = "" if errors := cmr_validation.get("errors"): for error in errors: - print(error) - if error["path"][0] not in error_msg_dict: - error_msg_dict[error["path"][0]] = [] - error_msg_dict[error["path"][0]].append(error['errors']) + if type(error) is dict and error.get("path"): + if error["path"][0] not in error_msg_dict: + error_msg_dict[error["path"][0]] = [] + error_msg_dict[error["path"][0]].append(error['errors']) + else: + error_msg_dict["Misc"] = [error] for path, errors in error_msg_dict.items(): error_msg += f"\n\t>> {path}: {END}\n" for error in errors: @@ -285,7 +287,7 @@ def display_results(self): f"\n\t {COLOR['title']}{COLOR['bright']} pyQuARC ERRORS: {END}\n" ) for error in pyquarc_errors: - error_prompt += f"\t\t ERROR: {error['message']}. Details: {error['details']} \n" + error_prompt += f"\t\t ERROR: {error['type']}. 
Details: {error['details']} \n" if cmr_validation := error.get("cmr_validation"): cmr_error_msg = self._format_cmr_error(cmr_validation) From 83f446aecb106512177036f8a59662d3a76193b6 Mon Sep 17 00:00:00 2001 From: Sheyenne Kirkland Date: Tue, 29 Apr 2025 15:13:07 -0500 Subject: [PATCH 14/71] umm 1.18.4 update --- pyQuARC/schemas/umm-c-json-schema.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pyQuARC/schemas/umm-c-json-schema.json b/pyQuARC/schemas/umm-c-json-schema.json index fd193169..f8103804 100644 --- a/pyQuARC/schemas/umm-c-json-schema.json +++ b/pyQuARC/schemas/umm-c-json-schema.json @@ -1415,9 +1415,9 @@ "enum": ["NEAR_REAL_TIME", "LOW_LATENCY", "EXPEDITED", "SCIENCE_QUALITY", "OTHER"] }, "CollectionProgressEnum": { - "description": "This element describes the production status of the data set. There are five choices for Data Providers: PLANNED refers to data sets to be collected in the future and are thus unavailable at the present time. For Example: The Hydro spacecraft has not been launched, but information on planned data sets may be available. ACTIVE refers to data sets currently in production or data that is continuously being collected or updated. For Example: data from the AIRS instrument on Aqua is being collected continuously. COMPLETE refers to data sets in which no updates or further data collection will be made. For Example: Nimbus-7 SMMR data collection has been completed. DEPRECATED refers to data sets that have been retired, but still can be retrieved. Usually newer products exist that replace the retired data set. NOT APPLICABLE refers to data sets in which a collection progress is not applicable such as a calibration collection. There is a sixth value of NOT PROVIDED that should not be used by a data provider. 
It is currently being used as a value when a correct translation cannot be done with the current valid values, or when the value is not provided by the data provider.", + "description": "This element describes the production status of the data set. There are multiple choices for Data Providers: PLANNED refers to data sets to be collected in the future and are thus unavailable at the present time. For Example: The Hydro spacecraft has not been launched, but information on planned data sets may be available. ACTIVE refers to data sets currently in production or data that is continuously being collected or updated. For Example: data from the AIRS instrument on Aqua is being collected continuously. COMPLETE refers to data sets in which no updates or further data collection will be made. For Example: Nimbus-7 SMMR data collection has been completed. DEPRECATED refers to data sets that have been retired, but still can be retrieved. Usually newer products exist that replace the retired data set. NOT PROVIDED should not be used by a data provider. It is currently being used as a value when a correct translation cannot be done with the current valid values, or when the value is not provided by the data provider. PREPRINT: Refers to datasets which are made available prior to completion of validation and review processes to support manuscript publication processes and open science.\nPreprint datasets are provisional and should not be used for production applications. INREVIEW: Refers to datasets which are made available to support science team final review. In Review datasets are provisional and should not be used for production applications. Note that if restricted access is needed, an INREVIEW dataset may also have an Access Control List applied. 
SUPERSEDED: Refers to datasets which remain publicly available, but for which a newer version is available.", "type": "string", - "enum": ["ACTIVE", "PLANNED", "COMPLETE", "DEPRECATED", "NOT APPLICABLE", "NOT PROVIDED"] + "enum": ["ACTIVE", "PLANNED", "COMPLETE", "DEPRECATED", "NOT PROVIDED", "PREPRINT", "INREVIEW", "SUPERSEDED"] }, "LocationKeywordType": { "description": "This element defines a hierarchical location list. It replaces SpatialKeywords. The controlled vocabulary for location keywords is maintained in the Keyword Management System (KMS). Each tier must have data in the tier above it.", @@ -1468,7 +1468,7 @@ "enum": ["KB", "MB", "GB", "TB", "PB", "NA"] }, "DistributionMediaType": { - "description": "This element defines the media by which the end user can obtain the distributable item. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "string", "minLength": 1, "maxLength": 80 @@ -1593,7 +1593,7 @@ "$ref": "#/definitions/ArchiveDistributionFormatDescriptionType" }, "Media": { - "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. 
Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "array", "items": { "$ref": "#/definitions/DistributionMediaType" @@ -1656,7 +1656,7 @@ "$ref": "#/definitions/ArchiveDistributionFormatDescriptionType" }, "Media": { - "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include: CD-ROM, 9 track tape, diskettes, hard drives, online, transparencies, hardcopy, etc.", + "description": "This element defines the media by which the end user can obtain the distributable item. Each media type is listed separately. Examples of media include HTTPS, Earthdata Cloud, etc.", "type": "array", "items": { "$ref": "#/definitions/DistributionMediaType" @@ -1811,7 +1811,7 @@ "Type": { "description": "This element describes to what DOI is associated.", "type": "string", - "enum": ["Child Dataset", "Collaborative/Other Agency", "Field Campaign", "Parent Dataset", "Related Dataset"] + "enum": ["Child Dataset", "Collaborative/Other Agency", "Field Campaign", "Parent Dataset", "Related Dataset", "IsPreviousVersionOf", "IsNewVersionOf", "IsDescribedBy"] } }, "required": ["DOI"] @@ -1905,7 +1905,7 @@ "URL": { "description": "This element represents the URL where the schema lives. 
The schema can be downloaded.", "type": "string", - "enum": ["https://cdn.earthdata.nasa.gov/umm/collection/v1.18.1"] + "enum": ["https://cdn.earthdata.nasa.gov/umm/collection/v1.18.4"] }, "Name": { "description": "This element represents the name of the schema.", @@ -1915,7 +1915,7 @@ "Version": { "description": "This element represents the version of the schema.", "type": "string", - "enum": ["1.18.1"] + "enum": ["1.18.4"] } }, "required": ["URL", "Name", "Version"] From d1bf5e0d738ac05bcf21880aec447f960fbd0bcb Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 19 May 2025 14:22:15 -0500 Subject: [PATCH 15/71] Earthdata email update #321 --- pyQuARC/schemas/check_messages_override.json | 108 ++++++++++++++++++- pyQuARC/schemas/ruleset.json | 8 +- pyQuARC/schemas/version.txt | 2 +- pytest.ini | 5 + tests/test_downloader.py | 4 +- 5 files changed, 119 insertions(+), 8 deletions(-) create mode 100644 pytest.ini diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..1d992b39 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,107 @@ -{} +{ + "science_keywords_gcmd_check": { + "failure": "`{}` is not a valid GCMD science keyword.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords" + }, + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." + }, + "location_gcmd_check": { + "failure": "`{}` is not a valid GCMD location keyword.", + "help": { + "message": "", + "url": "" + }, + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." + }, + "organization_short_name_gcmd_check": { + "failure": "The provided short name `{}` does not comply with GCMD. 
", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" + }, + "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." + }, + "organization_long_name_gcmd_check": { + "failure": "The provided data center long name `{}` does not comply with the GCMD. ", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" + }, + "remediation": "Provide a valid long name name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." + }, + "instrument_short_name_gcmd_check": { + "failure": "The provided instrument short name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." + }, + "instrument_long_name_gcmd_check": { + "failure": "The provided instrument long name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." 
+ }, + "platform_short_name_gcmd_check": { + "failure": "The provided platform short name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." + }, + "data_format_gcmd_check": { + "failure": "The provided data format `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/DataFormat/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid data format, or submit a request to earthdata-support@nasa.gov to have this data format added to the GCMD Data Format keyword list." + }, + "platform_long_name_gcmd_check": { + "failure": "The provided platform long name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." + }, + "spatial_keyword_gcmd_check": { + "failure": "The provided location/spatial keyword `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/locations/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid location keyword, or submit a request to earthdata-support@nasa.gov to have this value added to the GCMD Locations keyword list." 
+ }, + "platform_type_gcmd_check": { + "failure": "The provided platform type `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid platform type, or submit a request to earthdata-support@nasa.gov to have this platform type added to the GCMD Platforms keyword list." + }, + "campaign_short_name_gcmd_check": { + "failure": "The provided project/campaign short name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + }, + "campaign_long_name_gcmd_check": { + "failure": "The provided project/campaign long name `{}` does not comply with GCMD.", + "help": { + "message": "", + "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" + }, + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + } +} + diff --git a/pyQuARC/schemas/ruleset.json b/pyQuARC/schemas/ruleset.json index ce3bbeca..aaa4ab14 100644 --- a/pyQuARC/schemas/ruleset.json +++ b/pyQuARC/schemas/ruleset.json @@ -464,12 +464,12 @@ { "name-id": "Instrument Short Name Check", "name-display": "Instrument Short Name Check", - "description": "Check to ensure the provided instrument short name matches a value from the GCMD controlled vocabularly list. 
Provide an error if the provided short name is not an exact match with any of the names on the keyword list, and suggest a request be made to support@earthdata.nasa.gov in order to have it added to the GCMD Instrument KMS .", + "description": "Check to ensure the provided instrument short name matches a value from the GCMD controlled vocabularly list. Provide an error if the provided short name is not an exact match with any of the names on the keyword list, and suggest a request be made to earthdata-support@nasa.gov in order to have it added to the GCMD Instrument KMS .", "severity": "error", "timeframe": null, "scope": null, "message-fail": "1. If the provided short name is not GCMD-compliant: The provided instrument short name does not comply with the GCMD. \n2. If an instrument short name is not provided: The instrument short name appears to be missing from the metadata.", - "remediation": "1. Please submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument short name under the [associated platform short name] platform.", + "remediation": "1. Please submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument short name under the [associated platform short name] platform.", "help_url": null, "specification": null, "spec_version": null, @@ -478,12 +478,12 @@ { "name-id": "Instrument Long Name Check", "name-display": "Instrument Long Name Check", - "description": "Check to determine if the provided long name matches a value from the GCMD controlled vocabulary list and is associated with the correct instrument short name. 
Provide an error if 1) a long name is not provided when one exists in the vocabulary list for the associated instrument short name or 2) if the provided long name is not an exact match with any of the names on the keyword list; suggest a request be made to support@earthdata.nasa.gov in order to have it added to the GCMD Instrument KMS if this is the case.", + "description": "Check to determine if the provided long name matches a value from the GCMD controlled vocabulary list and is associated with the correct instrument short name. Provide an error if 1) a long name is not provided when one exists in the vocabulary list for the associated instrument short name or 2) if the provided long name is not an exact match with any of the names on the keyword list; suggest a request be made to earthdata-support@nasa.gov in order to have it added to the GCMD Instrument KMS if this is the case.", "severity": "warning (if no long name is provided when it should be)\n\nerror (if the names is not an exact match with the keyword list)", "timeframe": null, "scope": null, "message-fail": "1. If the provided long name is not GCMD-compliant: The provided instrument long name does not comply with the GCMD.\n2. If a long name is not provided when one exists: The instrument long name appears to be missing from the metadata.", - "remediation": "1. Please submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. Recommend providing the following as the associated instrument long name for [the instrument] under the [associated platform short name] platform.", + "remediation": "1. Please submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instrument KMS.\n2. 
Recommend providing the following as the associated instrument long name for [the instrument] under the [associated platform short name] platform.", "help_url": null, "specification": null, "spec_version": null, diff --git a/pyQuARC/schemas/version.txt b/pyQuARC/schemas/version.txt index adcf29f0..3a6f0812 100644 --- a/pyQuARC/schemas/version.txt +++ b/pyQuARC/schemas/version.txt @@ -1 +1 @@ -2023-04-24 \ No newline at end of file +2025-05-19 \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..29158f47 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +filterwarnings = + ignore:Accessing jsonschema.draft7_format_checker is deprecated:DeprecationWarning + ignore:ssl.PROTOCOL_TLS is deprecated:DeprecationWarning + ignore:ssl.match_hostname.*:DeprecationWarning diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..0336d084 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", #C1339230297-GES_DISC "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", #G1370895082-GES_DISC "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 5fee2f4e89fc60604cdfb233967141f9e602adfe Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 3 Jun 2025 12:23:13 -0500 Subject: [PATCH 16/71] Entrytitle - Revised #325 --- pyQuARC/code/schema_validator.py | 11 ++++++----- pyQuARC/code/string_validator.py | 14 ++++++++++---- pyQuARC/schemas/check_messages_override.json | 12 +++++++++++- tests/test_downloader.py | 4 ++-- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..e88a3e1e 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ 
-3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,8 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, + format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +137,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/string_validator.py b/pyQuARC/code/string_validator.py index 1bd27715..8ba756c1 100644 --- a/pyQuARC/code/string_validator.py +++ b/pyQuARC/code/string_validator.py @@ -1,7 +1,7 @@ from .base_validator import BaseValidator from .gcmd_validator import GcmdValidator from .utils import cmr_request, collection_in_cmr, if_arg, set_cmr_prms - +import re class StringValidator(BaseValidator): """ @@ -38,15 +38,21 @@ def length_check(string, extent, relation): def compare(first, second, relation): """ 
Compares two strings based on the relationship - Returns: - (dict) An object with the validity of the check and the instance + (dict) An object with the validity of the check and the instance """ + + # Check if 'first' and 'second' contain any special characters + first_clean = re.sub(r'[^a-zA-Z0-9]', '', first).upper() + second_clean = re.sub(r'[^a-zA-Z0-9]', '', second).upper() + + # If either string contains special characters, return a warning or handle as needed return { - "valid": BaseValidator.compare(first.upper(), second.upper(), relation), + "valid": BaseValidator.compare(first_clean, second_clean, relation), "value": (first, second), } + @staticmethod @if_arg def controlled_keywords_check(value, keywords_list): diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..2fee2195 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,11 @@ -{} +{ + "shortname_uniqueness": { + "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" + }, + "remediation": "Recommend providing a more descriptive title for the dataset. 
" + } +} + diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..ca1762c8 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000042-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1018577631-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 510c787f6a6b660f252063c3365c75bfe498044f Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 4 Jun 2025 12:55:23 -0500 Subject: [PATCH 17/71] Processing level - revised #327 --- pyQuARC/code/schema_validator.py | 11 +++++---- pyQuARC/schemas/check_messages_override.json | 19 +++++++++++++++- pyQuARC/schemas/rules_override.json | 24 +++++++++++++++++++- tests/test_downloader.py | 4 ++-- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..c743741e 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,7 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +136,14 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = 
re.findall("(\{http[^}]*\})", line) + + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..d4ed34be 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,18 @@ -{} +{ + "processing_level_description_presence_check": { + "failure": "The Processing Level Description is missing.", + "help": { + "message": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance", + "url": "https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" + }, + "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + }, + "processing_level_description_length_check": { + "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", + "help": { + "message": "Use the EOSDIS Data Processing level description as guidance.", + "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + }, + "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS 
processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + } +} diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 0967ef42..16b89647 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1 +1,23 @@ -{} +{ + "processing_level_description_presence_check": { + "rule_name": "Processing Level Description Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/ProcessingLevelDescription" + ] + } + ], + "umm-c": [ + { + "fields": [ + "ProcessingLevel/ProcessingLevelDescription" + ] + } + ] + }, + "severity": "info", + "check_id": "one_item_presence_check" + } +} diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..5a6ca777 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From fb9cc5edbe490164b24cba5c301df7415879c144 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 4 Jun 2025 14:33:20 -0500 Subject: [PATCH 18/71] Datacenter email - revised #326 --- pyQuARC/code/schema_validator.py | 11 ++++++----- pyQuARC/code/url_validator.py | 11 +++++++++++ pyQuARC/schemas/check_messages_override.json | 11 ++++++++++- pyQuARC/schemas/checks_override.json | 8 +++++++- pyQuARC/schemas/rules_override.json | 18 +++++++++++++++++- tests/test_downloader.py | 4 ++-- 6 files changed, 53 insertions(+), 10 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..8423e348 100644 --- a/pyQuARC/code/schema_validator.py +++ 
b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,7 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +136,14 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 55a74e61..9b6befd6 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -117,3 +117,14 @@ def doi_link_update(value, bad_urls): validity = False return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def url_update_email_check(url, bad_urls): + validity = True + # Check if the URL matches 'support-cddis@earthdata.nasa.gov' + if url in bad_urls or url == 
"support-cddis@earthdata.nasa.gov": + # Update the URL + url = "support-cddis@nasa.gov" + validity = False # Mark as invalid if the URL was updated + return {"valid": validity, "value": url} diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..9f7673ed 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,10 @@ -{} +{ + "url_update_email_check": { + "failure": "The listed email contact information must be updated.", + "help": { + "message": "Recommend providing the updated contact information as per the data product.", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" + }, + "remediation": "Recommend changing the contact information to 'support-cddis@nasa.gov'. " + } +} diff --git a/pyQuARC/schemas/checks_override.json b/pyQuARC/schemas/checks_override.json index 0967ef42..2f4d6fc1 100644 --- a/pyQuARC/schemas/checks_override.json +++ b/pyQuARC/schemas/checks_override.json @@ -1 +1,7 @@ -{} +{ + "url_update_email_check": { + "data_type": "url", + "check_function": "url_update_email_check", + "available": true + } +} diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 0967ef42..2ecc4799 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1 +1,17 @@ -{} +{ + "url_update_email_check": { + "rule_name": "URL Email address check", + "fields_to_apply": { + "umm-c": [ + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/ContactMechanisms/Value", + "DataCenters/ContactGroups/ContactInformation/ContactInstruction" + ] + } + ] + }, + "severity": "info", + "check_id": "url_update_email_check" +} +} diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..ca1762c8 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = 
{ "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000042-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1018577631-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 8dc76a472b520b3265acb214fd6a4e4f7a1c7519 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 5 Jun 2025 16:22:14 -0500 Subject: [PATCH 19/71] Schema update for umm-c (1.18.2 to 1.18.4) #328 --- pyQuARC/code/checker.py | 3 ++ pyQuARC/code/schema_validator.py | 85 +++++++++++++++++++++++++------- pyQuARC/code/utils.py | 8 +++ tests/test_downloader.py | 4 +- 4 files changed, 81 insertions(+), 19 deletions(-) diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py index 4bb401c7..93993024 100644 --- a/pyQuARC/code/checker.py +++ b/pyQuARC/code/checker.py @@ -14,6 +14,9 @@ from .string_validator import StringValidator from .url_validator import UrlValidator +from .schema_validator import SchemaValidator +from .constants import UMM_C # or however you define metadata format + from .constants import ECHO10_C, SCHEMA_PATHS diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..26f2d315 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,11 +3,22 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url +from .utils import read_json_schema_from_url +from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G + + +SUPPORTED_UMM_C_VERSIONS = ["v1.18.4", "v1.18.3", "v1.18.2"] +DEFAULT_UMM_C_VERSION = "v1.18.4" # Or any other version you prefer as default + +# Define UMM-G versions if you want to make it flexible as well +SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"] +DEFAULT_UMM_G_VERSION = "v1.6.6" + +SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" -from 
.constants import ECHO10_C, SCHEMA_PATHS, UMM_C class SchemaValidator: @@ -21,6 +32,10 @@ def __init__( self, check_messages, metadata_format=ECHO10_C, + # Add a new parameter for UMM-C version + umm_c_version=DEFAULT_UMM_C_VERSION, + # Add a new parameter for UMM-G version (if you want to make it flexible too) + umm_g_version=DEFAULT_UMM_G_VERSION ): """ Args: @@ -29,8 +44,27 @@ def __init__( validation_paths (list of str): The path of the fields in the metadata that need to be validated. In the form ['Collection/StartDate', ...]. + umm_c_version (str): The specific UMM-C version to use for validation (e.g., "v1.18.4"). + umm_g_version (str): The specific UMM-G version to use for validation (e.g., "v1.6.6"). + check_messages (dict): A dictionary of check messages for errors. """ self.metadata_format = metadata_format + # Validate and store the UMM-C version + if umm_c_version not in SUPPORTED_UMM_C_VERSIONS: + raise ValueError( + f"Unsupported UMM-C version: {umm_c_version}. " + f"Supported versions are: {', '.join(SUPPORTED_UMM_C_VERSIONS)}" + ) + self.umm_c_version = umm_c_version + + # Validate and store the UMM-G version + if umm_g_version not in SUPPORTED_UMM_G_VERSIONS: + raise ValueError( + f"Unsupported UMM-G version: {umm_g_version}. 
" + f"Supported versions are: {', '.join(SUPPORTED_UMM_G_VERSIONS)}" + ) + self.umm_g_version = umm_g_version + if metadata_format.startswith("umm-"): self.validator_func = self.run_json_validator else: @@ -61,9 +95,16 @@ def read_json_schema(self): """ Reads the json schema file """ + if self.metadata_format == UMM_C: + schema_url = (f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json") + return read_json_schema_from_url(schema_url) + + if self.metadata_format == UMM_G: + schema_url = (f"{SCHEMA_CDN_BASE}/granule/{self.umm_g_version}/umm-g-json-schema.json") + return read_json_schema_from_url(schema_url) + with open(SCHEMA_PATHS[f"{self.metadata_format}-json-schema"]) as schema_file: - schema = json.load(schema_file) - return schema + return json.load(schema_file) def run_json_validator(self, content_to_validate): """ @@ -77,21 +118,30 @@ def run_json_validator(self, content_to_validate): schema_store = {} if self.metadata_format == UMM_C: - with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as schema_file: - schema_base = json.load(schema_file) - # workaround to read local referenced schema file (only supports uri) - schema_store = { - schema_base.get("$id", "/umm-cmn-json-schema.json"): schema_base, - schema_base.get("$id", "umm-cmn-json-schema.json"): schema_base, - } - errors = {} + #umm_cmn_schema_url = f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json" + # If it's *not* versioned and always the latest or a specific fixed version, adjust this URL + # e.g., f"{SCHEMA_CDN_BASE}/common/umm-cmn-json-schema.json" or from SCHEMA_PATHS + + try: + with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as common_schema_file: + schema_base = json.load(common_schema_file) + # 1. Add the schema using its $id (most common canonical reference) + if "$id" in schema_base: + schema_store[schema_base["$id"]] = schema_base + + # 2. 
Add the schema using the full URL you fetched it from (if different from $id or for robustness) + schema_store["/umm-cmn-json-schema.json"] = schema_base + schema_store["umm-cmn-json-schema.json"] = schema_base + except Exception as e: + print(f"Error loading UMM Common schema from {SCHEMA_PATHS['umm-cmn-json-schema']}: {e}") + print("Schema validation for UMM-C might proceed without common schema, leading to incomplete validation.") + errors = {} resolver = RefResolver.from_schema(schema, store=schema_store) - validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +186,14 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/code/utils.py b/pyQuARC/code/utils.py index 1fe82270..f0544d45 100644 --- a/pyQuARC/code/utils.py +++ b/pyQuARC/code/utils.py @@ -82,3 +82,11 @@ def get_date_time(dt_str): except ValueError: continue return None + +def read_json_schema_from_url(url): + """ + Downloads and returns a JSON schema from a given URL. 
+ """ + response = requests.get(url) + response.raise_for_status() + return response.json() diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..5a6ca777 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 72d49fa581c7558fa903dfb86e2c124fa2ed17f9 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 10 Jun 2025 15:04:18 -0500 Subject: [PATCH 20/71] Horizontal datum - revised #330 --- pyQuARC/schemas/checks_override.json | 11 ++++++- pyQuARC/schemas/rules_override.json | 49 ++++++++++++++++------------ 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/pyQuARC/schemas/checks_override.json b/pyQuARC/schemas/checks_override.json index 0967ef42..10dff77f 100644 --- a/pyQuARC/schemas/checks_override.json +++ b/pyQuARC/schemas/checks_override.json @@ -1 +1,10 @@ -{} +{ + "horizontal_datum_name_check": { + "failure": "The Horizontal Datum Name is missing.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" + }, + "remediation": "Information about the datum should be provided in the metadata if possible." 
+ } +} diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 16b89647..2d923b7c 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,23 +1,30 @@ { - "processing_level_description_presence_check": { - "rule_name": "Processing Level Description Presence Check", - "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/ProcessingLevelDescription" - ] - } - ], - "umm-c": [ - { - "fields": [ - "ProcessingLevel/ProcessingLevelDescription" - ] - } - ] - }, - "severity": "info", - "check_id": "one_item_presence_check" - } + "horizontal_datum_name_check": { + "rule_name": "Horizontal Datum Name Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/SpatialInfo/HorizontalCoordinateSystem/GeodeticModel/HorizontalDatumName" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Spatial_Coverage/Spatial_Info/Horizontal_Coordinate_System/Geodetic_Model/Horizontal_DatumName" + ] + } + ], + "umm-c": [ + { + "fields": [ + "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/GeodeticModel/HorizontalDatumName" + ] + } + ] + }, + "severity": "info", + "check_id": "one_item_presence_check" + } } From baeaa6da0654df310e3aeaeeae6f8e9956a57b39 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 10 Jun 2025 15:14:37 -0500 Subject: [PATCH 21/71] Horizontal datum - revised #330-v1 --- pyQuARC/schemas/check_messages_override.json | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index d4ed34be..10dff77f 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,18 +1,10 @@ { - "processing_level_description_presence_check": { - "failure": "The Processing Level Description is missing.", - "help": { - "message": "Recommend providing a processing level description, using the EOSDIS 
processing level descriptions as guidance", - "url": "https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" - }, - "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - }, - "processing_level_description_length_check": { - "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", + "horizontal_datum_name_check": { + "failure": "The Horizontal Datum Name is missing.", "help": { - "message": "Use the EOSDIS Data Processing level description as guidance.", - "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" }, - "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + "remediation": "Information about the datum should be provided in the metadata if possible." 
} } From 3d766427e4f8fb606b837802d50eb52cb3c8052d Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Wed, 11 Jun 2025 12:00:42 -0500 Subject: [PATCH 22/71] Granule-url_description_uniqueness_check fixes 1 #324 --- pyQuARC/schemas/rules_override.json | 46 ++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 0967ef42..d2f72744 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1 +1,45 @@ -{} + +{ + "url_description_uniqueness_check": { + "rule_name": "URL Description Uniqueness Check", + "fields_to_apply": { + "umm-c": [ + { + "fields": [ + "RelatedUrls/Description" + ] + }, + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Multimedia_Sample" + ] + }, + { + "fields": [ + "DIF/Related_URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + } + ] + }, + "data": [ + "Description" + ], + "severity": "info", + "check_id": "uniqueness_check" + } + +} From b192aca689dab079feb2159b2537937f88ebe59f Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 30 Jul 2025 15:35:45 -0500 Subject: [PATCH 23/71] Processing level ID (ESDS request) #320 --- pyQuARC/schemas/check_messages_override.json | 18 ++---- pyQuARC/schemas/rules_override.json | 68 ++++++++++++++------ 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index d4ed34be..49873976 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,18 +1,10 @@ { - "processing_level_description_presence_check": { - "failure": "The Processing Level Description is missing.", - "help": { - "message": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance", - "url": 
"https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" - }, - "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - }, - "processing_level_description_length_check": { - "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", + "processing_level_id_check": { + "failure": "`{}` is not an EOSDIS recognized Processing Level Id.", "help": { - "message": "Use the EOSDIS Data Processing level description as guidance.", - "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Processing+Level, https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, - "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [Not Provided, 0, 1, 1A, 1B, 1C, 1T, 2, 2A, 2B, 2G, 2P, 3, 3A 4, NA]" } } diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 16b89647..56e52da1 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,23 +1,49 @@ { - "processing_level_description_presence_check": { - "rule_name": "Processing Level Description Presence Check", - "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/ProcessingLevelDescription" - ] - } - ], - "umm-c": [ - { - "fields": [ - "ProcessingLevel/ProcessingLevelDescription" - ] - } - ] - }, - 
"severity": "info", - "check_id": "one_item_presence_check" - } +"processing_level_id_check": { + "rule_name": "EOSDIS Standard Processing Level ID Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/ProcessingLevelId" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/ProductLevelId" + ] + } + ], + "umm-c": [ + { + "fields": [ + "ProcessingLevel/Id" + ] + } + ] + }, + "data": [ + [ + "0", + "1", + "1A", + "1B", + "1C", + "2", + "2A", + "2B", + "2G", + "2P", + "3", + "3A", + "4", + "NA", + "Not provided" + ] + ], + "severity": "warning", + "check_id": "controlled_keywords_check" + } } From 7690ff2a92b4e07487780f83db93eee8419481e6 Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Mon, 4 Aug 2025 15:26:37 -0500 Subject: [PATCH 24/71] #324 update schema validator and text_downloader --- pyQuARC/code/schema_validator.py | 12 +++++------- tests/test_downloader.py | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..54a45c2d 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -90,9 +90,7 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) - validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver - ) + validator = Draft7Validator(schema, format_checker=Draft7Validator.FORMAT_CHECKER) for error in sorted( validator.iter_errors(json.loads(content_to_validate)), key=str @@ -136,13 +134,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the 
following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..5a6ca777 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 87687a184844708a3ae0509c67720ae8684165ae Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 4 Aug 2025 15:38:37 -0500 Subject: [PATCH 25/71] Granule campaign shortname la #339 --- pyQuARC/code/schema_validator.py | 10 ++++---- pyQuARC/schemas/check_messages_override.json | 11 ++++++++- pyQuARC/schemas/rules_override.json | 24 +++++++++++++++++++- tests/test_downloader.py | 4 ++-- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..fcdc2078 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import 
Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,7 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( @@ -136,13 +136,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 0967ef42..97ce108d 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1 +1,10 @@ -{} +{ + "granule_campaign_name_presence_check": { + "failure": "The campaign/project short name is missing.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Project" + }, + "remediation": "Recommend providing a campaign short name from the following list: https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv" + } +} diff --git 
a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 0967ef42..536de84c 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1 +1,23 @@ -{} +{ + "granule_campaign_name_presence_check": { + "rule_name": "Campaign Name Presence Check", + "fields_to_apply": { + "echo-g": [ + { + "fields": [ + "Granule/Campaigns/Campaign/ShortName" + ] + } + ], + "umm-g": [ + { + "fields": [ + "Campaign/ShortName" + ] + } + ] + }, + "severity": "info", + "check_id": "one_item_presence_check" + } +} diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..5a6ca777 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From 65411d85852bb7954b3eee39f60432b9016f36f4 Mon Sep 17 00:00:00 2001 From: em-koontz Date: Tue, 5 Aug 2025 14:22:02 -0500 Subject: [PATCH 26/71] Update contributing.md --- contributing.md | 1 + 1 file changed, 1 insertion(+) diff --git a/contributing.md b/contributing.md index 01f3664f..67523779 100644 --- a/contributing.md +++ b/contributing.md @@ -53,6 +53,7 @@ This page is meant to help you learn how you can contribute to pyQuARC! We are p ## Thank you for your interest in pyQuARC! We appreciate your interest in pyQuARC! Everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestions, and new ideas! +Please contact earthdata-support@nasa.gov with any questions. 
From 68589647349daf4739efeb5444407fb817a2a319 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 8 Aug 2025 11:11:48 -0500 Subject: [PATCH 27/71] Upgrade pytest because buggy --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index de6750a2..4f52ecfa 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install flake8 pytest==6.2.5 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | From a3d3a6106e058b6dccaf1d23de665c74a33de158 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 8 Aug 2025 11:12:36 -0500 Subject: [PATCH 28/71] Update real collection, granule id in test fixtures because existing ones were removed from cmr --- tests/test_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..b015b844 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C2515837343-GES_DISC", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G2519682101-GES_DISC", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", From ab1df2bb68e3d8649cc8988262c9cd249c1da608 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 8 Aug 2025 11:12:56 -0500 Subject: [PATCH 29/71] Fix jsonschema checker import --- pyQuARC/code/schema_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..51645140 100644 --- a/pyQuARC/code/schema_validator.py +++ 
b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -91,7 +91,7 @@ def run_json_validator(self, content_to_validate): resolver = RefResolver.from_schema(schema, store=schema_store) validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) for error in sorted( From fb5c794fc539a76920dedcd28f81b36456bb2a2f Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Fri, 8 Aug 2025 13:19:42 -0500 Subject: [PATCH 30/71] Earthdata email - updated files #321 --- pyQuARC/schemas/check_messages.json | 26 ++--- pyQuARC/schemas/check_messages_override.json | 107 +------------------ 2 files changed, 14 insertions(+), 119 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..9cac706a 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -109,7 +109,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords" }, - "remediation": "Provide a valid GCMD keyword or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "science_keywords_presence_check": { "failure": "Science keywords are required.", @@ -125,7 +125,7 @@ "message": "", "url": "" }, - "remediation": "Provide a valid GCMD keyword or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." 
}, "eosdis_doi_authority_check": { "failure": "`{}` may be an invalid value.", @@ -157,7 +157,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" }, - "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "organization_long_name_gcmd_check": { "failure": "The provided data center long name `{}` does not comply with the GCMD. ", @@ -165,7 +165,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" }, - "remediation": "Provide a valid long name name from the GCMD Providers keyword list or submit a request to support@earthdata.nasa.gov to have this keyword added to the GCMD KMS." + "remediation": "Provide a valid long name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." }, "organization_short_long_name_consistency_check": { "failure": "The provided data center short name `{}` and long name `{}` aren't consistent.", @@ -309,7 +309,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instruments keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list."
}, "instrument_long_name_gcmd_check": { "failure": "The provided instrument long name `{}` does not comply with GCMD.", @@ -317,7 +317,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this instrument added to the GCMD Instruments keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." }, "instrument_long_name_presence_check": { "failure": "The provided instrument/sensor short name `{}` is missing the corresponding instrument/sensor long name.", @@ -365,7 +365,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this platform added to the GCMD Platforms keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." }, "platform_short_long_name_consistency_check": { "failure": "The provided platform short name `{}` and long name `{}` are not consistent.", @@ -389,7 +389,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/DataFormat/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid data format, or submit a request to support@earthdata.nasa.gov to have this data format added to the GCMD Data Format keyword list." + "remediation": "Select a valid data format, or submit a request to earthdata-support@nasa.gov to have this data format added to the GCMD Data Format keyword list." 
}, "platform_long_name_gcmd_check": { "failure": "The provided platform long name `{}` does not comply with GCMD.", @@ -397,7 +397,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this platform added to the GCMD Platforms keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." }, "spatial_keyword_gcmd_check": { "failure": "The provided location/spatial keyword `{}` does not comply with GCMD.", @@ -405,7 +405,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/locations/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid location keyword, or submit a request to support@earthdata.nasa.gov to have this value added to the GCMD Locations keyword list." + "remediation": "Select a valid location keyword, or submit a request to earthdata-support@nasa.gov to have this value added to the GCMD Locations keyword list." }, "platform_type_gcmd_check": { "failure": "The provided platform type `{}` does not comply with GCMD.", @@ -413,7 +413,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid platform type, or submit a request to support@earthdata.nasa.gov to have this platform type added to the GCMD Platforms keyword list." + "remediation": "Select a valid platform type, or submit a request to earthdata-support@nasa.gov to have this platform type added to the GCMD Platforms keyword list." 
}, "campaign_short_long_name_consistency_check": { "failure": "The provided project/campaign short name `{}` and long name `{}` are not consistent.", @@ -429,7 +429,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid short name, or submit a request to support@earthdata.nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." }, "campaign_long_name_gcmd_check": { "failure": "The provided project/campaign long name `{}` does not comply with GCMD.", @@ -437,7 +437,7 @@ "message": "", "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" }, - "remediation": "Select a valid long name, or submit a request to support@earthdata.nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." + "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." 
}, "campaign_long_name_presence_check": { "failure": "The provided project/campaign short name `{}` is missing the corresponding project/campaign long name.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 1d992b39..311847da 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,107 +1,2 @@ -{ - "science_keywords_gcmd_check": { - "failure": "`{}` is not a valid GCMD science keyword.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords" - }, - "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." - }, - "location_gcmd_check": { - "failure": "`{}` is not a valid GCMD location keyword.", - "help": { - "message": "", - "url": "" - }, - "remediation": "Provide a valid GCMD keyword or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." - }, - "organization_short_name_gcmd_check": { - "failure": "The provided short name `{}` does not comply with GCMD. ", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" - }, - "remediation": "Provide a valid short name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." - }, - "organization_long_name_gcmd_check": { - "failure": "The provided data center long name `{}` does not comply with the GCMD. ", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" - }, - "remediation": "Provide a valid long name name from the GCMD Providers keyword list or submit a request to earthdata-support@nasa.gov to have this keyword added to the GCMD KMS." 
- }, - "instrument_short_name_gcmd_check": { - "failure": "The provided instrument short name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." - }, - "instrument_long_name_gcmd_check": { - "failure": "The provided instrument long name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/instruments/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this instrument added to the GCMD Instruments keyword list." - }, - "platform_short_name_gcmd_check": { - "failure": "The provided platform short name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." - }, - "data_format_gcmd_check": { - "failure": "The provided data format `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/DataFormat/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid data format, or submit a request to earthdata-support@nasa.gov to have this data format added to the GCMD Data Format keyword list." 
- }, - "platform_long_name_gcmd_check": { - "failure": "The provided platform long name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this platform added to the GCMD Platforms keyword list." - }, - "spatial_keyword_gcmd_check": { - "failure": "The provided location/spatial keyword `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/locations/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid location keyword, or submit a request to earthdata-support@nasa.gov to have this value added to the GCMD Locations keyword list." - }, - "platform_type_gcmd_check": { - "failure": "The provided platform type `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/platforms/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid platform type, or submit a request to earthdata-support@nasa.gov to have this platform type added to the GCMD Platforms keyword list." - }, - "campaign_short_name_gcmd_check": { - "failure": "The provided project/campaign short name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid short name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." 
- }, "campaign_long_name_gcmd_check": { - "failure": "The provided project/campaign long name `{}` does not comply with GCMD.", - "help": { - "message": "", - "url": "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv&page_num=1&page_size=2000" - }, - "remediation": "Select a valid long name, or submit a request to earthdata-support@nasa.gov to have this project/campaign name added to the GCMD Projects keyword list." - } -} +{} From 8f032f71c799d49d36f37208a9fdc5bdbf8d5686 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Fri, 8 Aug 2025 14:08:30 -0500 Subject: [PATCH 31/71] Processing level ID (ESDS request) #320 - revised --- pyQuARC/schemas/check_messages.json | 2 +- pyQuARC/schemas/check_messages_override.json | 8 ---- pyQuARC/schemas/rule_mapping.json | 7 ++- pyQuARC/schemas/rules_override.json | 50 +------------------- 4 files changed, 8 insertions(+), 59 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..79c044ee 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -93,7 +93,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Processing+Level, https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, - "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [0, 1A, 1B, 1C, 2, 2A, 2B, 3, 3A, 4]" + "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [Not Provided, 0, 1, 1A, 1B, 1C, 1T, 2, 2A, 2B, 2G, 2P, 3, 3A, 4, NA]" }, "processing_level_description_length_check": { "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 49873976..2c63c085 100644 ---
a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,10 +1,2 @@ { - "processing_level_id_check": { - "failure": "`{}` is not an EOSDIS recognized Processing Level Id.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Processing+Level, https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - }, - "remediation": "Recommend changing the Id to match one of the EOSDIS Data Processing Levels, if applicable: [Not Provided, 0, 1, 1A, 1B, 1C, 1T, 2, 2A, 2B, 2G, 2P, 3, 3A 4, NA]" - } } diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..95ab58b5 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -994,15 +994,20 @@ "data": [ [ "0", + "1", "1A", "1B", "1C", "2", "2A", "2B", + "2G", + "2P", "3", "3A", - "4" + "4", + "NA", + "Not provided" ] ], "severity": "warning", diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 56e52da1..0967ef42 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,49 +1 @@ -{ -"processing_level_id_check": { - "rule_name": "EOSDIS Standard Processing Level ID Check", - "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/ProcessingLevelId" - ] - } - ], - "dif10": [ - { - "fields": [ - "DIF/ProductLevelId" - ] - } - ], - "umm-c": [ - { - "fields": [ - "ProcessingLevel/Id" - ] - } - ] - }, - "data": [ - [ - "0", - "1", - "1A", - "1B", - "1C", - "2", - "2A", - "2B", - "2G", - "2P", - "3", - "3A", - "4", - "NA", - "Not provided" - ] - ], - "severity": "warning", - "check_id": "controlled_keywords_check" - } -} +{} From 150bff55de9367e55d2fcb6e2beaf2b87e35f373 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 8 Aug 2025 14:29:30 -0500 Subject: [PATCH 32/71] Fix grammatical errors --- contributing.md | 30 +++++++++++++----------------- 1 
file changed, 13 insertions(+), 17 deletions(-) diff --git a/contributing.md b/contributing.md index 67523779..91cf80ee 100644 --- a/contributing.md +++ b/contributing.md @@ -3,39 +3,39 @@ # Welcome to pyQuARC! This page is meant to help you learn how you can contribute to pyQuARC! We are passionate about NASA's Open Science initiative and are open to a variety of contributions. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself. -## How you can contributute to pyQuARC: +## How you can contribute to pyQuARC: 1. **Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. * To start you will need to proceed to the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. * From here, look for the green button on the right side of the page labeled **New issue**. * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. - * The page you are directed to will provide a prompt to add a title and explains how to fill in the bug you are wanting to report. + * The page you are directed to will provide a prompt to add a title and explain how to fill in the bug you want to report. * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. * Beneath the description box, select "Issue Type" and "Bug". - * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers, and pyQuARC developers will automatically be assigned to the Issue and notified. 
+ * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors, and pyQuARC developers will automatically be assigned to the Issue and notified. * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. 2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. * Suggesting a new feature is very similar to reporting a bug. You will start at the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. * Select the green **New Issue** button found on the top right side of the page. * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. - * The page you are directed to will provide a prompt to add a title and explains how to make a new suggestion. + * The page you are directed to will provide a prompt to add a title and explain how to make a new suggestion. * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. * Beneath the description box, select "Issue Type" and "Feature". - * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributers. + * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors. * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. 3. 
**Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks or new features. - * Fork the respository - * To edit the code you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibilty settings with the original repository, and allows you to create your own edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). - * To create your own fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. - * On the top right of the page select the **Fork** tab. - * Under the "Owner" dropdown menu select yourself as the owner of the new forked repository. + * Fork the repository + * To edit the code, you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibility settings with the original repository and allows you to create your edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). + * To create your fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. + * On the top right of the page, select the **Fork** tab. + * Under the "Owner" dropdown menu, select yourself as the owner of the new forked repository. * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. * You can set an optional description in the 'Description' field below. * Make sure the checkbox next to 'Copy the master branch only' is selected. * Click **Create fork** when you are finished to create your fork! - * After completing the steps above, you should be in a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. 
You have successfully created a fork of pyQuARC! + * After completing the steps above, you should be on a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. You have successfully created a fork of pyQuARC! * Clone your fork locally * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. * Under the **HTTPS** tab, copy the link to the repository. @@ -47,14 +47,10 @@ This page is meant to help you learn how you can contribute to pyQuARC! We are p * Create a PR * Once your changes are made, push your commits. * You can then open a Pull Request (PR) on the [**Pull requests** tab](https://github.com/NASA-IMPACT/pyQuARC/pulls) within the pyQuARC Github page. - * Set the base respository to "NASA-IMPACT/pyQuARC" and the base to "dev". + * Set the base repository to "NASA-IMPACT/pyQuARC" and the base to "dev". * Fill out a title and description, then submit! * Feedback may be provided on your PR. Once it is approved, a pyQuARC team member will merge your changes. ## Thank you for your interest in pyQuARC! -We appreciate your interest in pyQuARC! Everyone is encouraged to help to improve pyQuARC, and we welcome your comments, suggestions, and new ideas! +We appreciate your interest in pyQuARC! Everyone is encouraged to help improve pyQuARC, and we welcome your comments, suggestions, and new ideas! Please contact earthdata-support@nasa.gov with any questions. 
- - - - From d888f7166b9c7d1c21569de1e0ba8c52c0ca74f4 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 8 Aug 2025 14:37:31 -0500 Subject: [PATCH 33/71] Update error message --- pyQuARC/schemas/check_messages.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..a786e043 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -768,7 +768,7 @@ "remediation": "Recommend providing a unique name for each characteristic." }, "validate_beginning_datetime_against_granules": { - "failure": "The collection beginning date time `{}` is not consistent with the first granule's beginning date time `{}`.", + "failure": "The collection beginning date time `{}` is not consistent with the beginning date time in the metadata for the first granule `{}`.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent" @@ -776,7 +776,7 @@ "remediation": "Recommend updating the beginning date time to match the granule extent." 
}, "validate_ending_datetime_against_granules": { - "failure": "The collection ending date time `{}` is not consistent with the last granule's ending date time `{}`.", + "failure": "The collection ending date time `{}` is not consistent with the ending date time in the metadata for the last granule `{}`.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent" From a60d3eab8ed7a2b0ae4d0aa899d0d276f3628efa Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Fri, 8 Aug 2025 15:17:11 -0500 Subject: [PATCH 34/71] Datacenter email - revised v1 #326 --- pyQuARC/schemas/check_messages.json | 8 ++++++++ pyQuARC/schemas/check_messages_override.json | 11 +---------- pyQuARC/schemas/checks.json | 5 +++++ pyQuARC/schemas/checks_override.json | 8 +------- pyQuARC/schemas/rule_mapping.json | 17 ++++++++++++++++- pyQuARC/schemas/rules_override.json | 18 +----------------- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..92cfa877 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -47,6 +47,14 @@ }, "remediation": "This often indicates a broken link. If the URL is broken, recommend revising." }, + "url_update_email_check": { + "failure": "The listed email contact information must be updated.", + "help": { + "message": "Recommend providing the updated contact information as per the data product.", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" + }, + "remediation": "Recommend changing the contact information to 'support-cddis@nasa.gov'. 
" + }, "shortname_uniqueness": { "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", "help": { diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 9f7673ed..0967ef42 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,10 +1 @@ -{ - "url_update_email_check": { - "failure": "The listed email contact information must be updated.", - "help": { - "message": "Recommend providing the updated contact information as per the data product.", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Data+Center" - }, - "remediation": "Recommend changing the contact information to 'support-cddis@nasa.gov'. " - } -} +{} diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..d921ac28 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -24,6 +24,11 @@ "check_function": "health_and_status_check", "available": true }, + "url_update_email_check": { + "data_type": "url", + "check_function": "url_update_email_check", + "available": true + }, "string_compare": { "data_type": "string", "check_function": "compare", diff --git a/pyQuARC/schemas/checks_override.json b/pyQuARC/schemas/checks_override.json index 2f4d6fc1..0967ef42 100644 --- a/pyQuARC/schemas/checks_override.json +++ b/pyQuARC/schemas/checks_override.json @@ -1,7 +1 @@ -{ - "url_update_email_check": { - "data_type": "url", - "check_function": "url_update_email_check", - "available": true - } -} +{} diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..214d6ddc 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -5559,5 +5559,20 @@ }, "severity": "warning", "check_id": "one_item_presence_check" - } + }, + "url_update_email_check": { + "rule_name": "URL Email address check", + "fields_to_apply": { + "umm-c": [ + { + "fields": [ + 
"DataCenters/ContactGroups/ContactInformation/ContactMechanisms/Value", + "DataCenters/ContactGroups/ContactInformation/ContactInstruction" + ] + } + ] + }, + "severity": "info", + "check_id": "url_update_email_check" +} } \ No newline at end of file diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 2ecc4799..0967ef42 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,17 +1 @@ -{ - "url_update_email_check": { - "rule_name": "URL Email address check", - "fields_to_apply": { - "umm-c": [ - { - "fields": [ - "DataCenters/ContactGroups/ContactInformation/ContactMechanisms/Value", - "DataCenters/ContactGroups/ContactInformation/ContactInstruction" - ] - } - ] - }, - "severity": "info", - "check_id": "url_update_email_check" -} -} +{} From 26d5a101e445545c37f76bb4ae80af807d51104a Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Fri, 8 Aug 2025 15:37:34 -0500 Subject: [PATCH 35/71] Processing level description - revised v1 #327 --- pyQuARC/schemas/check_messages.json | 12 ++++++++-- pyQuARC/schemas/check_messages_override.json | 19 +--------------- pyQuARC/schemas/rule_mapping.json | 21 +++++++++++++++++ pyQuARC/schemas/rules_override.json | 24 +------------------- 4 files changed, 33 insertions(+), 43 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..4c6c2efa 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -98,11 +98,19 @@ "processing_level_description_length_check": { "failure": "The provided description is less than 50 characters and therefore may be lacking in contextual information.", "help": { - "message": "Use the EOSDIS Data Processing Level descriptions as guidance.", + "message": "Use the EOSDIS Data Processing level description as guidance.", "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" 
}, - "remediation": "Consider providing a more detailed processing level description." + "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" }, + "processing_level_description_presence_check": { + "failure": "The Processing Level Description is missing.", + "help": { + "message": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance", + "url": "https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" + }, + "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" + }, "science_keywords_gcmd_check": { "failure": "`{}` is not a valid GCMD science keyword.", "help": { diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index d4ed34be..0967ef42 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,18 +1 @@ -{ - "processing_level_description_presence_check": { - "failure": "The Processing Level Description is missing.", - "help": { - "message": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance", - "url": "https://www.earthdata.nasa.gov/learn/earth-observation-data-basics/data-processing-levels" - }, - "remediation": "Recommend providing a processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - }, - "processing_level_description_length_check": { - "failure": "The provided description is less than 
50 characters and therefore may be lacking in contextual information.", - "help": { - "message": "Use the EOSDIS Data Processing level description as guidance.", - "url": "https://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - }, - "remediation": "Recommend providing a more detailed processing level description, using the EOSDIS processing level descriptions as guidance:\nhttps://www.earthdata.nasa.gov/engage/open-data-services-and-software/data-information-policy/data-levels" - } -} +{} diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..ff2a596e 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -1253,6 +1253,27 @@ "severity": "info", "check_id": "length_check" }, + "processing_level_description_presence_check": { + "rule_name": "Processing Level Description Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/ProcessingLevelDescription" + ] + } + ], + "umm-c": [ + { + "fields": [ + "ProcessingLevel/ProcessingLevelDescription" + ] + } + ] + }, + "severity": "info", + "check_id": "one_item_presence_check" + }, "umm_controlled_collection_state_list_check": { "rule_name": "UMM Controlled Collection State List", "fields_to_apply": { diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 16b89647..0967ef42 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,23 +1 @@ -{ - "processing_level_description_presence_check": { - "rule_name": "Processing Level Description Presence Check", - "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/ProcessingLevelDescription" - ] - } - ], - "umm-c": [ - { - "fields": [ - "ProcessingLevel/ProcessingLevelDescription" - ] - } - ] - }, - "severity": "info", - "check_id": "one_item_presence_check" - } -} +{} From b861b84678615a0642eaa8e3d78f468f02ef7ac1 Mon Sep 17 00:00:00 
2001 From: Lavanya Ashokkumar Date: Fri, 8 Aug 2025 15:57:48 -0500 Subject: [PATCH 36/71] Horizontal datum - revised v2 #330 --- pyQuARC/schemas/check_messages.json | 2 +- pyQuARC/schemas/check_messages_override.json | 11 +------ pyQuARC/schemas/checks_override.json | 11 +------ pyQuARC/schemas/rule_mapping.json | 2 +- pyQuARC/schemas/rules_override.json | 32 ++------------------ 5 files changed, 6 insertions(+), 52 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..07fb3535 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -645,7 +645,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" }, - "remediation": "If appropriate for the dataset, recommend providing information about the horizontal datum." + "remediation": "Information about the datum should be provided in the metadata if possible." }, "online_access_url_presence_check": { "failure": "No Online Access URL is provided. A link to access the data is required.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 10dff77f..0967ef42 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,10 +1 @@ -{ - "horizontal_datum_name_check": { - "failure": "The Horizontal Datum Name is missing.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" - }, - "remediation": "Information about the datum should be provided in the metadata if possible." 
- } -} +{} diff --git a/pyQuARC/schemas/checks_override.json b/pyQuARC/schemas/checks_override.json index 10dff77f..0967ef42 100644 --- a/pyQuARC/schemas/checks_override.json +++ b/pyQuARC/schemas/checks_override.json @@ -1,10 +1 @@ -{ - "horizontal_datum_name_check": { - "failure": "The Horizontal Datum Name is missing.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" - }, - "remediation": "Information about the datum should be provided in the metadata if possible." - } -} +{} diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..574ce8ad 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3677,7 +3677,7 @@ "umm-c": [ { "fields": [ - "SpatialRepresentationInfo/HorizontalCoordinateSystem/GeodeticModel/HorizontalDatumName" + "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/GeodeticModel/HorizontalDatumName" ] } ] diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 2d923b7c..311847da 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,30 +1,2 @@ -{ - "horizontal_datum_name_check": { - "rule_name": "Horizontal Datum Name Check", - "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/SpatialInfo/HorizontalCoordinateSystem/GeodeticModel/HorizontalDatumName" - ] - } - ], - "dif10": [ - { - "fields": [ - "DIF/Spatial_Coverage/Spatial_Info/Horizontal_Coordinate_System/Geodetic_Model/Horizontal_DatumName" - ] - } - ], - "umm-c": [ - { - "fields": [ - "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/GeodeticModel/HorizontalDatumName" - ] - } - ] - }, - "severity": "info", - "check_id": "one_item_presence_check" - } -} +{} + From fb1434c907bcc7e8160a7feaebec825bfbf8683c Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 11 Aug 2025 16:45:59 -0500 Subject: [PATCH 37/71] Granule campaign shortname 
- revised #339 --- pyQuARC/schemas/check_messages.json | 2 +- pyQuARC/schemas/check_messages_override.json | 11 +-------- pyQuARC/schemas/rule_mapping.json | 2 +- pyQuARC/schemas/rules_override.json | 24 +------------------- 4 files changed, 4 insertions(+), 35 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..b3593222 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -629,7 +629,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Project" }, - "remediation": "Please add a GCMD compliant campaign/project name if applicable to the dataset." + "remediation": "Recommend providing a campaign short name from the following list: https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv" }, "spatial_coverage_type_presence_check": { "failure": "The Spatial Coverage Type is missing.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 97ce108d..0967ef42 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,10 +1 @@ -{ - "granule_campaign_name_presence_check": { - "failure": "The campaign/project short name is missing.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Project" - }, - "remediation": "Recommend providing a campaign short name from the following list: https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/projects/?format=csv" - } -} +{} diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..2a189b84 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3621,7 +3621,7 @@ "umm-g": [ { "fields": [ - "Projects/ShortName" + "Campaign/ShortName" ] } ] diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index 536de84c..0967ef42 100644 --- 
a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,23 +1 @@ -{ - "granule_campaign_name_presence_check": { - "rule_name": "Campaign Name Presence Check", - "fields_to_apply": { - "echo-g": [ - { - "fields": [ - "Granule/Campaigns/Campaign/ShortName" - ] - } - ], - "umm-g": [ - { - "fields": [ - "Campaign/ShortName" - ] - } - ] - }, - "severity": "info", - "check_id": "one_item_presence_check" - } -} +{} From 91ee31c9dd9bbf68fcee4207409ed10558ba6bed Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Fri, 15 Aug 2025 15:21:05 -0500 Subject: [PATCH 38/71] Update severity based on time difference and tests --- pyQuARC/code/checker.py | 2 +- pyQuARC/code/datetime_validator.py | 17 ++++++- tests/test_datetime_validator.py | 74 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 3 deletions(-) diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py index 4bb401c7..a222bb8e 100644 --- a/pyQuARC/code/checker.py +++ b/pyQuARC/code/checker.py @@ -117,9 +117,9 @@ def build_message(self, result, rule_id): rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get( rule_id ) - severity = rule_mapping.get("severity", "error") messages = [] if not (result["valid"]) and result.get("value"): + severity = result.get("severity") or rule_mapping.get("severity", "error") for value in result["value"]: formatted_message = failure_message value = value if isinstance(value, tuple) else (value,) diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index fd67e4ef..22b11dca 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -140,7 +140,6 @@ def validate_datetime_against_granules( "granules", ) granules = cmr_request(cmr_prms) - validity = True last_granule_datetime = None date_time = None @@ -152,8 +151,22 @@ def validate_datetime_against_granules( date_time = get_date_time(datetime_string) last_granule_datetime = get_date_time(last_granule_datetime) 
validity = date_time == last_granule_datetime + else: + validity = False + + return_value = {} + if ( + (not date_time) + or not last_granule_datetime + or ((abs(date_time - last_granule_datetime).total_seconds() / 3600) > 24) + ): + return_value["severity"] = "error" - return {"valid": validity, "value": (date_time, last_granule_datetime)} + return { + **return_value, + "valid": validity, + "value": (date_time, last_granule_datetime), + } @staticmethod @if_arg diff --git a/tests/test_datetime_validator.py b/tests/test_datetime_validator.py index 9f9c0262..0358f6d0 100644 --- a/tests/test_datetime_validator.py +++ b/tests/test_datetime_validator.py @@ -1,3 +1,7 @@ +import pytest +from unittest.mock import patch +from datetime import datetime + from pyQuARC.code.datetime_validator import DatetimeValidator from tests.fixtures.validator import INPUT_OUTPUT @@ -18,3 +22,73 @@ def test_datetime_iso_format_check(self): def test_datetime_compare(self): pass + + @patch("pyQuARC.code.datetime_validator.set_cmr_prms") + @patch("pyQuARC.code.datetime_validator.cmr_request") + @patch("pyQuARC.code.datetime_validator.get_date_time") + @pytest.mark.parametrize( + "datetime_string, granule_datetime, expected_valid, expected_severity", + [ + # Exact match → valid, no severity + ("2025-08-01T00:00:00Z", "2025-08-01T00:00:00Z", True, None), + + # Different date but within 24 hours → invalid, no severity + ("2025-08-02T00:00:00Z", "2025-08-01T12:00:00Z", False, None), + + # More than 24 hours difference → invalid, severity error + ("2025-08-03T00:00:00Z", "2025-08-01T00:00:00Z", False, "error"), + + # No granules returned → valid=False, severity error + ("2025-08-01T00:00:00Z", None, False, "error"), + ], + ) + def test_validate_datetime_against_granules( + self, + mock_get_date_time, + mock_cmr_request, + mock_set_cmr_prms, + datetime_string, + granule_datetime, + expected_valid, + expected_severity, + ): + # Arrange: cmr_request mock + if granule_datetime is None: + 
mock_cmr_request.return_value = {"feed": {"entry": []}} + else: + mock_cmr_request.return_value = { + "feed": { + "entry": [ + { + "time_start": granule_datetime, + "time_end": granule_datetime, + } + ] + } + } + + mock_set_cmr_prms.return_value = {"mock": "params"} + + # Mock get_date_time to return datetime objects or None + def fake_get_date_time(val): + if val is None: + return None + return datetime.strptime(val, "%Y-%m-%dT%H:%M:%SZ") + + mock_get_date_time.side_effect = fake_get_date_time + + # Act + result = DatetimeValidator.validate_datetime_against_granules( + datetime_string, + collection_shortname="TEST", + version="1", + sort_key="start_date", + time_key="time_start", + ) + + # Assert + assert result["valid"] == expected_valid + if expected_severity: + assert result["severity"] == expected_severity + else: + assert "severity" not in result From 01661ba22017b0b0f407ee118479c2d8cdc51fe4 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 19 Aug 2025 16:00:30 -0500 Subject: [PATCH 39/71] url_description_unique_check_new #345 --- pyQuARC/code/url_validator.py | 13 ++++++++++++- pyQuARC/main.py | 4 +++- pyQuARC/schemas/rule_mapping.json | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 9b6befd6..98acd05a 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -120,7 +120,18 @@ def doi_link_update(value, bad_urls): @staticmethod @if_arg - def url_update_email_check(url, bad_urls): + def url_update_email_check(url, bad_urls=None): + if bad_urls is None: + bad_urls = [] + + if not url: + return { + "valid": False, + "value": url, + "message": "No email value provided for URL update contact.", + "remediation": "Provide a valid contact email address." 
+ } + validity = True # Check if the URL matches 'support-cddis@earthdata.nasa.gov' if url in bad_urls or url == "support-cddis@earthdata.nasa.gov": diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 6995b50c..8982d41b 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -287,7 +287,9 @@ def display_results(self): f"\n\t {COLOR['title']}{COLOR['bright']} pyQuARC ERRORS: {END}\n" ) for error in pyquarc_errors: - error_prompt += f"\t\t ERROR: {error['type']}. Details: {error['details']} \n" + error_prompt += (f"\t\t ERROR: {error.get('message', 'No message available')} \n" + f"\t\t DETAILS: {error.get('details', 'No details available')} \n" + ) if cmr_validation := error.get("cmr_validation"): cmr_error_msg = self._format_cmr_error(cmr_validation) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 7641306a..55b00436 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -5301,7 +5301,7 @@ "umm-c": [ { "fields": [ - "RelatedUrls" + "RelatedUrls/Description" ] }, { From 70a053d4c6109167c5106fd2ac58f5c42504ed24 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 19 Aug 2025 16:04:05 -0500 Subject: [PATCH 40/71] Delete contributing.md --- contributing.md | 56 ------------------------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 contributing.md diff --git a/contributing.md b/contributing.md deleted file mode 100644 index 91cf80ee..00000000 --- a/contributing.md +++ /dev/null @@ -1,56 +0,0 @@ -# Contributing File - -# Welcome to pyQuARC! -This page is meant to help you learn how you can contribute to pyQuARC! We are passionate about NASA's Open Science initiative and are open to a variety of contributions. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself. - -## How you can contribute to pyQuARC: - -1. 
**Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. - * To start you will need to proceed to the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. - * From here, look for the green button on the right side of the page labeled **New issue**. - * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. - * The page you are directed to will provide a prompt to add a title and explain how to fill in the bug you want to report. - * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. - * Beneath the description box, select "Issue Type" and "Bug". - * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors, and pyQuARC developers will automatically be assigned to the Issue and notified. - * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. - -2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. - * Suggesting a new feature is very similar to reporting a bug. You will start at the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. - * Select the green **New Issue** button found on the top right side of the page. - * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. - * The page you are directed to will provide a prompt to add a title and explain how to make a new suggestion. 
- * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. - * Beneath the description box, select "Issue Type" and "Feature". - * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors. - * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. - -3. **Directly Contribute to PyQuARC Content:** for when you want to directly edit the code to add checks or new features. - * Fork the repository - * To edit the code, you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibility settings with the original repository and allows you to create your edits. Read more about Forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). - * To create your fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. - * On the top right of the page, select the **Fork** tab. - * Under the "Owner" dropdown menu, select yourself as the owner of the new forked repository. - * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. - * You can set an optional description in the 'Description' field below. - * Make sure the checkbox next to 'Copy the master branch only' is selected. - * Click **Create fork** when you are finished to create your fork! - * After completing the steps above, you should be on a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. You have successfully created a fork of pyQuARC! 
- * Clone your fork locally - * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. - * Under the **HTTPS** tab, copy the link to the repository. - * Open a Python terminal in your preferred coding location. - * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. - * Type '__git clone__' and then paste the URL you copied a few steps above. - * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer. - * Create a new branch and make your desired changes. - * Create a PR - * Once your changes are made, push your commits. - * You can then open a Pull Request (PR) on the [**Pull requests** tab](https://github.com/NASA-IMPACT/pyQuARC/pulls) within the pyQuARC Github page. - * Set the base repository to "NASA-IMPACT/pyQuARC" and the base to "dev". - * Fill out a title and description, then submit! - * Feedback may be provided on your PR. Once it is approved, a pyQuARC team member will merge your changes. - -## Thank you for your interest in pyQuARC! -We appreciate your interest in pyQuARC! Everyone is encouraged to help improve pyQuARC, and we welcome your comments, suggestions, and new ideas! -Please contact earthdata-support@nasa.gov with any questions. 
From bf354f727702224c153d94ac49406f100234157a Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Wed, 20 Aug 2025 11:06:42 -0500 Subject: [PATCH 41/71] Removing rules_override.json content --- pyQuARC/schemas/rule_mapping.json | 2 +- pyQuARC/schemas/rules_override.json | 46 +---------------------------- 2 files changed, 2 insertions(+), 46 deletions(-) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..9cb85632 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -5275,7 +5275,7 @@ "umm-c": [ { "fields": [ - "RelatedUrls" + "RelatedUrls/Description" ] }, { diff --git a/pyQuARC/schemas/rules_override.json b/pyQuARC/schemas/rules_override.json index d2f72744..9e26dfee 100644 --- a/pyQuARC/schemas/rules_override.json +++ b/pyQuARC/schemas/rules_override.json @@ -1,45 +1 @@ - -{ - "url_description_uniqueness_check": { - "rule_name": "URL Description Uniqueness Check", - "fields_to_apply": { - "umm-c": [ - { - "fields": [ - "RelatedUrls/Description" - ] - }, - { - "fields": [ - "DataCenters/ContactInformation/RelatedUrls" - ] - } - ], - "dif10": [ - { - "fields": [ - "DIF/Multimedia_Sample" - ] - }, - { - "fields": [ - "DIF/Related_URL" - ] - } - ], - "umm-g": [ - { - "fields": [ - "RelatedUrls/URL" - ] - } - ] - }, - "data": [ - "Description" - ], - "severity": "info", - "check_id": "uniqueness_check" - } - -} +{} \ No newline at end of file From fa9d1c9da423fca0dd437a6d7ffa97d1f40b0f56 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Fri, 5 Sep 2025 11:32:47 -0500 Subject: [PATCH 42/71] Granule datetime 'Z' missing #347 --- pyQuARC/code/datetime_validator.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index fd67e4ef..97c82a98 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -87,9 +87,17 @@ def 
date_or_datetime_format_check(datetime_string): Returns: (dict) An object with the validity of the check and the instance """ + is_datetime = DatetimeValidator._iso_datetime(datetime_string) + is_date = DatetimeValidator._iso_date(datetime_string) + + # If it's a datetime, require that it ends with 'Z' + if is_datetime and not datetime_string.endswith("Z"): + is_datetime = False + + valid = is_datetime or is_date + return { - "valid": bool(DatetimeValidator._iso_datetime(datetime_string)) - or bool(DatetimeValidator._iso_date(datetime_string)), + "valid": valid, "value": datetime_string, } From 71ec2666adb5da7c9868684b946c114f39d3ee17 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 18 Sep 2025 11:14:48 -0500 Subject: [PATCH 43/71] Use Constraints check errors #291 --- pyQuARC/schemas/rule_mapping.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..9ef4c2d5 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3898,7 +3898,7 @@ ] }, "severity": "warning", - "check_id": "license_url_description_check" + "check_id": "one_item_presence_check" }, "collection_citation_presence_check": { "rule_name": "Collection Citation Presence Check", From 393f4bc841f916fe5d9ecd566a7045142c14f7c7 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 18 Sep 2025 13:02:01 -0500 Subject: [PATCH 44/71] Granule-url_description_uniqueness_check fixes 2 #324 #331 --- pyQuARC/schemas/rule_mapping.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 9cb85632..6c38d41a 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -3898,7 +3898,7 @@ ] }, "severity": "warning", - "check_id": "license_url_description_check" + "check_id": "one_item_presence_check" }, "collection_citation_presence_check": { 
"rule_name": "Collection Citation Presence Check", @@ -5275,7 +5275,7 @@ "umm-c": [ { "fields": [ - "RelatedUrls/Description" + "RelatedUrls" ] }, { @@ -5308,7 +5308,7 @@ "Description" ], "severity": "info", - "check_id": "uniqueness_check" + "check_id": "one_item_presence_check" }, "online_resource_description_uniqueness_check": { "rule_name": "Online Resource Description Uniqueness Check", @@ -5342,7 +5342,7 @@ "URLDescription" ], "severity": "info", - "check_id": "uniqueness_check" + "check_id": "one_item_presence_check" }, "metadata_update_time_logic_check": { "rule_name": "Metadata Update Time Logic Check", From 4e3dee7feaf633f30529a959c66fc0e51df25755 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 29 Sep 2025 16:57:55 -0500 Subject: [PATCH 45/71] Final version of code of conduct --- pyQuARC_Code_of_Conduct.md | 99 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 pyQuARC_Code_of_Conduct.md diff --git a/pyQuARC_Code_of_Conduct.md b/pyQuARC_Code_of_Conduct.md new file mode 100644 index 00000000..ec17c808 --- /dev/null +++ b/pyQuARC_Code_of_Conduct.md @@ -0,0 +1,99 @@ +Contributor Covenant Code of Conduct + +Our Pledge + +One of the primary goals of the pyQuARC repository is to cultivate a respectful, inclusive, equitable, and collaborative environment for all users, community members, stakeholders, and developers from diverse backgrounds. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a safe and positive experience for every member of the community. + +Our Standards + +We strive to create a welcoming culture that empowers people to provide outstanding open science. Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. 
+ +Examples of behaviors that contribute to a positive community environment include: + +Engaging in professional interactions with other members that are respectful, empathetic, and courteous + +Providing and receiving constructive feedback + +Accepting responsibility and offering sincere apologies when mistakes are made + +Prioritizing the well-being of the community as a whole over individual interests + +Examples of unacceptable behavior include: + +Using sexualized language or imagery, or making unwelcome sexual attention or advances of any kind + +Trolling, making insulting or derogatory remarks, or engaging in personal or political attacks + +Harassing others in any form, whether public or private + +Publishing private information (e.g., physical address, email address) without explicit permission + +Engaging in any other conduct that could reasonably be considered inappropriate in a professional setting + +Commit of Malicious Code + +Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. + +This Code of Conduct applies both within project spaces and in public settings where an individual is representing the project or its community. Additional guidance on appropriate conduct will also be provided for in-person and virtual events. + +Key Definitions + +A participant is anyone who creates an issue, posts a comment, or question in the pyQuARC GitHub repository. + +A contributor is an individual who submits a pull request or code commit to the pyQuARC GitHub repository. 
+ +A moderator is an individual appointed to oversee and moderate comments, issues, pull requests, and code commits, as well as manage access to the pyQuARC repository. + + as “Moderators are organization members who, in addition to their permissions as members, are allowed to block and unblock non-member contributors, set interaction limits, and hide comments in public repositories owned by the organization.” GitHub moderators can hide comments, pull requests, and issues; block or unblock contributors; and limit interactions for specific users. + +GitHub resources for moderation can be. + +Enforcement Guidelines + +Community moderators will follow these Community Impact Guidelines when determining the consequences for any action deemed in violation of this Code of Conduct: + +First Code of Conduct Violation (Warning) + +If a participant violates the Code of Conduct for the first time, a community moderator will contact the individual as soon as possible and promptly remove the content. + +Participant: Content removed + contacted by the community moderator + +Contributor: PR not accepted and removed from GitHub + contacted by the community moderator + +Second Code of Conduct Violation (Temporary Ban) + +If a participant violates the Code of Conduct a second time, they will be contacted by a community moderator and informed of a temporary ban from the repository. + +Participant: Content removed + contacted by the community moderator + ban for 90 days from the space where the offense occurred + +Contributor: PR not accepted and removed from GitHub + contacted by the community moderator + banned from submitting PRs for 90 days + +Third Code of Conduct Violation (Permanent Ban) + +If a participant violates the Code of Conduct a third time and demonstrates a repeated pattern of disregarding community standards. In that case, they will be permanently banned and removed from the pyQuARC GitHub repository. 
+ +Participant: Content removed + contacted by the community moderator + permanent ban from the repository + +Contributor: PR not accepted and removed from GitHub + contacted by the community moderator + permanent ban from the repository + +Additional note: If a contributor submits a pull request that is harmful to our digital spaces (e.g., malicious code), they will be immediately and permanently banned from the pyQuARC repository. + +Question? + +If you have a question about how to contribute to the pyQuARC library, please refer to the Contributing file (contributing.md) in the pyQuARC repository. For all other inquiries, including reports of potential violations of this Code of Conduct, please contact . + +Attributions + +The pyQuARC Code of Conduct has been adopted from the following sources: + +, + + and the . + + from Contributor-Covenant.org + + from the General Services Administration \ No newline at end of file From b14a74a21e366f7e48f490ffa94f98b880bf5f5a Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 30 Sep 2025 11:58:08 -0500 Subject: [PATCH 46/71] updated readme file with resources folder for videos --- README.md | 190 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 149 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 03da9ce2..93bc4e78 100644 --- a/README.md +++ b/README.md @@ -5,79 +5,153 @@ [![DOI](https://zenodo.org/badge/153786129.svg)](https://zenodo.org/doi/10.5281/zenodo.10724716) ## Introduction - The pyQuARC (*pronounced "pie-quark"*) library was designed to read and evaluate descriptive metadata used to catalog Earth observation data products and files. This type of metadata focuses and limits attention to important aspects of data, such as the spatial and temporal extent, in a structured manner that can be leveraged by data catalogs and other applications designed to connect users to data. Therefore, poor quality metadata (e.g. 
inaccurate, incomplete, improperly formatted, inconsistent) can yield subpar results when users search for data. Metadata that inaccurately represents the data it describes risks matching users with data that does not reflect their search criteria and, in the worst-case scenario, can make data impossible to find. Given the importance of high quality metadata, it is necessary that metadata be regularly assessed and updated as needed. pyQuARC is a tool that can help streamline the process of assessing metadata quality by automating it as much as possible. In addition to basic validation checks (e.g. adherence to the metadata schema, controlled vocabularies, and link checking), pyQuARC flags opportunities to improve or add contextual metadata information to help the user connect to, access, and better understand the data product. pyQuARC also ensures that information common to both data product (i.e. collection) and the file-level (i.e. granule) metadata are consistent and compatible. As open source software, pyQuARC can be adapted and customized to allow for quality checks unique to different needs. -## pyQuARC Base Package +## pyQuARC Metadata Quality Framework +pyQuARC was designed to assess metadata in NASA’s [Common Metadata Repository (CMR)](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components), a centralized repository for all of NASA’s Earth observation data products. In addition, the CMR contains metadata for Earth observation products submitted by external partners. The CMR serves as the backend for NASA’s Earthdata Search ([search.earthdata.nasa.gov](https://search.earthdata.nasa.gov/)) and is also the authoritative metadata source for NASA’s [Earth Observing System Data and Information System (EOSDIS)](https://earthdata.nasa.gov/eosdis). 
-pyQuARC was specifically designed to assess metadata in NASA’s [Common Metadata Repository (CMR)](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components), which is a centralized metadata repository for all of NASA’s Earth observation data products. In addition to NASA’s ~9,000 data products, the CMR also holds metadata for over 40,000 additional Earth observation data products submitted by external data partners. The CMR serves as the backend for NASA’s Earthdata Search (search.earthdata.nasa.gov) and is also the authoritative metadata source for NASA’s [Earth Observing System Data and Information System (EOSDIS).](https://earthdata.nasa.gov/eosdis) +pyQuARC was initially developed by a group called the [Analysis and Review of the CMR (ARC)](https://www.earthdata.nasa.gov/data/projects/analysis-review-cmr-project) team. The ARC team conducted quality assessments of NASA’s metadata records in the CMR, identified opportunities for improvement in the metadata records, and collaborated with the data archive centers to resolve any identified issues. ARC has developed a [metadata quality assessment framework](http://doi.org/10.5334/dsj-2021-017) which specifies a common set of assessment criteria. These criteria focus on correctness, completeness, and consistency with the goal of making data more discoverable, accessible, and usable. The ARC metadata quality assessment framework is the basis for the metadata checks that have been incorporated into pyQuARC base package. Specific quality criteria for each CMR metadata element are documented in the [Earthdata Wiki space](https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page). -pyQuARC was developed by a group called the [Analysis and Review of the CMR (ARC)](https://earthdata.nasa.gov/esds/impact/arc) team. 
The ARC team conducts quality assessments of NASA’s metadata records in the CMR, identifies opportunities for improvement in the metadata records, and collaborates with the data archive centers to resolve any identified issues. ARC has developed a [metadata quality assessment framework](http://doi.org/10.5334/dsj-2021-017) which specifies a common set of assessment criteria. These criteria focus on correctness, completeness, and consistency with the goal of making data more discoverable, accessible, and usable. The ARC metadata quality assessment framework is the basis for the metadata checks that have been incorporated into pyQuARC base package. Specific quality criteria for each CMR metadata element is documented in the following wiki: -[https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page](https://wiki.earthdata.nasa.gov/display/CMR/CMR+Metadata+Best+Practices%3A+Landing+Page) +Each metadata element’s wiki page includes a “Metadata Validation and QA/QC” section that lists quality criteria categorized by priority levels, referred to as a priority matrix. The levels in the [priority matrix](https://wiki.earthdata.nasa.gov/spaces/CMR/pages/109874556/ARC+Priority+Matrix) are designated as high (red), medium (yellow), or low (blue), and are intended to communicate the importance of meeting the specified criteria. -There is an “ARC Metadata QA/QC” section on the wiki page for each metadata element that lists quality criteria categorized by level of [priority. Priority categories](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) are designated as high (red), medium (yellow), or low (blue), and are intended to communicate the importance of meeting the specified criteria. +The CMR is designed around its own metadata standard called the [Unified Metadata Model (UMM)](https://www.earthdata.nasa.gov/about/esdis/eosdis/cmr/umm).
In addition to being an extensible metadata model, the UMM provides a crosswalk for mapping among the various CMR-supported metadata standards, including DIF10, ECHO10, ISO 19115-1, and ISO 19115-2. -The CMR is designed around its own metadata standard called the [Unified Metadata Model (UMM).](https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components/cmr/umm) In addition to being an extensible metadata model, the UMM also provides a cross-walk for mapping between the various CMR-supported metadata standards. CMR-supported metadata standards currently include: -* [DIF10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/directory-interchange-format-dif-standard) (Collection/Data Product-level only) -* [ECHO10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/echo-metadata-standard) (Collection/Data Product and Granule/File-level metadata) -* [ISO19115-1 and ISO19115-2](https://earthdata.nasa.gov/esdis/eso/standards-and-references/iso-19115) (Collection/Data Product and Granule/File-level metadata) +pyQuARC currently supports the following metadata standards: * [UMM-JSON](https://wiki.earthdata.nasa.gov/display/CMR/UMM+Documents) (UMM) - * UMM-C (Collection/Data Product-level metadata) - * UMM-G (Granule/File-level metadata) - * UMM-S (Service metadata) - * UMM-T (Tool metadata) + * Collection/Data Product-level metadata (UMM-C) + * Granule/File-level metadata (UMM-G) +* [ECHO10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/echo-metadata-standard) + * Collection/Data Product-level metadata (ECHO-C) + * Granule/File-level metadata (ECHO-G) +* [DIF10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/directory-interchange-format-dif-standard) + * Collection/Data Product-level only +## pyQuARC User Demo Series +A series of user demos has been created to explain what pyQuARC does and how it can be used. 
These demos cover the process of installing, activating, and using the library for a specific schema. The demo files are available in the **resources** folder of the pyQuARC GitHub repository. -pyQuARC supports DIF10 (collection only), ECHO10 (collection and granule), UMM-C, and UMM-G standards. At this time, there are no plans to add ISO 19115 or UMM-S/T specific checks. **Note that pyQuARC development is still underway, so further enhancements and revisions are planned.** +## Install and Clone the Repository +The pyQuARC library requires `Python 3.10` to function properly across all operating systems. -**For inquiries, please email: sheyenne.kirkland@uah.edu** +### 1. Open your Command Prompt or Terminal and use the following command to clone the pyQuARC repository: +* `git clone https://github.com/NASA-IMPACT/pyQuARC.git` -## pyQuARC as a Service (QuARC) +Note: If you see the message `fatal: destination path 'pyQuARC' already exists and is not an empty directory` when running this command, it means the repository has already been cloned. To reclone it, delete the folder and its contents using the following command before running the original command again. -QuARC is pyQuARC deployed as a service and can be found here: https://quarc.nasa-impact.net/docs/. +* `rmdir /s /q pyQuARC` # deletes the directory (be cautious) -QuARC is still in beta but is regularly synced with the latest version of pyQuARC on GitHub. Fully cloud-native, the architecture diagram of QuARC is shown below: +Additional note: If you want to know where your freshly cloned pyQuARC folder ended up, you can use the following command to print your working directory: -![QuARC](https://user-images.githubusercontent.com/17416300/179866276-7c025699-01a1-4d3e-93cd-50e12c5a5ec2.png) +* `pwd` # for Linux/MacOS operating systems +* `cd` # for Windows operating systems + +This will show you the full path to the directory where the cloned pyQuARC repository is located. 
You can then append `\pyQuARC` to the end of the path to get the full path to the folder. + +### 2. Configure and Activate Environment: +Create an environment to set up an isolated workspace for using pyQuARC. You can do this with Anaconda/Miniconda (Option A) or with Python’s built-in `venv` module (Option B). + +**A. Use the Conda package manager to create and name the environment:** +* `conda create --name ENV_NAME` # - Replace `ENV_NAME` with the name of your environment. + +**B. Use the Python interpreter to create a virtual environment in your current directory:** +* `python -m venv env` + +Next, activate the environment using either Option A or Option B, depending on how you created it in the previous step: + +**A. Activate the Conda environment with the Conda package manager:** +* `conda activate ENV_NAME` + +**B. Activate the Python virtual environment** +For macOS/Linux operating systems, use the following: +* `source env/bin/activate` + +For Windows operating systems, use the following command: +* `env\Scripts\activate` + +Note: On Windows, you may encounter an error with this command. If that happens, use: +* `.\env\Scripts\Activate.ps1` -## Architecture +Be sure to reference the correct location of the env directory, as you may need to activate either the `.bat` or `.ps1` script. This error is uncommon. +### 3. Install Requirements +Next, install the required packages. The requirements are included as a text file in the repository and will be available on your local machine automatically once you clone the pyQuARC repository. Before installing the requirements, make sure you are in your working directory and navigate to the pyQuARC folder. + +Navigate to your directory: +* `cd` + +Navigate to the pyQuARC folder: +* `cd pyQuARC` + +Install the requirements: +* `pip install -r requirements.txt` + +You are almost there! Open your code editor (e.g., VS Code), navigate to the location where you cloned the repository, select the pyQuARC folder, and click Open.
You should now be able to see all the existing files and contents of the pyQuARC folder in your code editor. Voilà! You are ready to use pyQuARC! + +## pyQuARC Architecture ![pyQuARC Architecture](/images/architecture.png) -The Downloader is used to obtain a copy of a metadata record of interest from the CMR. This is accomplished using a [CMR API query,](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) where the metadata record of interest is identified by its unique identifier in the CMR (concept_id). CMR API documentation can be found here: -[https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) +pyQuARC uses a Downloader to obtain a copy of a metadata record of interest from the CMR API. This is accomplished using a [CMR API query](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html), where the metadata record of interest is identified by its unique identifier in the CMR (concept_id). For more information, please visit the [CMR API documentation](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html). -There is also the option to select and run pyQuARC on a metadata record already downloaded to your local desktop. +After cloning the repository, you can find a set of files in the `schemas` folder including `checks.json`, `rule_mapping.json`, and `check_messages.json` that define and apply the rules used to evaluate metadata. Each rule is specified by its `rule_id`, associated function, and any dependencies on specific metadata elements. -The `checks.json` file includes a comprehensive list of rules. Each rule is specified by its `rule_id,` associated function, and any dependencies on specific metadata elements. +* The `checks.json` file contains a comprehensive list of all metadata quality rules used by pyQuARC. Each rule in this file includes a `check_function` that specifies the name of the check.
+* The `check_messages.json` file contains the messages that are displayed when a check fails. You can use the `check_function` name from the `checks.json` file to locate the output message associated with each check. +* The `rule_mapping.json` file specifies which metadata element(s) each rule applies to. -The `rule_mapping.json` file specifies which metadata element(s) each rule applies to. The `rule_mapping.json` also references the `messages.json` file which includes messages that can be displayed when a check passes or fails. +Furthermore, the `rule_mapping.json` file specifies the severity level associated with a failure. If a check fails, it is assigned one of three categories: ❌ Error, ⚠️ Warning, or ℹ️ Info. These categories correspond to priority levels in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and indicate the importance of the failed check. Default severity values are based on ARC’s metadata quality assessment framework but can be customized to meet individual needs. -Furthermore, the `rule_mapping.json` file specifies the level of severity associated with a failure. If a check fails, it will be assigned a severity category of “error”, “warning”, or "info.” These categories correspond to priority categorizations in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and communicate the importance of the failed check, with “error” being the most critical category, “warning” indicating a failure of medium priority, and “info” indicating a minor issue or inconsistency. Default severity values are assigned based on ARC’s metadata quality assessment framework, but can be customized to meet individual needs. +❌ Error → most critical issues +⚠️ Warning → medium-priority issues +ℹ️ Info → minor issues -## Customization -pyQuARC is designed to be customizable. 
Output messages can be modified using the `messages_override.json` file - any messages added to `messages_override.json` will display over the default messages in the `message.json` file. Similarly, there is a `rule_mapping_override.json` file which can be used to override the default settings for which rules/checks are applied to which metadata elements. +In the `code` folder, you will find a series of Python files containing the implementations for each check. For example, the `data_format_gcmd_check` listed in the `checks.json` file can be found in the `string_validator.py` file, where the code performs the check using a string validator. -There is also the opportunity for more sophisticated customization. New QA rules can be added and existing QA rules can be edited or removed. Support for new metadata standards can be added as well. Further details on how to customize pyQuARC will be provided in the technical user’s guide below. +## Run pyQuARC on a Single Record -While the pyQuARC base package is currently managed by the ARC team, the long term goal is for it to be owned and governed by the broader EOSDIS metadata community. +### Locating the Concept ID +To run pyQuARC on a single record, either at the collection (data product) level or the granule (individual file) level, you will need the associated Concept ID. If you don’t know the Concept ID for the record, you can find it by following these steps: -## Install/User’s Guide -### Running the program +1. Go to NASA [Earthdata Search](https://search.earthdata.nasa.gov/) and locate the data product of interest. +2. Click Collection Details and locate the dataset’s Short Name, which is often highlighted in gray along with the Version number (for example: Short Name = Aqua_AIRS_MODIS1km_IND, Version = 1). +3. Copy the Short Name and Version number, then modify the following path: -*Note:* This program requires `Python 3.8` installed in your system. 
+* `https://cmr.earthdata.nasa.gov/search/collections.umm-json?entry_id=SHORTNAME_VERSION#.2&all_revisions=true` -**Clone the repo:** [https://github.com/NASA-IMPACT/pyQuARC/](https://github.com/NASA-IMPACT/pyQuARC/) +You will need to replace `SHORTNAME` in the path with the actual Short Name of the dataset (for example: Aqua_AIRS_MODIS1km_IND). +You will also need to replace `VERSION#` in the path with the actual Version number listed under Collection Details in Earthdata Search (for example: 1). -**Go to the project directory:** `cd pyQuARC` +For the dataset “Aqua AIRS-MODIS 1-km Matchup Indexes V1 (Aqua_AIRS_MODIS1km_IND) at GES_DISC” with Short Name Aqua_AIRS_MODIS1km_IND and Version 1, the path is modified as follows: + +* `https://cmr.earthdata.nasa.gov/search/collections.umm-json?entry_id=Aqua_AIRS_MODIS1km_IND_1&all_revisions=true` + +You should now be able to find the `concept-id` for that collection (data product). + +For individual files (granules), locating the Concept ID is straightforward. In [Earthdata Search](https://search.earthdata.nasa.gov/), find the file of interest, click View Details, and then check the Information tab to see the Concept ID. -**Create a python virtual environment:** `python -m venv env` +### Running pyQuARC Using the Concept ID +Now that you have identified the Concept ID for the collection (data product) or granule (individual file) metadata, you can use the following command in your code editor to curate it: -**Activate the environment:** `source env/bin/activate` +* `python pyQuARC/main.py --concept_ids CONCEPT_ID --format FORMAT` -**Install the requirements:** `pip install -r requirements.txt` +`CONCEPT_ID` should be replaced with the Concept ID of the collection or granule-level metadata (for example: `C2515837343-GES_DISC`). +`FORMAT` should be replaced with the schema you are using to validate the metadata. This will differ depending on whether you are curating collection- or granule-level metadata. 
The list of acceptable formats is as follows: + +- `umm-c` (for collection) +- `umm-g` (for granule) +- `echo-c` (for collection) +- `echo-g` (for granule) +- `dif10` (for collection only) + +**Example** +For `C2515837343-GES_DISC`, the command above can be modified as follows: + +`python pyQuARC/main.py --concept_ids C2515837343-GES_DISC --format umm-c` + +In this example, `CONCEPT_ID` has been replaced with `C2515837343-GES_DISC`, and `FORMAT` has been replaced with `umm-c`. + +### Running pyQuARC on a Local File +There is also the option to select and run pyQuARC on a metadata record already downloaded to your local desktop. **Run `main.py`:** @@ -110,8 +184,33 @@ or ▶ python pyQuARC/main.py --file "/Users/batman/projects/pyQuARC/tests/fixtures/test_cmr_metadata.echo10" ``` -### Adding a custom rule +## Run pyQuARC on Multiple Records +pyQuARC has the capability to run metadata checks on multiple collection or granule IDs. This feature allows users to perform validation checks on multiple records simultaneously. When performing validation checks on multiple records, it is essential that all records share the same schema format, which could be one of the following: `umm-c`, `umm-g`, `echo-c`, `echo-g`, or `dif10`. + +To run pyQuARC on multiple records, use one of the following options/commands: + +A. List the collection IDs consecutively, separated by commas. The results will be displayed in the console. + +`python pyQuARC/main.py --concept_ids , , , …. --format umm-c` + +B. If you have multiple collection IDs (e.g., more than 10 records), it is recommended to create a text file listing the collection IDs. The format of the records should be: + + + +…… + + +`python pyQuARC/main.py --concept_ids $(cat pyQuARC/files.txt) --format umm-c` + +C. If you prefer to save the output from multiple records to a `.csv` file for reference, use the following command.
Note that the output format may not be perfectly structured due to the default settings used when writing output from the Python console. +`python pyQuARC/main.py --concept_ids , , , …. --format umm-c > pyquarc_output.csv` + +## Customization +pyQuARC is designed to be customizable. Output messages can be modified using the `messages_override.json` file - any messages added to `messages_override.json` will display over the default messages in the `check_messages.json` file. Similarly, there is a `rules_override.json` file which can be used to override the default settings for which rules/checks are applied to which metadata elements. There is also the opportunity for more sophisticated customization. New QA rules can be added and existing QA rules can be edited or removed. Support for new metadata standards can be added as well. + +### Adding a custom rule To add a custom rule, follow the following steps: **Add an entry to the `schemas/rule_mapping.json` file in the form:** @@ -389,7 +488,6 @@ The values 0 and 1 do not amount to a true value >>> ... ``` - **To provide custom messages for new or old fields:** ```python @@ -418,3 +516,13 @@ The values 0 and 1 do not amount to a true value >>> validator.validate() >>> ... ``` + +## pyQuARC as a Service (QuARC) +QuARC is pyQuARC deployed as a service and can be found here: https://quarc.nasa-impact.net/docs/. + +QuARC is still in beta but is regularly synced with the latest version of pyQuARC on GitHub. QuARC is fully cloud-native; its architecture diagram is shown below: + +![QuARC](https://user-images.githubusercontent.com/17416300/179866276-7c025699-01a1-4d3e-93cd-50e12c5a5ec2.png) + +## Have a question? +If you have any questions, please contact us at **earthdata-support@nasa.gov**.
From de3b84cd6af0515cdb97d69ba6df0f4690ed3fe3 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 30 Sep 2025 12:46:32 -0500 Subject: [PATCH 47/71] updated md file -Code of conduct - Final version #352 --- Code_of_Conduct.md | 75 +++++++++++++++++++++++++++++ pyQuARC_Code_of_Conduct.md | 99 -------------------------------------- 2 files changed, 75 insertions(+), 99 deletions(-) create mode 100644 Code_of_Conduct.md delete mode 100644 pyQuARC_Code_of_Conduct.md diff --git a/Code_of_Conduct.md b/Code_of_Conduct.md new file mode 100644 index 00000000..fd3d165a --- /dev/null +++ b/Code_of_Conduct.md @@ -0,0 +1,75 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge +One of the primary goals of the pyQuARC repository is to cultivate a respectful, inclusive, equitable, and collaborative environment for all users, community members, stakeholders, and developers from diverse backgrounds. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a safe and positive experience for every member of the community. + +## Our Standards +We strive to create a welcoming culture that empowers people to provide outstanding open science. Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. 
+ +Examples of behaviors that contribute to a positive community environment include: + +* Engaging in professional interactions with other members that are respectful, empathetic, and courteous +* Providing and receiving constructive feedback +* Accepting responsibility and offering sincere apologies when mistakes are made +* Prioritizing the well-being of the community as a whole over individual interests + +Examples of unacceptable behavior include: + +* Using sexualized language or imagery, or making unwelcome sexual attention or advances of any kind +* Trolling, making insulting or derogatory remarks, or engaging in personal or political attacks +* Harassing others in any form, whether public or private +* Publishing private information (e.g., physical address, email address) without explicit permission +* Engaging in any other conduct that could reasonably be considered inappropriate in a professional setting +* Committing malicious code + +## Enforcement Responsibilities +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. + +This Code of Conduct applies both within project spaces and in public settings where an individual is representing the project or its community. Additional guidance on appropriate conduct will also be provided for in-person and virtual events. + +## Key Definitions +* A **participant** is anyone who creates an issue, posts a comment, or asks a question in the pyQuARC GitHub repository. 
+* A **contributor** is an individual who submits a pull request or code commit to the pyQuARC GitHub repository. +* A **moderator** is an individual appointed to oversee and moderate comments, issues, pull requests, and code commits, as well as manage access to the pyQuARC repository. + +[GitHub](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/roles-in-an-organization#organization-moderators) defines moderators as follows: “Moderators are organization members who, in addition to their permissions as members, are allowed to block and unblock non-member contributors, set interaction limits, and hide comments in public repositories owned by the organization.” GitHub moderators can hide comments, pull requests, and issues; block or unblock contributors; and limit interactions for specific users. GitHub resources for moderation can be found [here](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/managing-moderators-in-your-organization#about-organization-moderators). + +## Enforcement Guidelines +Community moderators will follow these Community Impact Guidelines when determining the consequences for any action deemed in violation of this Code of Conduct: + +### First Code of Conduct Violation (Warning) +If a participant violates the Code of Conduct for the first time, a community moderator will contact the individual as soon as possible and promptly remove the content. + +* **Participant:** Content removed + contacted by the community moderator +* **Contributor:** PR not accepted and removed from GitHub + contacted by the community moderator + +### Second Code of Conduct Violation (Temporary Ban) +If a participant violates the Code of Conduct a second time, they will be contacted by a community moderator and informed of a temporary ban from the repository. 
+ +* **Participant:** Content removed + contacted by the community moderator + banned for 90 days from the space where the offense occurred +* **Contributor:** PR not accepted and removed from GitHub + contacted by the community moderator + banned from submitting PRs for 90 days + +### Third Code of Conduct Violation (Permanent Ban) +If a participant violates the Code of Conduct a third time, demonstrating a repeated pattern of disregarding community standards, they will be permanently banned and removed from the pyQuARC GitHub repository. + +* **Participant:** Content removed + contacted by the community moderator + permanent ban from the repository +* **Contributor:** PR not accepted and removed from GitHub + contacted by the community moderator + permanent ban from the repository + +**Additional note:** If a contributor submits a pull request that is harmful to our digital spaces (e.g., malicious code), they will be immediately and permanently banned from the pyQuARC repository. + +## Question? +If you have a question about how to contribute to the pyQuARC library, please refer to the Contributing file (contributing.md) in the pyQuARC repository. For all other inquiries, including reports of potential violations of this Code of Conduct, please contact earthdata-support@nasa.gov. 
+ +## Attributions +The pyQuARC Code of Conduct has been adopted from the following sources: + +* [The GSA Code of Conduct](https://handbook.tts.gsa.gov/about-us/code-of-conduct/) +* [The Contributor Covenant](https://www.contributor-covenant.org/), [version 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.md) +* [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion#code-of-conduct--enforcement) +* [The Citizen Code of Conduct](https://github.com/stumpsyn/policies/blob/master/citizen_code_of_conduct.md) +* [Django Code of Conduct](https://www.djangoproject.com/conduct/) +* [The TTS Handbook](https://handbook.tts.gsa.gov/about-us/code-of-conduct/) +* [Ada Initiative](https://adainitiative.org/) +* [National Aeronautics and Space Administration Open-Source Software Policy](https://www.earthdata.nasa.gov/engage/open-data-services-software-policies/open-source-software-policy) +* [MetaDocencia - Transform to Open Science repository](https://github.com/MetaDocencia/Transform-to-Open-Science_ES) \ No newline at end of file diff --git a/pyQuARC_Code_of_Conduct.md b/pyQuARC_Code_of_Conduct.md deleted file mode 100644 index ec17c808..00000000 --- a/pyQuARC_Code_of_Conduct.md +++ /dev/null @@ -1,99 +0,0 @@ -Contributor Covenant Code of Conduct - -Our Pledge - -One of the primary goals of the pyQuARC repository is to cultivate a respectful, inclusive, equitable, and collaborative environment for all users, community members, stakeholders, and developers from diverse backgrounds. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a safe and positive experience for every member of the community. - -Our Standards - -We strive to create a welcoming culture that empowers people to provide outstanding open science. 
Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. - -Examples of behaviors that contribute to a positive community environment include: - -Engaging in professional interactions with other members that are respectful, empathetic, and courteous - -Providing and receiving constructive feedback - -Accepting responsibility and offering sincere apologies when mistakes are made - -Prioritizing the well-being of the community as a whole over individual interests - -Examples of unacceptable behavior include: - -Using sexualized language or imagery, or making unwelcome sexual attention or advances of any kind - -Trolling, making insulting or derogatory remarks, or engaging in personal or political attacks - -Harassing others in any form, whether public or private - -Publishing private information (e.g., physical address, email address) without explicit permission - -Engaging in any other conduct that could reasonably be considered inappropriate in a professional setting - -Commit of Malicious Code - -Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. - -This Code of Conduct applies both within project spaces and in public settings where an individual is representing the project or its community. Additional guidance on appropriate conduct will also be provided for in-person and virtual events. - -Key Definitions - -A participant is anyone who creates an issue, posts a comment, or question in the pyQuARC GitHub repository. 
- -A contributor is an individual who submits a pull request or code commit to the pyQuARC GitHub repository. - -A moderator is an individual appointed to oversee and moderate comments, issues, pull requests, and code commits, as well as manage access to the pyQuARC repository. - - as “Moderators are organization members who, in addition to their permissions as members, are allowed to block and unblock non-member contributors, set interaction limits, and hide comments in public repositories owned by the organization.” GitHub moderators can hide comments, pull requests, and issues; block or unblock contributors; and limit interactions for specific users. - -GitHub resources for moderation can be. - -Enforcement Guidelines - -Community moderators will follow these Community Impact Guidelines when determining the consequences for any action deemed in violation of this Code of Conduct: - -First Code of Conduct Violation (Warning) - -If a participant violates the Code of Conduct for the first time, a community moderator will contact the individual as soon as possible and promptly remove the content. - -Participant: Content removed + contacted by the community moderator - -Contributor: PR not accepted and removed from GitHub + contacted by the community moderator - -Second Code of Conduct Violation (Temporary Ban) - -If a participant violates the Code of Conduct a second time, they will be contacted by a community moderator and informed of a temporary ban from the repository. - -Participant: Content removed + contacted by the community moderator + ban for 90 days from the space where the offense occurred - -Contributor: PR not accepted and removed from GitHub + contacted by the community moderator + banned from submitting PRs for 90 days - -Third Code of Conduct Violation (Permanent Ban) - -If a participant violates the Code of Conduct a third time and demonstrates a repeated pattern of disregarding community standards. 
In that case, they will be permanently banned and removed from the pyQuARC GitHub repository. - -Participant: Content removed + contacted by the community moderator + permanent ban from the repository - -Contributor: PR not accepted and removed from GitHub + contacted by the community moderator + permanent ban from the repository - -Additional note: If a contributor submits a pull request that is harmful to our digital spaces (e.g., malicious code), they will be immediately and permanently banned from the pyQuARC repository. - -Question? - -If you have a question about how to contribute to the pyQuARC library, please refer to the Contributing file (contributing.md) in the pyQuARC repository. For all other inquiries, including reports of potential violations of this Code of Conduct, please contact . - -Attributions - -The pyQuARC Code of Conduct has been adopted from the following sources: - -, - - and the . - - from Contributor-Covenant.org - - from the General Services Administration \ No newline at end of file From aca4e407dd6541316791469d285a35f91c114b77 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 2 Oct 2025 12:58:18 -0500 Subject: [PATCH 48/71] CMR latest version #355 --- pyQuARC/main.py | 76 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 7 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 6995b50c..d6af5bd0 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -138,6 +138,60 @@ def _cmr_query(self): query = f"{orig_query}&page_num={page_num}" return concept_ids + + def _get_latest_version(self, concept_id): + """ + Fetches the latest revision version for a given concept_id from CMR + + Args: + concept_id (str): The concept ID to query + + Returns: + str: The latest revision number, or None if not found + """ + try: + # Construct the CMR metadata URL for the concept + url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" + headers = get_headers() + response = requests.get(url, 
headers=headers) + + if response.status_code == 200: + # Extract revision-id from response headers + revision_id = response.headers.get('CMR-Revision-Id') + return revision_id + else: + print(f"Warning: Could not fetch latest version for {concept_id}. Using default.") + return None + except Exception as e: + print(f"Error fetching latest version for {concept_id}: {str(e)}") + return None + + def _get_collection_version(self, concept_id): + """ + Fetch the MetadataSpecification.Version of a collection from CMR. + Args: + concept_id (str): The concept ID to query. + + Returns: + str: The collection's MetadataSpecification.Version, or None if not found. + """ + try: + url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" + headers = get_headers() + response = requests.get(url, headers=headers) + + if response.status_code == 200: + data = response.json() + # UMM collections have MetadataSpecification.Version + version = data.get("MetadataSpecification", {}).get("Version") + return version + else: + print(f"Warning: Could not fetch metadata for {concept_id}.") + return None + except Exception as e: + print(f"Error fetching collection version for {concept_id}: {str(e)}") + return None + def _validate_with_cmr(self, concept_id, metadata_content): """ @@ -181,8 +235,20 @@ def validate(self): if self.concept_ids: for concept_id in tqdm(self.concept_ids): + # If no version specified, get the latest version + version_to_use = self.version + if not version_to_use: + version_to_use = self._get_latest_version(concept_id) + if version_to_use: + print(f"Using latest version {version_to_use} for {concept_id}") + + # Fetch schema version too + collection_version = self._get_collection_version(concept_id) + if collection_version: + print(f"Collection {concept_id} schema version: {collection_version}") + downloader = Downloader( - concept_id, self.metadata_format, self.version, self.cmr_host + concept_id, self.metadata_format, version_to_use, self.cmr_host ) if not (content 
:= downloader.download()): self.errors.append( @@ -194,17 +260,11 @@ def validate(self): ) continue content = content.encode() - cmr_response = self._validate_with_cmr(concept_id, content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { "concept_id": concept_id, "errors": validation_errors, - "cmr_validation": { - "errors": cmr_response.json().get("errors", []), - # TODO: show warnings - "warnings": cmr_response.json().get("warnings", []) - }, "pyquarc_errors": pyquarc_errors, } ) @@ -212,6 +272,7 @@ def validate(self): elif self.file_path: with open(os.path.abspath(self.file_path), "r") as myfile: content = myfile.read().encode() + validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -388,3 +449,4 @@ def display_results(self): ) results = arc.validate() arc.display_results() + \ No newline at end of file From 649573063f31f564effd8ea40711f6f5c6803813 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 7 Oct 2025 14:23:00 -0500 Subject: [PATCH 49/71] Cmr latest version #355 - included changes in main.py --- pyQuARC/main.py | 5 ++++- requirements.txt | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index d6af5bd0..4baf36f5 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -348,7 +348,10 @@ def display_results(self): f"\n\t {COLOR['title']}{COLOR['bright']} pyQuARC ERRORS: {END}\n" ) for error in pyquarc_errors: - error_prompt += f"\t\t ERROR: {error['type']}. 
Details: {error['details']} \n" + error_prompt += ( + f"\t\t ERROR: {error.get('message', 'No message available')} \n" + f"\t\t DETAILS: {error.get('details', 'No details available')} \n" + ) if cmr_validation := error.get("cmr_validation"): cmr_error_msg = self._format_cmr_error(cmr_validation) diff --git a/requirements.txt b/requirements.txt index 30aec17c..6432dc89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ colorama==0.4.4 idna==2.10 jsonschema==4.17.3 -lxml==4.9.1 +lxml==5.3.0 #4.9.1 pytest==5.4.3 pytz==2020.1 requests==2.24.0 From d85925c7ee88b89702cd1c0eef6a0b7322e01979 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 8 Oct 2025 10:09:51 -0500 Subject: [PATCH 50/71] Add severity to result --- pyQuARC/code/custom_checker.py | 3 +++ pyQuARC/code/datetime_validator.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyQuARC/code/custom_checker.py b/pyQuARC/code/custom_checker.py index f38cedda..55f514d0 100644 --- a/pyQuARC/code/custom_checker.py +++ b/pyQuARC/code/custom_checker.py @@ -184,6 +184,7 @@ def run( for future in as_completed(future_results): try: func_return = future.result() + severity = func_return.get("severity") valid = func_return["valid"] # can be True, False or None if valid is not None: if valid: @@ -196,4 +197,6 @@ def run( raise e result["valid"] = validity result["value"] = invalid_values + if severity: + result["severity"] = severity return result diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index 22b11dca..b312d3fa 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -158,7 +158,7 @@ def validate_datetime_against_granules( if ( (not date_time) or not last_granule_datetime - or ((abs(date_time - last_granule_datetime).total_seconds() / 3600) > 24) + or abs((date_time - last_granule_datetime).total_seconds() / 3600) > 24 ): return_value["severity"] = "error" From 3ede23b517a3fbcdcdaf66c430fc855a1127ed9b Mon 
Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 8 Oct 2025 13:22:40 -0500 Subject: [PATCH 51/71] removed demo videos -- Updated README file #354 --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 93bc4e78..6e4d61b1 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,6 @@ pyQuARC currently supports the following metadata standards: * [DIF10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/directory-interchange-format-dif-standard) * Collection/Data Product-level only -## pyQuARC User Demo Series -A series of user demos has been created to explain what pyQuARC does and how it can be used. These demos cover the process of installing, activating, and using the library for a specific schema. The demo files are available in the **resources** folder of the pyQuARC GitHub repository. - ## Install and Clone the Repository The pyQuARC library requires `Python 3.10` to function properly across all operating systems. From ce05f46b833ec52aa3298dfc5c20034a8ea29c04 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 8 Oct 2025 13:35:15 -0500 Subject: [PATCH 52/71] Return datetime string instead of datetime instance --- pyQuARC/code/datetime_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index b312d3fa..dfe907dc 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -147,9 +147,9 @@ def validate_datetime_against_granules( # Compare the precision of the two datetime strings if len(granules["feed"]["entry"]) > 0: last_granule = granules["feed"]["entry"][0] - last_granule_datetime = last_granule.get(time_key) + last_granule_datetime_string = last_granule.get(time_key) date_time = get_date_time(datetime_string) - last_granule_datetime = get_date_time(last_granule_datetime) + last_granule_datetime = get_date_time(last_granule_datetime_string) validity = date_time == 
last_granule_datetime else: validity = False @@ -165,7 +165,7 @@ def validate_datetime_against_granules( return { **return_value, "valid": validity, - "value": (date_time, last_granule_datetime), + "value": (datetime_string, last_granule_datetime_string), } @staticmethod From 9469487dd1674edf90736797163e58edd23ec46d Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 8 Oct 2025 15:17:14 -0500 Subject: [PATCH 53/71] Schema update for echo-c and echo-g #328 --- pyQuARC/code/schema_validator.py | 55 +++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 26f2d315..48ab62f0 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -19,7 +19,10 @@ SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" - +REMOTE_XML_SCHEMAS = { + "echo10_collection": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Collection.xsd", + "echo10_granule": "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0/Granule.xsd" +} class SchemaValidator: """ @@ -71,26 +74,54 @@ def __init__( self.validator_func = self.run_xml_validator self.check_messages = check_messages + + def read_xml_schema(self): """ - Reads the xml schema file + Reads the XML schema file (either from a remote URL or local path). 
""" - # The XML schema file (echo10_xml.xsd) imports another schema file (MetadataCommon.xsd) - # Python cannot figure out the import if they are in a different location than the calling script - # Thus we need to set an environment variable to let it know where the files are located - # Path to catalog must be a url + from urllib.request import urlopen + + # Maintain XML catalog handling catalog_path = f"file:{pathname2url(str(SCHEMA_PATHS['catalog']))}" - # Temporarily set the environment variable os.environ["XML_CATALOG_FILES"] = os.environ.get( "XML_CATALOG_FILES", catalog_path ) - with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: - file_content = schema_file.read().encode() - xmlschema_doc = etree.parse(BytesIO(file_content)) - schema = etree.XMLSchema(xmlschema_doc) - return schema + def get_raw_schema_url(browse_url: str) -> str: + """Convert /browse/ URL into /raw/ for direct XML download.""" + if "/browse/" in browse_url: + return browse_url.replace("/browse/", "/raw/") + "?at=refs%2Fheads%2Fmaster" + return browse_url + # Select remote schema if metadata_format matches + schema_url = REMOTE_XML_SCHEMAS.get(self.metadata_format) + try: + if schema_url: + raw_url = get_raw_schema_url(schema_url) + print(f"Fetching schema remotely from: {raw_url}") + import ssl + ssl_context = ssl._create_unverified_context() # Disable certificate check safely for this fetch + with urlopen(raw_url, context=ssl_context) as response: + file_content = response.read() + else: + # Fallback to local schema file + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as schema_file: + file_content = schema_file.read().encode() + + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + + except Exception as e: + print(f"⚠️ Remote fetch failed or unavailable for {self.metadata_format}: {e}") + print("Falling back to local schema file...") + with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"]) as 
schema_file: + file_content = schema_file.read().encode() + xmlschema_doc = etree.parse(BytesIO(file_content)) + schema = etree.XMLSchema(xmlschema_doc) + return schema + def read_json_schema(self): """ Reads the json schema file From 884d9bb52edd6d9ba1c9d1cddcee1c6742bd604a Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 13:45:14 -0500 Subject: [PATCH 54/71] code change before dev merge - LA --- pyQuARC/schemas/check_messages.json | 2 +- pyQuARC/schemas/check_messages_override.json | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..03562612 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -53,7 +53,7 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" }, - "remediation": "The EntryTitle/DataSetId should not be identical to the ShortName. Recommend providing a descriptive, formal title for the dataset. " + "remediation": "Recommend providing a more descriptive title for the dataset. " }, "abstract_length_check": { "failure": "The abstract provided may be inadequate based on length.", diff --git a/pyQuARC/schemas/check_messages_override.json b/pyQuARC/schemas/check_messages_override.json index 2fee2195..311847da 100644 --- a/pyQuARC/schemas/check_messages_override.json +++ b/pyQuARC/schemas/check_messages_override.json @@ -1,11 +1,2 @@ -{ - "shortname_uniqueness": { - "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", - "help": { - "message": "", - "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" - }, - "remediation": "Recommend providing a more descriptive title for the dataset. 
" - } -} +{} From 789fb39a9adb23af2fca8415c702890c976ad2af Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 13:49:19 -0500 Subject: [PATCH 55/71] Added space in checks.json - LA --- pyQuARC/schemas/checks.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index ef303aa6..1fab4cd9 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -300,8 +300,8 @@ "available": true }, "opendap_link_check": { - "data_type": "custom", - "check_function": "opendap_link_check", - "available": true + "data_type": "custom", + "check_function": "opendap_link_check", + "available": true } } From ffacfce4170aa171cc64f3000e6aa9a1605b1833 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Mon, 13 Oct 2025 13:55:49 -0500 Subject: [PATCH 56/71] Fix calling before assignment --- pyQuARC/code/datetime_validator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py index dfe907dc..03da3faf 100644 --- a/pyQuARC/code/datetime_validator.py +++ b/pyQuARC/code/datetime_validator.py @@ -142,6 +142,7 @@ def validate_datetime_against_granules( granules = cmr_request(cmr_prms) validity = True last_granule_datetime = None + last_granule_datetime_string = None date_time = None # Compare the precision of the two datetime strings @@ -151,6 +152,9 @@ def validate_datetime_against_granules( date_time = get_date_time(datetime_string) last_granule_datetime = get_date_time(last_granule_datetime_string) validity = date_time == last_granule_datetime + diff_bigger_than_a_day = abs( + (date_time - last_granule_datetime).total_seconds() / 3600 + ) > 24 else: validity = False @@ -158,7 +162,7 @@ def validate_datetime_against_granules( if ( (not date_time) or not last_granule_datetime - or abs((date_time - last_granule_datetime).total_seconds() / 3600) > 24 + or diff_bigger_than_a_day ): 
return_value["severity"] = "error" From bf8628d3aed2dc0aee502b51865b4d8f538eccb6 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 15:44:40 -0500 Subject: [PATCH 57/71] Space added - LA --- pyQuARC/schemas/check_messages.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 18bde23f..58a1ecdd 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -1077,6 +1077,6 @@ "message": "OPeNDAP links allow for direct data access through the OPeNDAP protocol.", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" }, - "remediation": "Recommend providing an OPeNDAP in the granule's Online Resources or Related URLs fields for enhanced data accessibility." + "remediation": "Recommend providing an OPeNDAP in the granule's Online Resources or Related URLs fields for enhanced data accessibility." } } \ No newline at end of file From 40e520275db98036421b96c7a8e9dad14d0ed926 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 17:11:44 -0500 Subject: [PATCH 58/71] Code changes for _get_collection_version -LA --- pyQuARC/main.py | 82 +++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 51 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 4baf36f5..87c5edcb 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -139,58 +139,37 @@ def _cmr_query(self): return concept_ids - def _get_latest_version(self, concept_id): - """ - Fetches the latest revision version for a given concept_id from CMR - - Args: - concept_id (str): The concept ID to query - - Returns: - str: The latest revision number, or None if not found - """ - try: - # Construct the CMR metadata URL for the concept - url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" - headers = get_headers() - response = requests.get(url, headers=headers) - - if response.status_code == 200: - # 
Extract revision-id from response headers - revision_id = response.headers.get('CMR-Revision-Id') - return revision_id - else: - print(f"Warning: Could not fetch latest version for {concept_id}. Using default.") - return None - except Exception as e: - print(f"Error fetching latest version for {concept_id}: {str(e)}") - return None def _get_collection_version(self, concept_id): """ - Fetch the MetadataSpecification.Version of a collection from CMR. + Fetches collection information from CMR for a given concept_id. Args: concept_id (str): The concept ID to query. - + info_type (str): Type of information to fetch. + Options: "revision" or "metadata_version". + Returns: - str: The collection's MetadataSpecification.Version, or None if not found. + str: The requested info (revision ID or MetadataSpecification.Version), or None if not found. """ try: url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" headers = get_headers() response = requests.get(url, headers=headers) - if response.status_code == 200: - data = response.json() - # UMM collections have MetadataSpecification.Version - version = data.get("MetadataSpecification", {}).get("Version") - return version - else: - print(f"Warning: Could not fetch metadata for {concept_id}.") - return None + if response.status_code != 200: + print(f"Warning: Could not fetch data for {concept_id}. 
Status: {response.status_code}") + return {"revision_id": None, "metadata_version": None} + + data = response.json() if response.content else {} + return { + "revision_id": response.headers.get("CMR-Revision-Id"), + "metadata_version": data.get("MetadataSpecification", {}).get("Version"), + } + except Exception as e: - print(f"Error fetching collection version for {concept_id}: {str(e)}") - return None + # Unified error handling — return dict even on failure + print(f"Error fetching collection info for {concept_id}: {str(e)}") + return {"revision_id": None, "metadata_version": None} def _validate_with_cmr(self, concept_id, metadata_content): @@ -236,17 +215,16 @@ def validate(self): if self.concept_ids: for concept_id in tqdm(self.concept_ids): # If no version specified, get the latest version - version_to_use = self.version - if not version_to_use: - version_to_use = self._get_latest_version(concept_id) - if version_to_use: - print(f"Using latest version {version_to_use} for {concept_id}") - - # Fetch schema version too - collection_version = self._get_collection_version(concept_id) - if collection_version: - print(f"Collection {concept_id} schema version: {collection_version}") - + # Get both revision and metadata version in one call + info = self._get_collection_version(concept_id) + version_to_use = self.version or info["revision_id"] + metadata_version = info["metadata_version"] + + if version_to_use: + print(f"Using latest revision {version_to_use} for {concept_id}") + if metadata_version: + print(f"Collection {concept_id} schema version: {metadata_version}") + downloader = Downloader( concept_id, self.metadata_format, version_to_use, self.cmr_host ) @@ -259,6 +237,7 @@ def validate(self): } ) continue + content = content.encode() validation_errors, pyquarc_errors = checker.run(content) self.errors.append( @@ -272,7 +251,6 @@ def validate(self): elif self.file_path: with open(os.path.abspath(self.file_path), "r") as myfile: content = myfile.read().encode() - 
validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { @@ -281,8 +259,10 @@ def validate(self): "pyquarc_errors": pyquarc_errors, } ) + return self.errors + @staticmethod def _error_message(messages): severities = ["error", "warning", "info"] From fe6874047c47ad3d32f48cfd947211830d711878 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 14 Oct 2025 11:46:06 -0500 Subject: [PATCH 59/71] Additional Online Access and Resources - new: LA --- pyQuARC/main.py | 5 ++++- pyQuARC/schemas/rule_mapping.json | 26 ++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 6995b50c..707e4808 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -287,7 +287,10 @@ def display_results(self): f"\n\t {COLOR['title']}{COLOR['bright']} pyQuARC ERRORS: {END}\n" ) for error in pyquarc_errors: - error_prompt += f"\t\t ERROR: {error['type']}. Details: {error['details']} \n" + error_prompt += ( + f"\t\t ERROR: {error.get('message', 'No message available')} \n" + f"\t\t DETAILS: {error.get('details', 'No details available')} \n" + ) if cmr_validation := error.get("cmr_validation"): cmr_error_msg = self._format_cmr_error(cmr_validation) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 7641306a..15b1b173 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -4137,6 +4137,11 @@ "fields": [ "RelatedUrls/Type" ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/Type" + ] } ], "umm-g": [ @@ -4170,6 +4175,11 @@ } ], "umm-c": [ + { + "fields": [ + "Collection/OnlineResources/OnlineResource/Type" + ] + }, { "fields": [ "RelatedUrls/Type", @@ -4187,7 +4197,7 @@ ] }, "severity": "warning", - "check_id": "availability_check" + "check_id": "one_item_presence_check" }, "characteristic_name_uniqueness_check": { "rule_name": "Characteristic Name Uniqueness Check", @@ -4818,11 +4828,23 @@ 
"RelatedUrls/Description", "RelatedUrls/URL" ] + }, + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URLDescription", + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/Description", + "Granule/OnlineResources/OnlineResource/URL" + ] } ] }, "severity": "warning", - "check_id": "availability_check" + "check_id": "one_item_presence_check" }, "get_data_url_check": { "rule_name": "GET DATA URL check", From c0b0fa5d61587078cf75b9e17b897465bc4f9edf Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Tue, 14 Oct 2025 12:31:54 -0500 Subject: [PATCH 60/71] Added the missing contributing file for dev -LA --- contributing.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 contributing.md diff --git a/contributing.md b/contributing.md new file mode 100644 index 00000000..91cf80ee --- /dev/null +++ b/contributing.md @@ -0,0 +1,56 @@ +# Contributing File + +# Welcome to pyQuARC! +This page is meant to help you learn how you can contribute to pyQuARC! We are passionate about NASA's Open Science initiative and are open to a variety of contributions. Read below to find ways that you can contribute to our project, either through reporting bugs, suggesting new features, or even directly editing the code yourself. + +## How you can contribute to pyQuARC: + +1. **Report a Bug:** for when you find something within the code that does not respond the way you expected/wanted it to. + * To start you will need to proceed to the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC Github page. + * From here, look for the green button on the right side of the page labeled **New issue**. + * Select **Bug Report** from the list that appears, so that you can create a report to help us improve an aspect of pyQuARC. 
+ * The page you are directed to will provide a prompt to add a title and explain how to fill in the bug you want to report. + * If you change your mind about reporting a bug, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. + * Beneath the description box, select "Issue Type" and "Bug". + * When you are finished describing the bug you wish to report, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors, and pyQuARC developers will automatically be assigned to the Issue and notified. + * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the red __bug__ tag. + +2. **Suggest a New Feature:** for when you think of something that could enhance pyQuARC for other users. + * Suggesting a new feature is very similar to reporting a bug. You will start at the [**Issues** tab](https://github.com/NASA-IMPACT/pyQuARC/issues) within the pyQuARC GitHub page. + * Select the green **New issue** button found on the top right side of the page. + * From the menu that appears, select **Feature Request** so that you can suggest an idea for our project. + * The page you are directed to will provide a prompt to add a title and explain how to make a new suggestion. + * If you change your mind about making a feature request, there is a white button on the bottom right of the page labeled **Cancel** where you can either decide to keep editing or close and discard your issue. + * Beneath the description box, select "Issue Type" and "Feature". + * When you are finished describing your suggestion, you can click the big green button at the bottom of the page labeled **Create**. This will make your Issue visible to all pyQuARC contributors.
+ * You can see your new issue if you return to the **Issues** page of the pyQuARC GitHub and look for your title followed by the green __new check__ tag. + +3. **Directly Contribute to pyQuARC Content:** for when you want to directly edit the code to add checks or new features. + * Fork the repository + * To edit the code, you will need to first create your own 'fork' of the repository. A fork is a new repository that shares code and visibility settings with the original repository and allows you to create your edits. Read more about forks [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo). + * To create your fork of pyQuARC, return to the [**Code**](https://github.com/NASA-IMPACT/pyQuARC) tab of the pyQuARC GitHub. + * On the top right of the page, select the **Fork** tab. + * Under the "Owner" dropdown menu, select yourself as the owner of the new forked repository. + * The fork name will default to 'pyQuARC'. If you wish to name your fork something different, edit the 'Repository Name' field. + * You can set an optional description in the 'Description' field below. + * Make sure the checkbox next to 'Copy the master branch only' is selected. + * Click **Create fork** when you are finished to create your fork! + * After completing the steps above, you should be on a new page titled the same as your new fork, with "forked from NASA-IMPACT/pyQuARC" beneath the title. You have successfully created a fork of pyQuARC! + * Clone your fork locally + * Now we will store the files locally on your computer so you will be able to edit the code. Click the green dropdown button labeled **<> Code**. + * Under the **HTTPS** tab, copy the link to the repository. + * Open a terminal in your preferred coding location. + * Change your working directory to wherever you want your cloned pyQuARC repository to be stored. + * Type '__git clone__' and then paste the URL you copied a few steps above.
+ * Press **Enter** and your local clone of pyQuARC will be created! You can now explore all of the files on your local computer. + * Create a new branch and make your desired changes. + * Create a PR + * Once your changes are made, push your commits. + * You can then open a Pull Request (PR) on the [**Pull requests** tab](https://github.com/NASA-IMPACT/pyQuARC/pulls) within the pyQuARC GitHub page. + * Set the base repository to "NASA-IMPACT/pyQuARC" and the base branch to "dev". + * Fill out a title and description, then submit! + * Feedback may be provided on your PR. Once it is approved, a pyQuARC team member will merge your changes. + +## Thank you for your interest in pyQuARC! +We appreciate your interest in pyQuARC! Everyone is encouraged to help improve pyQuARC, and we welcome your comments, suggestions, and new ideas! +Please contact earthdata-support@nasa.gov with any questions. From d003d48e11c6594393c38feb3ee5b392c2e4f395 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Wed, 15 Oct 2025 12:13:14 -0500 Subject: [PATCH 61/71] Updated links in the readme file -la --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e4d61b1..89ce3600 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ pyQuARC currently supports the following metadata standards: * [UMM-JSON](https://wiki.earthdata.nasa.gov/display/CMR/UMM+Documents) (UMM) * Collection/Data Product-level metadata (UMM-C) * Granule/File-level metadata (UMM-G) -* [ECHO10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/echo-metadata-standard) +* [ECHO10](https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas/browse/schemas/10.0) * Collection/Data Product-level metadata (ECHO-C) * Granule/File-level metadata (ECHO-G) -* [DIF10](https://earthdata.nasa.gov/esdis/eso/standards-and-references/directory-interchange-format-dif-standard) +* [DIF10](https://git.earthdata.nasa.gov/projects/EMFD/repos/dif-schemas/browse) * 
Collection/Data Product-level only ## Install and Clone the Repository From f404f2a7bef268830a93dcb0dd9a9ce488a5efdc Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 15 Oct 2025 14:11:59 -0500 Subject: [PATCH 62/71] Remove square brackets from cmr errors and refactor --- pyQuARC/main.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 8982d41b..0159b67a 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -237,22 +237,25 @@ def _error_message(messages): @staticmethod def _format_cmr_error(cmr_validation): - if not cmr_validation.get("errors"): + cmr_errors = cmr_validation.get("errors") + if not cmr_errors: return None error_msg_dict = {} error_msg = "" - if errors := cmr_validation.get("errors"): - for error in errors: - if type(error) is dict and error.get("path"): - if error["path"][0] not in error_msg_dict: - error_msg_dict[error["path"][0]] = [] - error_msg_dict[error["path"][0]].append(error['errors']) - else: - error_msg_dict["Misc"] = [error] + for error in cmr_errors: + if type(error) is dict and error.get("path"): + if error["path"][0] not in error_msg_dict: + error_msg_dict[error["path"][0]] = [] + error_msg_dict[error["path"][0]].append(error['errors']) + else: + error_msg_dict["Misc"] = [error] for path, errors in error_msg_dict.items(): error_msg += f"\n\t>> {path}: {END}\n" for error in errors: - error_msg += f"\t\t{COLOR['error']}Error:{END} {str(error)}\n" + error_str = str(error) + if isinstance(error, list): + error_str = ", ".join(error) + error_msg += f"\t\t{COLOR['error']}Error:{END} {error_str}\n" return error_msg def display_results(self): From c65795c7d4e644bdaadadf8f3c5ce09bf44a7a5d Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 16 Oct 2025 10:31:50 -0500 Subject: [PATCH 63/71] Updated code of conduct --- Code_of_Conduct.md | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git 
a/Code_of_Conduct.md b/Code_of_Conduct.md index fd3d165a..90e8037b 100644 --- a/Code_of_Conduct.md +++ b/Code_of_Conduct.md @@ -1,42 +1,39 @@ # Contributor Covenant Code of Conduct ## Our Pledge -One of the primary goals of the pyQuARC repository is to cultivate a respectful, inclusive, equitable, and collaborative environment for all users, community members, stakeholders, and developers from diverse backgrounds. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a safe and positive experience for every member of the community. +One of the primary goals of the pyQuARC repository is to cultivate a respectful and collaborative environment for all users, community members, stakeholders, and developers. Our Code of Conduct is grounded in the FAIR principles (Findable, Accessible, Interoperable, and Reusable) and outlines our expectations for all participants, as well as the consequences of unacceptable behavior. We invite all users to help us create a positive experience for every member of the community. ## Our Standards -We strive to create a welcoming culture that empowers people to provide outstanding open science. Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. +We strive to create a space that empowers people to provide outstanding contributions to open science. Achieving this requires an open exchange of ideas, guided by thoughtful and respectful standards. 
-Examples of behaviors that contribute to a positive community environment include: +Examples of behaviors that contribute to a positive community environment include the following: -* Engaging in professional interactions with other members that are respectful, empathetic, and courteous +* Engaging in professional interactions with other members that are respectful and courteous * Providing and receiving constructive feedback -* Accepting responsibility and offering sincere apologies when mistakes are made +* Accepting responsibility when mistakes are made * Prioritizing the well-being of the community as a whole over individual interests -Examples of unacceptable behavior include: +Examples of unacceptable behavior include the following: -* Using sexualized language or imagery, or making unwelcome sexual attention or advances of any kind -* Trolling, making insulting or derogatory remarks, or engaging in personal or political attacks -* Harassing others in any form, whether public or private +* Using inappropriate or suggestive language or imagery +* Trolling, making insulting or derogatory remarks, or engaging in personal attacks * Publishing private information (e.g., physical address, email address) without explicit permission -* Engaging in any other conduct that could reasonably be considered inappropriate in a professional setting -* Commit of Malicious Code +* Committing malicious code ## Enforcement Responsibilities -Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. -Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.
+Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior. They will take appropriate action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. -This Code of Conduct applies both within project spaces and in public settings where an individual is representing the project or its community. Additional guidance on appropriate conduct will also be provided for in-person and virtual events. +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Key Definitions -* A **participant** is anyone who creates an issue, posts a comment, or question in the pyQuARC GitHub repository. +* A **participant** is anyone who creates an issue, posts a comment, or submits a question in the pyQuARC GitHub repository. * A **contributor** is an individual who submits a pull request or code commit to the pyQuARC GitHub repository. * A **moderator** is an individual appointed to oversee and moderate comments, issues, pull requests, and code commits, as well as manage access to the pyQuARC repository. [GitHub](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/roles-in-an-organization#organization-moderators) defines a moderator as “Moderators are organization members who, in addition to their permissions as members, are allowed to block and unblock non-member contributors, set interaction limits, and hide comments in public repositories owned by the organization.” GitHub moderators can hide comments, pull requests, and issues; block or unblock contributors; and limit interactions for specific users. 
GitHub resources for moderation can be found [here](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/managing-moderators-in-your-organization#about-organization-moderators). ## Enforcement Guidelines -Community moderators will follow these Community Impact Guidelines when determining the consequences for any action deemed in violation of this Code of Conduct: +Community moderators will follow the community impact guidelines detailed below when determining the consequences for any action deemed in violation of this Code of Conduct. ### First Code of Conduct Violation (Warning) If a participant violates the Code of Conduct for the first time, a community moderator will contact the individual as soon as possible and promptly remove the content. @@ -56,10 +53,10 @@ If a participant violates the Code of Conduct a third time and demonstrates a re * **Participant:** Content removed + contacted by the community moderator + permanent ban from the repository * **Contributor:** PR not accepted and removed from GitHub + contacted by the community moderator + permanent ban from the repository -**Additional note:** If a contributor submits a pull request that is harmful to our digital spaces (e.g., malicious code), they will be immediately and permanently banned from the pyQuARC repository. +**Additional note:** If a contributor submits a pull request that is harmful to our digital spaces (e.g., malicious code), they will be immediately and permanently banned from the pyQuARC repository. ## Question? -If you have a question about how to contribute to the pyQuARC library, please refer to the Contributing file (contributing.md) in the pyQuARC repository. For all other inquiries, including reports of potential violations of this Code of Conduct, please contact earthdata-support@nasa.gov. 
+If you have a question about how to contribute to the pyQuARC library, please refer to the contributing file (contributing.md) in the pyQuARC repository. For all other inquiries, including reports of potential violations of this Code of Conduct, please contact earthdata-support@nasa.gov. ## Attributions The pyQuARC Code of Conduct has been adopted from the following sources: From 3eeba95f095e4c4385f12d15c29597639f45cdf2 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 16 Oct 2025 12:37:05 -0500 Subject: [PATCH 64/71] Added granule umm-g version 1.6.5 --- pyQuARC/code/schema_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 1156f8b4..bc413177 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -14,7 +14,7 @@ DEFAULT_UMM_C_VERSION = "v1.18.4" # Or any other version you prefer as default # Define UMM-G versions if you want to make it flexible as well -SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"] +SUPPORTED_UMM_G_VERSIONS = ["v1.6.6", "v1.6.5"] DEFAULT_UMM_G_VERSION = "v1.6.6" SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" From 2f65c0752350c46b3d4c64d104ab6b4ef3d5db68 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Thu, 16 Oct 2025 14:37:41 -0500 Subject: [PATCH 65/71] Updated schema for 1.18.4 only --- pyQuARC/code/schema_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index bc413177..621b249c 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -10,11 +10,11 @@ from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G -SUPPORTED_UMM_C_VERSIONS = ["v1.18.4", "v1.18.3", "v1.18.2"] +SUPPORTED_UMM_C_VERSIONS = ["v1.18.4"] DEFAULT_UMM_C_VERSION = "v1.18.4" # Or any other version you prefer as default # Define UMM-G versions if you want to make it flexible as well 
-SUPPORTED_UMM_G_VERSIONS = ["v1.6.6", "v1.6.5"] +SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"] DEFAULT_UMM_G_VERSION = "v1.6.6" SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm" From 020599273f8dcb8fc5f1cd8f415b1ee481239e34 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 16 Oct 2025 15:00:13 -0500 Subject: [PATCH 66/71] Fix url desc uniqueness and missing checks --- pyQuARC/schemas/rule_mapping.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 6c7aac72..633bb584 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -4872,7 +4872,7 @@ ] }, "severity": "warning", - "check_id": "one_item_presence_check" + "check_id": "availability_check" }, "get_data_url_check": { "rule_name": "GET DATA URL check", @@ -5351,7 +5351,7 @@ "umm-c": [ { "fields": [ - "RelatedUrls/Description" + "RelatedUrls" ] }, { @@ -5384,7 +5384,7 @@ "Description" ], "severity": "info", - "check_id": "one_item_presence_check" + "check_id": "uniqueness_check" }, "online_resource_description_uniqueness_check": { "rule_name": "Online Resource Description Uniqueness Check", From ed5a9412cbe55e8937e4924b45df0293d81c8035 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 16 Oct 2025 15:20:46 -0500 Subject: [PATCH 67/71] Refactor and undo cmr_validation removal --- pyQuARC/main.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 87c5edcb..20621ea6 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -145,20 +145,16 @@ def _get_collection_version(self, concept_id): Fetches collection information from CMR for a given concept_id. Args: concept_id (str): The concept ID to query. - info_type (str): Type of information to fetch. - Options: "revision" or "metadata_version". Returns: - str: The requested info (revision ID or MetadataSpecification.Version), or None if not found. 
+ dict: {"revision_id": str | None, "metadata_version": str | None } A dict of Revision ID and Metadata Version of the collection. """ + failure_return_value = {"revision_id": None, "metadata_version": None} try: url = f"{self.cmr_host}/search/concepts/{concept_id}.umm_json" headers = get_headers() response = requests.get(url, headers=headers) - - if response.status_code != 200: - print(f"Warning: Could not fetch data for {concept_id}. Status: {response.status_code}") - return {"revision_id": None, "metadata_version": None} + response.raise_for_status() data = response.json() if response.content else {} return { @@ -169,7 +165,7 @@ def _get_collection_version(self, concept_id): except Exception as e: # Unified error handling — return dict even on failure print(f"Error fetching collection info for {concept_id}: {str(e)}") - return {"revision_id": None, "metadata_version": None} + return failure_return_value def _validate_with_cmr(self, concept_id, metadata_content): @@ -218,12 +214,7 @@ def validate(self): # Get both revision and metadata version in one call info = self._get_collection_version(concept_id) version_to_use = self.version or info["revision_id"] - metadata_version = info["metadata_version"] - - if version_to_use: - print(f"Using latest revision {version_to_use} for {concept_id}") - if metadata_version: - print(f"Collection {concept_id} schema version: {metadata_version}") + # metadata_version = info["metadata_version"] downloader = Downloader( concept_id, self.metadata_format, version_to_use, self.cmr_host @@ -239,11 +230,17 @@ def validate(self): continue content = content.encode() + cmr_response = self._validate_with_cmr(concept_id, content) validation_errors, pyquarc_errors = checker.run(content) self.errors.append( { "concept_id": concept_id, "errors": validation_errors, + "cmr_validation": { + "errors": cmr_response.json().get("errors", []), + # TODO: show warnings + "warnings": cmr_response.json().get("warnings", []) + }, "pyquarc_errors": 
pyquarc_errors, } ) From 24d990968ffe8cf68d981e0e15d09bde57ad1130 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 16 Oct 2025 15:31:13 -0500 Subject: [PATCH 68/71] Readd concept id printing --- pyQuARC/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 20621ea6..49458433 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -214,7 +214,10 @@ def validate(self): # Get both revision and metadata version in one call info = self._get_collection_version(concept_id) version_to_use = self.version or info["revision_id"] - # metadata_version = info["metadata_version"] + + metadata_version = info["metadata_version"] + if metadata_version: + print(f"Collection {concept_id} schema version: {metadata_version}") downloader = Downloader( concept_id, self.metadata_format, version_to_use, self.cmr_host From 5a3489d129e8bf82dbd0aeed847a22bbe75fb9ee Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Fri, 17 Oct 2025 12:57:24 -0500 Subject: [PATCH 69/71] Echo-g OnlineAccessUrls and OnlineResource revised --- pyQuARC/code/url_validator.py | 138 +++++++++++---- pyQuARC/schemas/check_messages.json | 20 ++- pyQuARC/schemas/checks.json | 10 ++ pyQuARC/schemas/rule_mapping.json | 256 ++++++++++++++++++++++++++++ 4 files changed, 388 insertions(+), 36 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 55a74e61..fc93ad2f 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls): starts_with_http.add(text) return starts_with_http + @staticmethod + def _status_code_from_request(url): + """ + Return HTTP status code for url, raising requests exceptions to caller. 
+ """ + headers = get_headers() + return requests.get(url, headers=headers, timeout=10).status_code + + @staticmethod + def _extract_and_normalize_urls(text_with_urls): + """ + Extract URLs from text, include tokens that start with 'http', strip trailing dots, + and return (set_of_urls, joined_value_string). + """ + extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) + urls = extractor.find_urls(text_with_urls) + urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + # remove dots at the end and deduplicate + urls = set(url[:-1] if url.endswith(".") else url for url in urls) + value = ", ".join(urls) + return urls, value + @staticmethod @if_arg def health_and_status_check(text_with_urls): @@ -45,48 +67,96 @@ def health_and_status_check(text_with_urls): (dict) An object with the validity of the check and the instance/results """ - def status_code_from_request(url): - headers = get_headers() - # timeout = 10 seconds, to allow for slow but not invalid connections - return requests.get(url, headers=headers, timeout=10).status_code + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("http"): + try: + response_code = 400 + # UrlValidator._status_code_from_request(url) + if response_code == 200: + if url.startswith("http://"): + secure_url = url.replace("http://", "https://") + if UrlValidator._status_code_from_request(secure_url) == 200: + result = { + "url": url, + "error": f"The url{url} is secure. 
Please use 'https' instead of 'http'.", + } + results.append(result) + + else: + continue + else: + result = {"url": url, "error": f"Status code {response_code}"} + results.append(result) + except requests.ConnectionError: + result = {"url": url, "error": f"The URL {url} does not exist on Internet."} + results.append(result) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def protocol_checks(text_with_urls): + """ + Checks the ftp included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains ftp + Returns: + (dict) An object with the validity of the check and the instance/results + """ results = [] validity = True - # extract URLs from text - extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) - urls = extractor.find_urls(text_with_urls) - urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' 
at the end) - # remove duplicated urls - urls = set(url[:-1] if url.endswith(".") else url for url in urls) - value = ", ".join(urls) + for url in urls: + if url.startswith("ftp://"): + results.append({ + "url": url, + "error": f"The URL {url} exists" + }) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def secure_url_checks(text_with_urls): + """ + Checks whether the secure link (https) is included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains https + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # check that URL returns a valid response for url in urls: - if not url.startswith("http"): - url = f"http://{url}" - try: - response_code = status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": "The URL is secure. 
Please use 'https' instead of 'http'.", - } - else: - continue - else: - result = {"url": url, "error": f"Status code {response_code}"} - except requests.ConnectionError: - result = {"url": url, "error": "The URL does not exist on Internet."} - except: - result = {"url": url, "error": "Some unknown error occurred."} - results.append(result) + url="http://" + if url.startswith("http://"): + results.append({ + "url": url, + "error": f"The URL {url} is not secure" + }) if results: validity = False diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index aa6bcdd1..13b9f394 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -40,12 +40,28 @@ "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time." }, "url_check": { - "failure": "A URL with a status code other than 200 has been identified: `{}`.", + "failure": "`{}`.", "help": { "message": "", "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" }, - "remediation": "This often indicates a broken link. If the URL is broken, recommend revising." + "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https." + }, + "protocol_check": { + "failure": "The following URL `{}` does not exist.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend removing the ftp access link." 
+ }, + "secure_url_check": { + "failure": "`{}`.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend updating the following link(s) from 'http' to 'https':" }, "shortname_uniqueness": { "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..c2303240 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -24,6 +24,16 @@ "check_function": "health_and_status_check", "available": true }, + "protocol_check": { + "data_type": "url", + "check_function": "protocol_checks", + "available": true + }, + "secure_url_check": { + "data_type": "url", + "check_function": "secure_url_checks", + "available": true + }, "string_compare": { "data_type": "string", "check_function": "compare", diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 9afd5059..394a7ad3 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -812,6 +812,262 @@ "severity": "error", "check_id": "url_check" }, + + "protocol_check": { + "rule_name": "protocol_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ 
+ { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "error", + "check_id": "protocol_check" + }, + "secure_url_check": { + "rule_name": "secure_url_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + 
"DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "info", + "check_id": "secure_url_check" + }, + "shortname_uniqueness": { "rule_name": "Short Name uniqueness check", "fields_to_apply": { From e84ba7924ab75b720164750be59af02f1750172c Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Fri, 17 Oct 2025 13:00:34 -0500 Subject: [PATCH 70/71] Remaining changes --- pyQuARC/code/url_validator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index fc93ad2f..dd049eaa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -76,8 +76,7 @@ def health_and_status_check(text_with_urls): for url in urls: if url.startswith("http"): try: - response_code = 400 - # UrlValidator._status_code_from_request(url) + response_code = UrlValidator._status_code_from_request(url) if response_code == 200: if url.startswith("http://"): secure_url = url.replace("http://", "https://") @@ -151,7 +150,6 @@ def secure_url_checks(text_with_urls): urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) for url in urls: - url="http://" if url.startswith("http://"): results.append({ "url": url, From eea3d5dea5ebd126e958f8c8e412e3349a9a7fbe Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti <80163528+bhawana11@users.noreply.github.com> Date: Fri, 17 Oct 2025 
14:56:59 -0500 Subject: [PATCH 71/71] Fixes: fixes in the health and status check --- pyQuARC/code/url_validator.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 756f30cb..fc4d7efa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -74,24 +74,17 @@ def health_and_status_check(text_with_urls): urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) for url in urls: - if url.startswith("http"): + if url.startswith("https"): try: response_code = UrlValidator._status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if UrlValidator._status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": f"The url{url} is secure. Please use 'https' instead of 'http'.", - } - results.append(result) - - else: - continue - else: - result = {"url": url, "error": f"Status code {response_code}"} + if response_code != 200: + result = { + "url": url, + "error": f"The url {url} is broken.", + } results.append(result) + else: + continue except requests.ConnectionError: result = {"url": url, "error": f"The URL {url} does not exist on Internet."} results.append(result)