From 17acabf40c5a73de850871454cd160083b719403 Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Fri, 5 Sep 2025 15:19:59 -0500 Subject: [PATCH 1/4] collectionprogress and temporal extent flag bugs fixes --- pyQuARC/schemas/check_messages.json | 6 +++--- pyQuARC/schemas/rule_mapping.json | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..aa6bcdd1 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -205,7 +205,7 @@ "message": "", "url": "" }, - "remediation": "If data collection is ongoing, provide an EndsAtPresentFlag of \"true\"" + "remediation": "Since data collection is no longer ongoing, recommend updating the EndsAtPresentFlag to 'false'." }, "ends_at_present_flag_presence_check": { "failure": "Potential issue with:\n - No EndingDateTime provided; no EndsAtPresentFlag provided for a potentially active collection. \n - CollectionState is not \"COMPLETE\"; no EndsAtPresentFlag provided for a potentially active collection.", @@ -736,12 +736,12 @@ "remediation": "Recommend providing an entry of 'true' or 'false'." }, "collection_progress_consistency_check": { - "failure": "The Collection State/Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.", + "failure": "The Collection Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Collection+Progress" }, - "remediation": "Recommend updating the Collection State/Progress based on the Ending Date Time and Ends At Present Flag values." + "remediation": "Recommend updating the Collection Progress based on the Ending Date Time and Ends At Present Flag values." }, "online_resource_type_gcmd_check": { "failure": "The provided Online Resource/Related URLs Type `{}` is not consistent with GCMD.", diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..9afd5059 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -1315,7 +1315,7 @@ } ] }, - "severity": "warning", + "severity": "error", "check_id": "ends_at_present_flag_logic_check" }, "ends_at_present_flag_presence_check": { From 5a3489d129e8bf82dbd0aeed847a22bbe75fb9ee Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Fri, 17 Oct 2025 12:57:24 -0500 Subject: [PATCH 2/4] Echo-g OnlineAccessUrls and OnlineResource revised --- pyQuARC/code/url_validator.py | 138 +++++++++++---- pyQuARC/schemas/check_messages.json | 20 ++- pyQuARC/schemas/checks.json | 10 ++ pyQuARC/schemas/rule_mapping.json | 256 ++++++++++++++++++++++++++++ 4 files changed, 388 insertions(+), 36 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 55a74e61..fc93ad2f 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls): starts_with_http.add(text) return starts_with_http + @staticmethod + def _status_code_from_request(url): + """ + Return HTTP status code for url, raising requests exceptions to caller. + """ + headers = get_headers() + return requests.get(url, headers=headers, timeout=10).status_code + + @staticmethod + def _extract_and_normalize_urls(text_with_urls): + """ + Extract URLs from text, include tokens that start with 'http', strip trailing dots, + and return (set_of_urls, joined_value_string). + """ + extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) + urls = extractor.find_urls(text_with_urls) + urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + # remove dots at the end and deduplicate + urls = set(url[:-1] if url.endswith(".") else url for url in urls) + value = ", ".join(urls) + return urls, value + @staticmethod @if_arg def health_and_status_check(text_with_urls): @@ -45,48 +67,96 @@ def health_and_status_check(text_with_urls): (dict) An object with the validity of the check and the instance/results """ - def status_code_from_request(url): - headers = get_headers() - # timeout = 10 seconds, to allow for slow but not invalid connections - return requests.get(url, headers=headers, timeout=10).status_code + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("http"): + try: + response_code = 400 + # UrlValidator._status_code_from_request(url) + if response_code == 200: + if url.startswith("http://"): + secure_url = url.replace("http://", "https://") + if UrlValidator._status_code_from_request(secure_url) == 200: + result = { + "url": url, + "error": f"The url{url} is secure. Please use 'https' instead of 'http'.", + } + results.append(result) + + else: + continue + else: + result = {"url": url, "error": f"Status code {response_code}"} + results.append(result) + except requests.ConnectionError: + result = {"url": url, "error": f"The URL {url} does not exist on Internet."} + results.append(result) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def protocol_checks(text_with_urls): + """ + Checks the ftp included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains ftp + Returns: + (dict) An object with the validity of the check and the instance/results + """ results = [] validity = True - # extract URLs from text - extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) - urls = extractor.find_urls(text_with_urls) - urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' at the end) - # remove duplicated urls - urls = set(url[:-1] if url.endswith(".") else url for url in urls) - value = ", ".join(urls) + for url in urls: + if url.startswith("ftp://"): + results.append({ + "url": url, + "error": f"The URL {url} exists" + }) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def secure_url_checks(text_with_urls): + """ + Checks whether the secure link (https) is included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains https + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # check that URL returns a valid response for url in urls: - if not url.startswith("http"): - url = f"http://{url}" - try: - response_code = status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": "The URL is secure. Please use 'https' instead of 'http'.", - } - else: - continue - else: - result = {"url": url, "error": f"Status code {response_code}"} - except requests.ConnectionError: - result = {"url": url, "error": "The URL does not exist on Internet."} - except: - result = {"url": url, "error": "Some unknown error occurred."} - results.append(result) + url="http://" + if url.startswith("http://"): + results.append({ + "url": url, + "error": f"The URL {url} is not secure" + }) if results: validity = False diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index aa6bcdd1..13b9f394 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -40,12 +40,28 @@ "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time." }, "url_check": { - "failure": "A URL with a status code other than 200 has been identified: `{}`.", + "failure": "`{}`.", "help": { "message": "", "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" }, - "remediation": "This often indicates a broken link. If the URL is broken, recommend revising." + "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https." + }, + "protocol_check": { + "failure": "The following URL `{}` does not exist.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend removing the ftp access link." + }, + "secure_url_check": { + "failure": "`{}`.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend updating the following link(s) from 'http' to 'https':" }, "shortname_uniqueness": { "failure": "The EntryTitle/DataSetId `{}` is identical to the ShortName `{}`.", diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..c2303240 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -24,6 +24,16 @@ "check_function": "health_and_status_check", "available": true }, + "protocol_check": { + "data_type": "url", + "check_function": "protocol_checks", + "available": true + }, + "secure_url_check": { + "data_type": "url", + "check_function": "secure_url_checks", + "available": true + }, "string_compare": { "data_type": "string", "check_function": "compare", diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 9afd5059..394a7ad3 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -812,6 +812,262 @@ "severity": "error", "check_id": "url_check" }, + + "protocol_check": { + "rule_name": "protocol_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "error", + "check_id": "protocol_check" + }, + "secure_url_check": { + "rule_name": "secure_url_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "info", + "check_id": "secure_url_check" + }, + "shortname_uniqueness": { "rule_name": "Short Name uniqueness check", "fields_to_apply": { From e84ba7924ab75b720164750be59af02f1750172c Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Fri, 17 Oct 2025 13:00:34 -0500 Subject: [PATCH 3/4] Remaining changes --- pyQuARC/code/url_validator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index fc93ad2f..dd049eaa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -76,8 +76,7 @@ def health_and_status_check(text_with_urls): for url in urls: if url.startswith("http"): try: - response_code = 400 - # UrlValidator._status_code_from_request(url) + response_code = UrlValidator._status_code_from_request(url) if response_code == 200: if url.startswith("http://"): secure_url = url.replace("http://", "https://") @@ -151,7 +150,6 @@ def secure_url_checks(text_with_urls): urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) for url in urls: - url="http://" if url.startswith("http://"): results.append({ "url": url, From eea3d5dea5ebd126e958f8c8e412e3349a9a7fbe Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti <80163528+bhawana11@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:56:59 -0500 Subject: [PATCH 4/4] Fixes: fixes in the health and status check --- pyQuARC/code/url_validator.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 756f30cb..fc4d7efa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -74,24 +74,17 @@ def health_and_status_check(text_with_urls): urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) for url in urls: - if url.startswith("http"): + if url.startswith("https"): try: response_code = UrlValidator._status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if UrlValidator._status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": f"The url{url} is secure. Please use 'https' instead of 'http'.", - } - results.append(result) - - else: - continue - else: - result = {"url": url, "error": f"Status code {response_code}"} + if response_code != 200: + result = { + "url": url, + "error": f"The url {url} is broken.", + } results.append(result) + else: + continue except requests.ConnectionError: result = {"url": url, "error": f"The URL {url} does not exist on Internet."} results.append(result)